#!/usr/bin/perl # --------------------------------------------------------------------------------------------- # # topten.pl --- list the top ten words in the corpus for each bucket high # to low based on word count # # This program authored by Scott W Leighton (helphand@pacbell.net) # for use with Popfile and it's components, which are Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ May 21, 2003 # # Revised May 22, 2003 - added counts # May 23, 2003 - added access to popfile.cfg for corpus directoryx # May 24, 2003 - make corpus table files optional # May 25, 2003 - Added % bucket and % total # June 20, 2003 - fixed for v 0.19.0 3 byte table file bug # # Popfile # Copyright (c) 2001-2003 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; my $limit = $ARGV[0] || 10; my $t = $limit == 10 ? "Ten" : $limit; my $fn = 'popfile.cfg'; my %cfg; sub ck_pf_config { if ( open CONFIG, " ) { s/(\015|\012)//g; if ( /(\S+) (.+)/ ) { if (defined $cfg{$1}) { print "Popfile.cfg Entry $1 with value $2 is duplicated (last occurance is the one used, you should remove all duplicate $1 entries)\n"; } $cfg{$1}=$2; } } close CONFIG; } else { die "Unable to open popfile.cfg: $!"; } } sub swrite { die "usage: swrite PICTURE ARGS" unless @_; my $format = shift; $^A = ""; formline($format,@_); return $^A; } # # Main # # # Check the popfile.cfg file # if ( (-e $fn) && (-w $fn)) { ck_pf_config; } elsif (-e $fn) { print "popfile.cfg is not writable (is it marked read only?)\n"; ck_pf_config; } else { die "popfile.cfg doesn't exist (are you running from the correct directory?)\n"; } # # Check the corpus # my $corpus = $cfg{corpus} || $cfg{bayes_corpus} || "corpus"; die "corpus dir '$corpus' does not exist\n" unless -d $corpus; $corpus .= "/*"; my @buckets = glob ($corpus); my %topten = (); my %wordcounts =(); my %words = (); my %globalcount = (); my $errors = 0; print "Top Ten Utility for Popfile\n"; foreach my $bucket (@buckets) { my $line = 0; if (open FILE, "<${bucket}/table" ) { while () { if ( /__CORPUS__ __VERSION__ (\d+)/ ) { next; } s/[\r\n]//g; if ($_) { $line++; my @w = split /\s/,$_; if (scalar( $#w) > 1 ) { $errors++; print "Bucket $bucket line $line has a problem '$w[0] $w[1]'\n"; } else { $topten{$bucket}{$w[0]}=$w[1]; $wordcounts{$bucket}+=$w[1]; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$w[1]; } } } close FILE; } else { print "Warning, unable to open bucket $bucket table file\n"; } } if ($errors) { $errors>1?print "The lines above indicate errors found in your corpus\n":print "The line above indicates an error found in your corpus\n"; } else { print "diagnostics complete, no errors found in corpus\n"; } foreach my $bucket (sort keys %topten) { my @keys = sort { $topten{$bucket}{$b} <=> $topten{$bucket}{$a} || length($topten{$bucket}{$b}) <=> length($topten{$bucket}{$a}) || $topten{$bucket}{$a} cmp $topten{$bucket}{$b} } keys %{$topten{$bucket}}; my $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0)); print "\n\nTop $t for Bucket $bucket word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}\n"; print "Rank Word From Corpus Word Count % Bucket % Total\n"; for my $i ( 0 ... $limit -1 ) { if (defined $topten{$bucket}{$keys[$i]} ) { my $pb = sprintf("%.4f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}/$wordcounts{$bucket}*100:0)); my $pt = sprintf("%.4f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}/$globalcount{wordcount}*100:0)); my $string = swrite(<<"END", $i+1, $keys[$i], $topten{$bucket}{$keys[$i]},$pb,$pt); @### @<<<<<<<<<<<<<<<<<<<<<<<<<<< @###### @##.#### @##.#### END print $string; } } }