topten.pl


#!/usr/bin/perl -w
# ---------------------------------------------------------------------------------------------
#
# topten.pl --- list the top ten words in the corpus for each bucket high
#               to low based on word count
#
# This program authored by Scott W Leighton (helphand@pacbell.net)
# for use with POPFile and it's components, which are Copyrighted
# by John Graham-Cumming. The author hereby contributes this code
# to the POPFile project under the terms of the POPFile License
# Agreement.    /Scott W Leighton/  May 21, 2003
#
# Revised May 22, 2003 - added counts
#         May 23, 2003 - added access to popfile.cfg for corpus directoryx
#         May 24, 2003 - make corpus table files optional
#         May 25, 2003 - Added % bucket and % total
#         June 20, 2003 - Major changes, require v 0.19.0, html output
#                         add probabilities.
#         June 22, 2003 - Added MQ for version 0.20.0 compatibility.
#         June 25, 2003 - Fixed auto-vivification issue with topten
#                         hash that was causing tons of warnings
#         June 29, 2003 - Make sure limit doesn't exceed bucket size
#         Sept 13, 2003 - Update for v 0.20.0 API changes
#
# POPFile
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------

use strict;
use warnings;

    my %components;
    my $time = localtime;


# ---------------------------------------------------------------------------------------------
#
# load_modules
#
# Called to load specific POPFile loadable modules (implemented as .pm files with special
# comment on first line) in a specific subdirectory
#
# $directory          The directory to search for loadable modules
# $type               The 'type' of module being loaded (e.g. proxy, core, ui) which is used
#                     below when fixing up references between modules (e.g. proxy modules all
#                     need access to the classifier module)
# $module             The specific module name to be loaded.
#
# ---------------------------------------------------------------------------------------------

sub load_modules {

     my ( $directory, $type, $module ) = @_;

     $module = $directory . '/' . $module;

     if ( open MODULE, "<$module" ) {
           my $first = <MODULE>;
           close MODULE;

           if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) {
                require $module;

                $module =~ s/\//::/;
                $module =~ s/\.pm//;

                my $mod = new $module;
                my $name = $mod->name();

                $components{$type}{$name} = $mod;

           }
     }

}

    #
    # Main
    #

    #
    # Load the modules we'll be using
    #

    load_modules( 'POPFile',      'core',       'Configuration.pm' );
    load_modules( 'POPFile',      'core',       'Logger.pm' );
    load_modules( 'POPFile',      'core',       'MQ.pm' );
    load_modules( 'Classifier',   'classifier', 'Bayes.pm' );

# Do not run if we are not on version 0.19.0 or higher

if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) {

    # link each of the objects with the configuration object and
    # the logger and the mq

    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            $components{$type}{$name}->configuration($components{core}{config});
            $components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' );
            $components{$type}{$name}->mq($components{core}{mq}) if ($components{$type}{$name}->can ( 'mq' )) ;
        }
    }


    #
    # Tell each module to initialize itself
    #

    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            if ($components{$type}{$name}->initialize() == 0 ) {
                die "Failed to start while initializing the $name module\n";
            }
        }
    }


    # Ensure that a topten subdirectory exists to hold any error log
    # that logger might generate so we don't interfere with a
    # running POPFile.

    mkdir ( 'topten' );

    # Set default count

    $components{core}{config}->parameter('topten_count','10');

    # Load in the POPFile configuration parameters, any configured
    # ones will override the initialized default values
    # NOTE: We are intentially NOT saving this configuration
    # back to disk since the parameters we are allowing
    # for this program are NOT legal POPFile parameters.

    $components{core}{config}->load_configuration();

    # override the logdir and piddir so we don't mess with
    # the production ones

    $components{core}{config}->parameter('logger_logdir','topten/');
    $components{core}{config}->parameter('config_piddir','topten/');

    # Now grab any commandline parameters, they will override
    # the defaults and those in popfile.cfg. 
    
    $components{core}{config}->parse_command_line();

    # force logger to recognize the new logdir before we startup
    # the modules. That way we will not inadvertently log to the
    # production POPFile log.

    $components{core}{logger}->service();


    # now that the configuration is established, tell each module
    # to start


    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            if ($components{$type}{$name}->start() == 0 ) {
                die "Failed to start while starting the $name module\n";
            }
        }
    }


    #
    #  Define some global work areas
    #

    my %topten = ();
    my %wordcounts =();
    my %words = ();
    my %globalcount = ();
    my $body;
    my $body2;

    # Have the classifier give us the buckets

    my @buckets = $components{classifier}{bayes}->get_buckets();


    #
    # Go thru each bucket, grab the word list and word counts
    #

    # The API calls changed between v 0.19.0 and 0.20.0, so
    # check for version, if 0.20.0 or higher, use new bucket
    # API, otherwise, use old API calls


    if ($components{classifier}{bayes}->can ( 'get_bucket_word_prefixes' ) ) {
        foreach my $bucket (@buckets) {
            if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
               for my $j ( $components{classifier}{bayes}->get_bucket_word_list( $bucket, '.' ) ) {
                    my $word = $j;
                    my $count = $components{classifier}{bayes}->get_count_for_word( $bucket, $j );
                    $topten{$bucket}{$word}{c}=$count;
                    $wordcounts{$bucket}+=$count;
                    $words{$bucket}+=1;
                    $globalcount{words}+=1;
                    $globalcount{wordcount}+=$count;
                }
            }
        }
    }else {
        foreach my $bucket (@buckets) {
            if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
                for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) {
                    if (defined ($i)) {
                        my $j = $i;
                        while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) {
                            my $word = $1;
                            my $count = $2;
                            $topten{$bucket}{$word}{c}=$count;
                            $wordcounts{$bucket}+=$count;
                            $words{$bucket}+=1;
                            $globalcount{words}+=1;
                            $globalcount{wordcount}+=$count;
                       }
                   }
               }
           }
        }
    }

    #
    # Start doing our reporting, $body holds report 1, $body2 holds
    # report 2
    #

    my $limit = $components{core}{config}->parameter('topten_count');
    my $t = $limit == 10 ? "Ten" : $limit;

    $body = "<a name=\"1report\"><h2 class=\"buckets\">Top $t Ranked High to Low on Probability</h2></a>\n";
    $body .= "&nbsp;&nbsp;&nbsp;Jump to bucket ";
    $body2 = "<a name=\"2report\"><h2 class=\"buckets\">Top $t Ranked High to Low on Simple Word Count</h2></a>\n";
    $body2 .= "&nbsp;&nbsp;&nbsp;Jump to bucket ";

    foreach my $bucket (@buckets) {
        if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
            $body .= "[<a href=\"#1$bucket\">$bucket</a>]&nbsp;";
            $body2 .= "[<a href=\"#2$bucket\">$bucket</a>]&nbsp;";
        }
    }

    $body .= "&nbsp;&nbsp; Jump to next report [<a href=\"#2report\">next</a>]\n";
    $body2 .= "&nbsp;&nbsp; Jump to previous report [<a href=\"#1report\">previous</a>]\n";

    #
    # Sort by simple word count
    #


    foreach my $bucket (sort keys %topten) {
        my @keys = map { $_->[1] }
        sort {
                         $b->[0] <=> $a->[0]
                                 ||
                 length($b->[0]) <=> length($a->[0])
                                 ||
                         $a->[0] cmp $b->[0]
        }
        map { [$topten{$bucket}{$_}{c},$_] }
        keys %{$topten{$bucket}};

        #
        # Calculate the score and probability for each word
        #

        foreach my $word (@keys) {
            my $max = 0;
            my $max_bucket = '';
            my $total = 0;
            foreach my $x (@buckets) {
                if (defined($topten{$x}{$word}) && exists ($topten{$x}{$word}{c}) && $topten{$x}{$word}{c} > 0) {
                    my $prob = exp(log($topten{$x}{$word}{c}/$wordcounts{$x}));
                    $total += $prob;
                    if ($prob > $max) {
                        $max = $prob;
                        $max_bucket =  $bucket;
                    }
                } else {
                    $total+= (0.10 / $globalcount{wordcount});
                }
            }
            if (defined($topten{$bucket}{$word}{c}) && $topten{$bucket}{$word}{c} > 0) {
                my $prob = exp(log($topten{$bucket}{$word}{c}/$wordcounts{$bucket}));
                my $n = ($total > 0)?$prob / $total:0;
                my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0;
                $topten{$bucket}{$word}{s}=$score;
                $topten{$bucket}{$word}{p}=$n;
            }
        }


        my $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0));

        $body2 .= "<a name=\"2$bucket\">";
        $body2 .= "<h2 class=\"buckets\">For Bucket ";
        $body2 .= "<font color=\"" . $components{classifier}{bayes}->get_bucket_color($bucket) . "\">$bucket";
        $body2 .= "</font>\n</h2></a>\n\n<br />\n<table width=\"100%\">\n";
        $body2 .= "<tr><td colspan=7 align=center>Top $t for Bucket $bucket  word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}</td></tr>\n";
        $body2 .= "<tr><td align=center><strong>Rank</strong></td>";
        $body2 .= "<td><strong>Word From Corpus</strong></td>";
        $body2 .= "<td align=center><strong>Word<br>Count</strong></td>";
        $body2 .= "<td align=center><strong>%<br>Bucket</strong></td>";
        $body2 .= "<td align=center><strong>%<br>Total</strong></td>";
        $body2 .= "<td align=center><strong>Score</strong></td>";
        $body2 .= "<td align=center><strong>Probability</strong></td></tr>";
        for my $i ( 0 .. ($limit -1<$components{classifier}{bayes}->get_bucket_word_count($bucket)?$limit -1:$components{classifier}{bayes}->get_bucket_word_count($bucket)) ) {
            if (defined $topten{$bucket}{$keys[$i]}{c} ) {
                $body2 .= "<tr>\n";
                $body2 .= "<td align=center>" . ($i+1) . "</td>\n";
                $body2 .= "<td>";
                $body2 .= $keys[$i] . "</td>\n";
                $body2 .= "<td align=right>$topten{$bucket}{$keys[$i]}{c}</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "</td>\n";
                $body2 .= "</tr>\n";
            }
        }
        $body2 .= "<tr><td colspan=7><a href=\"#1report\">[Back to Top]</a></table>\n";


        #
        # Sort by probability
        #

        @keys = map { $_->[1] }
        sort {
                   $b->[0] <=> $a->[0]
                            ||
           length($b->[0]) <=> length($a->[0])
                            ||
                   $a->[0] cmp $b->[0]
                       }
        map { [$topten{$bucket}{$_}{p},$_] }
        keys %{$topten{$bucket}};


        $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0));


        $body .= "<a name=\"1$bucket\">";
        $body .= "<h2 class=\"buckets\">For Bucket ";
        $body .= "<font color=\"" . $components{classifier}{bayes}->get_bucket_color($bucket) . "\">$bucket";
        $body .= "</font>\n</h2></a>\n\n<br />\n<table width=\"100%\">\n";
        $body .= "<tr><td colspan=7 align=center>Top $t for Bucket $bucket  word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}</td></tr>\n";
        $body .= "<tr><td align=center><strong>Rank</strong></td>";
        $body .= "<td><strong>Word From Corpus</strong></td>";
        $body .= "<td align=center><strong>Word<br>Count</strong></td>";
        $body .= "<td align=center><strong>%<br>Bucket</strong></td>";
        $body .= "<td align=center><strong>%<br>Total</strong></td>";
        $body .= "<td align=center><strong>Score</strong></td>";
        $body .= "<td align=center><strong>Probability</strong></td></tr>";
        for my $i ( 0 .. ($limit -1<$components{classifier}{bayes}->get_bucket_word_count($bucket)?$limit -1:$components{classifier}{bayes}->get_bucket_word_count($bucket)) ) {
            if (defined $topten{$bucket}{$keys[$i]}{c} ) {
                $body .= "<tr>\n";
                $body .= "<td align=center>" . ($i+1) . "</td>\n";
                $body .= "<td>";
                $body .= $keys[$i] . "</td>\n";
                $body .= "<td align=right>$topten{$bucket}{$keys[$i]}{c}</td>\n";
                $body .= "<td align=right>" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "</td>\n";
                $body .= "</tr>\n";
            }
        }
        $body .= "<tr><td colspan=7><a href=\"#1report\">[Back to Top]</a></table>\n";

    }

    html_output($body . $body2);


    #
    # Cleanup - Get rid of the popfile.pid file created by the configuration
    #           module.
    #

    unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid');

    # All Done

} else {
    print "$0 is compatible only with POPFile version 0.19.0 or above\n";
}



sub html_output {
    my $text = shift;

    my $body = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" ";
    $body .= "\"http://www.w3.org/TR/html4/loose.dtd\">\n";
    $body .= "<html>\n<head>\n<title>POPFile Top Ten Utility</title>\n";
    $body .= "<style type=\"text/css\">\n";
    if ( open FILE, '<skins/' . $components{core}{config}->parameter( 'html_skin' ) . '.css' ) {
        while (<FILE>) {
            $body .= $_;
        }
        close FILE;
    }
    $body .= "</style>\n";
    $body .= "<meta http-equiv=\"Pragma\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Expires\" content=\"0\">\n";

    $body .= "<meta http-equiv=\"Cache-Control\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Content-Type\" content=\"text/html;\">\n</head>\n";

    $body .= "<body>\n<table class=\"shellTop\" align=\"center\" width=\"100%\" summary=\"\">\n";

    # upper whitespace
    $body .= "<tr class=\"shellTopRow\">\n<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n";

    # logo
    $body .= "<tr>\n<td class=\"shellLeft\"></td>\n";
    $body .= "<td class=\"naked\">\n";
    $body .= "<table class=\"head\" cellspacing=\"0\" summary=\"\">\n<tr>\n";
    $body .= "<td class=\"head\">POPFile Top Ten Report</td>\n";

    $body .= "<td align=\"right\" valign=\"bottom\">\n";
    $body .= "$time &nbsp;\n";

    $body .= "</td>\n</tr>\n<tr>\n";
    $body .= "<td height=\"1%\" colspan=\"3\"></td>\n</tr>\n";
    $body .= "</table>\n</td>\n"; # colspan 2 ?? srk
    $body .= "<td class=\"shellRight\"></td>\n</tr>\n<tr class=\"shellBottomRow\">\n";

    $body .= "<td class=\"shellBottomLeft\"></td>\n<td class=\"shellBottomCenter\"></td>\n";
    $body .= "<td class=\"shellBottomRight\"></td>\n</tr>\n</table>\n";

    # main content area
    $body .= "<table class=\"shell\" align=\"center\" width=\"100%\" summary=\"\">\n<tr class=\"shellTopRow\">\n";
    $body .= "<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n<tr>\n";
    $body .= "<td class=\"shellLeft\"></td>\n";
    $body .= "<td align=\"left\" class=\"naked\">\n" . $text .  "\n</td>\n";

    $body .= "<td class=\"shellRight\"></td>\n</tr>\n";
    $body .= "<tr class=\"shellBottomRow\">\n<td class=\"shellBottomLeft\"></td>\n";
    $body .= "<td class=\"shellBottomCenter\"></td>\n<td class=\"shellBottomRight\"></td>\n";
    $body .= "</tr>\n</table>\n";

    $body .= "\n</body>\n</html>\n";
    print $body;

}


1