corpus_diff.pl


#!/usr/bin/perl 
# ---------------------------------------------------------------------------------------------
#
# corpus_diff.pl --- Report the difference between a reference corpus
#                    and the current corpus.
#
# This program authored by Scott W Leighton (helphand@pacbell.net)
# based upon the Popfile project, which is Copyrighted
# by John Graham-Cumming. The author hereby contributes this code
# to the Popfile project under the terms of the Popfile License
# Agreement.    /Scott W Leighton/  June 24, 2003
#
# Revised June 24, 2003 - Initial Release
#     
#
# Derived from skeleton.pl
#
# This program compiled by Scott W Leighton (helphand@pacbell.net)
# from POPFile sources as an example for use in creating POPFile
# commandline programs using the POPFile API. In compiling this
# example, sections of POPFile code were copied, verbatim, from
# the code base of the POPFile project, and are not the original
# work of the compiler.
#
# POPFile and it's components, are Copyrighted by John Graham-Cumming.
# The compiler hereby contributes this compilation of POPFile code
# to the Popfile project under the terms of the Popfile License
# Agreement.    /Scott W Leighton/  June 23, 2003
#
#
# POPFile
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------

use strict;
use warnings;
use locale;

# The POPFile classes are stored by reference in the %components hash, the top level key is
# the type of the component (see load_modules) and then the name of the component derived from
# calls to each loadable modules name() method and which points to the actual module

    my %components;


# ---------------------------------------------------------------------------------------------
#
# load_modules
#
# Called to load specific POPFile loadable modules (implemented as .pm files with special
# comment on first line) in a specific subdirectory
#
# $directory          The directory to search for loadable modules
# $type               The 'type' of module being loaded (e.g. proxy, core, ui) which is used
#                     below when fixing up references between modules (e.g. proxy modules all
#                     need access to the classifier module)
# $module             The specific module name to be loaded.
#
# ---------------------------------------------------------------------------------------------

sub load_modules {

     my ( $directory, $type, $module ) = @_;

     $module = $directory . '/' . $module;

     if ( open MODULE, "<$module" ) {
           my $first = <MODULE>;
           close MODULE;

           if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) {
                require $module;

                $module =~ s/\//::/;
                $module =~ s/\.pm//;

                my $mod = new $module;
                my $name = $mod->name();

                $components{$type}{$name} = $mod;

           }
     }

}

    #
    # Main
    #

    #
    # Load the modules we'll be using
    #

    load_modules( 'POPFile',      'core',       'Configuration.pm' );
    load_modules( 'POPFile',      'core',       'Logger.pm' );
    load_modules( 'POPFile',      'core',       'MQ.pm' );
    load_modules( 'Classifier',   'classifier', 'Bayes.pm' );

#
# Perform a version check by looking for the POPFile::Module that
# was introduced in v 0.19.0. If you require features of v 0.20.0
# then change the second test to
#          $components{core}{config}->can ( 'mq' )
# since the message queue module was introduced in v 0.20.0
#

if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) {

    # link each of the objects with the configuration object, 
    # the logger (except the logger itself) and, if we are on v 0.20.0
    # to the mq. 

    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            $components{$type}{$name}->configuration($components{core}{config});
            $components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' );
            $components{$type}{$name}->mq($components{core}{mq}) if ( $components{$type}{$name}->can ( 'mq' ) );
        }
    }


    #
    # Tell each module to initialize itself
    #

    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            if ($components{$type}{$name}->initialize() == 0 ) {
                die "Failed to start while initializing the $name module\n";
            }
        }
    }


    # Ensure that a temp subdirectory exists to hold any error log
    # that logger might generate so we don't interfere with a
    # running POPFile.

    mkdir ( 'temp' );

    # Set default commandline options, if your program has any
    # commandline options, establish the default values here.
    # Those defaults will be replaced later if the user runs
    # with the commandline option set.

    $components{core}{config}->parameter('myprog_myoption','default value');

    # Load in the Popfile configuration parameters, any configured
    # ones will override the initialized default values
    # NOTE: We are intentially NOT saving this configuration
    # back to disk since the parameters we are allowing
    # for this program are NOT legal Popfile parameters.

    $components{core}{config}->load_configuration();

    # Now grab any commandline parameters, they will override
    # the defaults and those in popfile.cfg. As a byproduct,
    # if the user overrides our program's options this will
    # pick it up for us.
    
    $components{core}{config}->parse_command_line();

    # Now override the logdir and piddir so we don't mess with
    # the production ones, we are intentionally doing this
    # AFTER the command line options are loaded so we are
    # guaranteed that they weren't changed by the user.

    $components{core}{config}->parameter('logger_logdir','temp/');
    $components{core}{config}->parameter('config_piddir','temp/');

    # force logger to recognize the new logdir before we startup
    # the modules. That way we will not inadvertently log to the
    # production POPFile log. 

    $components{core}{logger}->service();

    # now that the configuration is established, tell each module
    # to start

    foreach my $type (keys %components) {
        foreach my $name (keys %{$components{$type}}) {
            if ($components{$type}{$name}->start() == 0 ) {
                die "Failed to start while starting the $name module\n";
            }
        }
    }


    #
    #  Your Program Logic goes here.....
    #

    #
    #  Define some global work areas
    #

    my %words = ();
    my %diff  = ();
    my $body;

    # Have the classifier give us the buckets for the current corpus

    my @buckets = $components{classifier}{bayes}->get_buckets();


    #
    # Go thru each bucket, grab the word list and word counts
    #


    foreach my $bucket (@buckets) {
        if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
            for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) {
                if (defined ($i)) {
                    my $j = $i;
                    while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) {
                        my $word = $1;
                        my $count = $2;
                        $words{$bucket}{$word}=$count;
                   }
               }
           }
       }
    }

    #
    # change to the backup corpus directory
    #


    $components{core}{config}->parameter('bayes_corpus',$components{core}{config}->parameter('bayes_corpus') . '.bak');
    $components{classifier}{bayes}->config_('unclassified_probability', 0.50);


    #
    # re-start the bayes classifier to force a reload of the backup corpus
    #


    $components{classifier}{bayes}->start();

    #
    # save the buckets from the active installation
    #

    my %saw;
    @saw{@buckets} = ();

    # Have the classifier give us the buckets from the backup corpus

    @buckets = $components{classifier}{bayes}->get_buckets();

    #
    # Go thru each bucket, grab the word list and word counts
    #


    foreach my $bucket (@buckets) {
        if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
            for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) {
                if (defined ($i)) {
                    my $j = $i;
                    while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) {
                        my $word = $1;
                        my $count = $2;
                        # the words hash contains the current 'new' corpus
                        # compare the entry from the old corpus to the new one
                        if (exists $words{$bucket}{$word}) {
                            # the old one is in the new one
                            unless ($count == $words{$bucket}{$word}) {
                                # the count is different, calculate the difference
                                $diff{$bucket}{$word}{c}=$words{$bucket}{$word} - $count;
                                delete($words{$bucket}{$word});
                            } else {
                                # the old and new are equal, so delete the new one
                                delete($words{$bucket}{$word});
                            }
                        } else {
                            # the old word doesn't exist in the new corpus, so
                            # it's a deleted word
                            $diff{$bucket}{$word}{d}=$count;
                        }
                   }
               }
           }
       }
    }


   # anything left in the original word hash must be new words
   # that were added since the old corpus

    foreach my $bucket (keys %words) {
       foreach my $word (keys %{$words{$bucket}}) {
           $diff{$bucket}{$word}{a}=$words{$bucket}{$word};
       }
    }

    undef %words;

    #
    # now combine the original buckets with the backup buckets
    #

    @saw{@buckets} = ();
    @buckets = sort keys %saw;
    undef %saw;
       

    $body = "<a name=\"1report\"><h2 class=\"buckets\">Corpus Diff</h2></a>\n";
    $body .= "&nbsp;&nbsp;&nbsp;Jump to bucket ";

    foreach my $bucket (@buckets) {
        if (defined($components{classifier}{bayes}->get_bucket_word_count($bucket)) && $components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
            $body .= "[<a href=\"#1$bucket\">$bucket</a>]&nbsp;";
        }
    }


    #
    # Sort alpha
    #


    foreach my $bucket (sort keys %diff) {
       $body .= "<a name=\"1$bucket\">";
       $body .= "<h2 class=\"buckets\">For Bucket ";
       $body .= "<font color=\"" . $components{classifier}{bayes}->get_bucket_color($bucket) . "\">$bucket";
       $body .= "</font>\n</h2></a>\n\n<br />\n<table width=\"100%\" border=1 cellspacing=2>\n";
       $body .= "<tr><td colspan=6 align=center>Diff for Bucket $bucket</td></tr>\n";
       $body .= "<tr><td align=center colspan=2 width=\"33%\"><strong>Deleted From Corpus</strong></td>";
       $body .= "<td align=center colspan=2 width=\"33%\"><strong>Word Count Changed</strong></td>";
       $body .= "<td align=center colspan=2 width=\"33%\"><strong>Added to Corpus</strong></td></tr>";
       my $col0row=0;
       my $col1row=0;
       my $col2row=0;
       my @sorted=();
       my @keys = sort keys %{$diff{$bucket}};
       foreach my $word (@keys) {
           if (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{d}))  {
               $sorted[$col0row++][0]=$word;
           } elsif (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{c})) {
               $sorted[$col1row++][1]=$word;
           } elsif (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{a})) {
               $sorted[$col2row++][2]=$word;
           }
       }
       my $max=$col0row > $col1row?$col0row:$col1row;
       $max=$col2row > $max?$col2row:$max;
       for my $row ( 0 ... $max ) {
            $body .= "<tr>\n";
            if (defined($sorted[$row][0]) && defined($diff{$bucket}{$sorted[$row][0]}) && defined($diff{$bucket}{$sorted[$row][0]}{d}))  {
               $body .= "<td align=left width=\"25%\">$sorted[$row][0]</td>\n";
               $body .= "<td align=left width=\"8%\">$diff{$bucket}{$sorted[$row][0]}{d}</td>\n";
            } else {
               $body .= "<td colspan=2></td>";
            }
            if (defined($sorted[$row][1]) && defined($diff{$bucket}{$sorted[$row][1]}) && defined($diff{$bucket}{$sorted[$row][1]}{c})) {
               $body .= "<td align=left width=\"25%\">$sorted[$row][1]</td>\n";
               $body .= "<td align=left width=\"8%\">" . ($diff{$bucket}{$sorted[$row][1]}{c}>0?"+".$diff{$bucket}{$sorted[$row][1]}{c}:$diff{$bucket}{$sorted[$row][1]}{c}) . "</td>\n";
            } else {
               $body .= "<td colspan=2></td>";
            }
            if (defined($sorted[$row][2]) && defined($diff{$bucket}{$sorted[$row][2]}) && defined($diff{$bucket}{$sorted[$row][2]}{a}))  {
               $body .= "<td align=left width=\"25%\">$sorted[$row][2]</td>\n";
               $body .= "<td align=left width=\"8%\">$diff{$bucket}{$sorted[$row][2]}{a}</td>\n";
            } else {
               $body .= "<td colspan=2></td>";
            }
            $body .= "</tr>\n";
       }
       $body .= "<tr><td colspan=6><a href=\"#1report\">[Back to Top]</a></table>\n";



    }

    html_output($body);



   

    #
    # All Done
    #

    #
    # Cleanup - Get rid of the popfile.pid file created by the configuration
    #           module.
    #

    unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid');

    exit(0);

} else {
    print "$0 is not compatible with your version of POPFile\n";
    exit(1);
}


sub html_output {
    my $text = shift;
    my $time = localtime;

    my $body = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" ";
    $body .= "\"http://www.w3.org/TR/html4/loose.dtd\">\n";
    $body .= "<html>\n<head>\n<title>POPFile Corpus Diff Utility</title>\n";
    $body .= "<style type=\"text/css\">\n";
    if ( open FILE, '<skins/' . $components{core}{config}->parameter( 'html_skin' ) . '.css' ) {
        while (<FILE>) {
            $body .= $_;
        }
        close FILE;
    }
    $body .= "</style>\n";
    $body .= "<meta http-equiv=\"Pragma\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Expires\" content=\"0\">\n";

    $body .= "<meta http-equiv=\"Cache-Control\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Content-Type\" content=\"text/html;\">\n</head>\n";

    $body .= "<body>\n<table class=\"shellTop\" align=\"center\" width=\"100%\" summary=\"\">\n";

    # upper whitespace
    $body .= "<tr class=\"shellTopRow\">\n<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n";

    # logo
    $body .= "<tr>\n<td class=\"shellLeft\"></td>\n";
    $body .= "<td class=\"naked\">\n";
    $body .= "<table class=\"head\" cellspacing=\"0\" summary=\"\">\n<tr>\n";
    $body .= "<td class=\"head\">POPFile Corpus Diff Report</td>\n";

    $body .= "<td align=\"right\" valign=\"bottom\">\n";
    $body .= "$time &nbsp;\n";

    $body .= "</td>\n</tr>\n<tr>\n";
    $body .= "<td height=\"1%\" colspan=\"3\"></td>\n</tr>\n";
    $body .= "</table>\n</td>\n"; # colspan 2 ?? srk
    $body .= "<td class=\"shellRight\"></td>\n</tr>\n<tr class=\"shellBottomRow\">\n";

    $body .= "<td class=\"shellBottomLeft\"></td>\n<td class=\"shellBottomCenter\"></td>\n";
    $body .= "<td class=\"shellBottomRight\"></td>\n</tr>\n</table>\n";

    # main content area
    $body .= "<table class=\"shell\" align=\"center\" width=\"100%\" summary=\"\">\n<tr class=\"shellTopRow\">\n";
    $body .= "<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n<tr>\n";
    $body .= "<td class=\"shellLeft\"></td>\n";
    $body .= "<td align=\"left\" class=\"naked\">\n" . $text .  "\n</td>\n";

    $body .= "<td class=\"shellRight\"></td>\n</tr>\n";
    $body .= "<tr class=\"shellBottomRow\">\n<td class=\"shellBottomLeft\"></td>\n";
    $body .= "<td class=\"shellBottomCenter\"></td>\n<td class=\"shellBottomRight\"></td>\n";
    $body .= "</tr>\n</table>\n";

    $body .= "\n</body>\n</html>\n";
    print $body;

}



1