#!/usr/bin/perl # --------------------------------------------------------------------------------------------- # # test_arch_msg.pl --- Process message folders in archive against current corpus using # Classifier::Bayes and report any that classify # to a different bucket than the archive message # is in. # # This program authored by Scott W Leighton (helphand@pacbell.net) # as a utility program for Popfile, which is Copyrighted # by John Graham-Cumming. # # test_arch_msg.pl # Copyright (C) 2003 Scott W. Leighton helphand@pacbell.net # # Modified May 24, 2003 - Added support for v 0.19.0 # May 25, 2003 - Added missing support for archive_classes # May 26, 2003 - Added percents # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # # This program authored by Scott W Leighton (helphand@pacbell.net) # based upon bayes.pl, a Popfile component, which is Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ February 16, 2003 # # Popfile and Classifier::Bayes # Copyright (c) 2001-2003 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use Classifier::Bayes; use POPFile::Configuration; # version check my $version; if ( -s "Popfile/Module.pm" ) { $version = 19; } else { $version = 18; } sub swrite { die "usage: swrite PICTURE ARGS" unless @_; my $format = shift; $^A = ""; formline($format,@_); return $^A; } sub get_subdirs { my ($dir) = @_; my @subdirs; opendir DIR, $dir or die "Unable to open $dir :@!\n"; while (defined (my $file = readdir DIR)) { next if $file =~ /^\.\.?$/; push @subdirs,"$dir/$file" if ( -d "$dir/$file" ); } return @subdirs; } # main my $start_time = time; my ($tctr, $terrors); my $archive = $ARGV[0] || "archive"; { my $b; my $c; my @dirs = glob "$archive/*"; if ($version == 18) { $b = new Classifier::Bayes; if ( $b->initialize() == 0 ) { die "Failed to start while initializing the classifier module"; } $b->{debug} = 0; $b->{parser}->{debug} = 0; $b->load_word_matrix(); } else { $b = new Classifier::Bayes; $c = new POPFile::Configuration; $b->configuration($c); $c->configuration($c); $c->initialize(); $b->initialize(); $c->load_configuration(); $b->{unclassified__} = $c->parameter("bayes_inclassified_probablity") || 0.5; $b->{debug} = 0; $b->{parser__}->{debug} = 0; $b->start(); # setup dummy buckets foreach my $dir (@dirs) { my ($x, $bucket) = split /\//,$dir; $b->{total__}{$bucket} = 100; $b->{colors__}{$bucket} = 'black'; } } foreach my $dir (@dirs) { my @subdirs; unless (@subdirs=get_subdirs($dir)) { push @subdirs,"$dir"; } my ($x, $bucket) = split /\//,$dir; my $string = swrite(<<"END",$bucket); Results for Message Archive @<<<<<<<<<<<< Archive Current Message File Name Result Result ---------------------------- -------- --------- END print $string; foreach my $sdir (@subdirs) { my @files = glob "$sdir/*.msg"; my $ctr = scalar ($#files) + 1; my $i = 0; foreach my $file (@files) { my $result = $b->classify_file("$file"); unless ($result eq lc($bucket)) { my @x = split /\//,$file; $string = swrite(<<"END",$x[-1], $bucket, $result); @<<<<<<<<<<<<<<<<<<<<<<<<<<< @<<<<<<<<<<< @<<<<<<<<<<<< END print $string; $i++; } } if ($i > 0) { my $perc = sprintf("%.1f",($i?$i/$ctr*100:0)); print "\n$sdir - $ctr files examined, $i errors noted ($perc%).\n\n"; } else { print "\n$sdir - No discrepancies found in $ctr files\n\n"; } $tctr+=$ctr; $terrors+=$i; } } } my $end_time = time; print "process took " . ($end_time - $start_time) . " seconds\n"; my $perc = sprintf("%.1f",($terrors?$terrors/$tctr*100:0)); print "total messages found $tctr with $terrors errors ($perc%)\n";