dump_corpus_2csv.pl
#!/usr/bin/perl -w
# ---------------------------------------------------------------------------------------------
#
# dump_corpus_2csv.pl --- dumps the corpus buckets, with scoring and probability
# added, to an excel compatible csv file for analysis with excel
#
# This program authored by Scott W Leighton (helphand@pacbell.net)
# for use with Popfile and it's components, which are Copyrighted
# by John Graham-Cumming. The author hereby contributes this code
# to the Popfile project under the terms of the Popfile License
# Agreement. /Scott W Leighton/ June 26, 2003
#
# Revised June 26, 2003 - added bucket counts
# June 29, 2003 - quote the word and bucket names
# Sept 14, 2003 - Updated for change to POPFile v 0.20.0
#
# Popfile
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------
use strict;
use warnings;
my %components;
# ---------------------------------------------------------------------------------------------
#
# load_modules
#
# Called to load specific POPFile loadable modules (implemented as .pm files with special
# comment on first line) in a specific subdirectory
#
# $directory The directory to search for loadable modules
# $type The 'type' of module being loaded (e.g. proxy, core, ui) which is used
# below when fixing up references between modules (e.g. proxy modules all
# need access to the classifier module)
# $module The specific module name to be loaded.
#
# ---------------------------------------------------------------------------------------------
sub load_modules {
my ( $directory, $type, $module ) = @_;
$module = $directory . '/' . $module;
if ( open MODULE, "<$module" ) {
my $first = <MODULE>;
close MODULE;
if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) {
require $module;
$module =~ s/\//::/;
$module =~ s/\.pm//;
my $mod = new $module;
my $name = $mod->name();
$components{$type}{$name} = $mod;
}
}
}
#
# Main
#
#
# Load the modules we'll be using
#
load_modules( 'POPFile', 'core', 'Configuration.pm' );
load_modules( 'POPFile', 'core', 'Logger.pm' );
load_modules( 'POPFile', 'core', 'MQ.pm' );
load_modules( 'Classifier', 'classifier', 'Bayes.pm' );
# Do not run if we are not on version 0.19.0 or higher
if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) {
# link each of the objects with the configuration object and
# the logger and the mq
foreach my $type (keys %components) {
foreach my $name (keys %{$components{$type}}) {
$components{$type}{$name}->configuration($components{core}{config});
$components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' );
$components{$type}{$name}->mq($components{core}{mq}) if ($components{$type}{$name}->can ( 'mq' )) ;
}
}
#
# Tell each module to initialize itself
#
foreach my $type (keys %components) {
foreach my $name (keys %{$components{$type}}) {
if ($components{$type}{$name}->initialize() == 0 ) {
die "Failed to start while initializing the $name module\n";
}
}
}
# Ensure that a temp subdirectory exists to hold any error log
# that logger might generate so we don't interfere with a
# running POPFile.
mkdir ( 'temp' );
# Set default quote and separator characters
$components{core}{config}->parameter('csv_quote','"');
$components{core}{config}->parameter('csv_separator',',');
# Load in the Popfile configuration parameters, any configured
# ones will override the initialized default values
# NOTE: We are intentially NOT saving this configuration
# back to disk since the parameters we are allowing
# for this program are NOT legal Popfile parameters.
$components{core}{config}->load_configuration();
# override the logdir and piddir so we don't mess with
# the production ones
$components{core}{config}->parameter('logger_logdir','temp/');
$components{core}{config}->parameter('config_piddir','temp/');
# Now grab any commandline parameters, they will override
# the defaults and those in popfile.cfg. As a byproduct,
# if the user overrides our csv_quote or csv_separator
# parameter, this will pick it up for us.
$components{core}{config}->parse_command_line();
# force logger to recognize the new logdir before we startup
# the modules. That way we will not inadvertently log to the
# production POPFile log.
$components{core}{logger}->service();
# now that the configuration is established, tell each module
# to start
foreach my $type (keys %components) {
foreach my $name (keys %{$components{$type}}) {
if ($components{$type}{$name}->start() == 0 ) {
die "Failed to start while starting the $name module\n";
}
}
}
#
# Define some global work areas
#
my %wordhash = ();
my %wordcounts =();
my %words = ();
my %globalcount = ();
my $fn = 'dump_corpus.csv';
open CSV, ">$fn" or die "Unable to open ${fn} :$!\n";
print CSV join ( $components{core}{config}->parameter("csv_separator"),
wrap_in_quotes($components{core}{config}->parameter("csv_quote"),
qw ( BucketName
Word
BucketCount
WordCount
%Bucket
%Total
Score
Probability
)
));
print CSV "\n";
# Have the classifier give us the buckets
my @buckets = $components{classifier}{bayes}->get_buckets();
#
# Go thru each bucket, grab the word list and word counts
#
# The API calls changed between v 0.19.0 and 0.20.0, so
# check for version, if 0.20.0 or higher, use new bucket
# API, otherwise, use old API calls
if ($components{classifier}{bayes}->can ( 'get_bucket_word_prefixes' ) ) {
foreach my $bucket (@buckets) {
if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
for my $j ( $components{classifier}{bayes}->get_bucket_word_list( $bucket, '.' ) ) {
my $word = $j;
my $count = $components{classifier}{bayes}->get_count_for_word( $bucket, $j );
$wordhash{$bucket}{$word}{c}=$count;
$wordcounts{$bucket}+=$count;
$words{$bucket}+=1;
$globalcount{words}+=1;
$globalcount{wordcount}+=$count;
}
}
}
}else {
foreach my $bucket (@buckets) {
if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) {
for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) {
if (defined ($i)) {
my $j = $i;
while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) {
my $word = $1;
my $count = $2;
$wordhash{$bucket}{$word}{c}=$count;
$wordcounts{$bucket}+=$count;
$words{$bucket}+=1;
$globalcount{words}+=1;
$globalcount{wordcount}+=$count;
}
}
}
}
}
}
#
# Sort by simple word count
#
foreach my $bucket (sort keys %wordhash) {
my @keys = map { $_->[1] }
sort {
$b->[0] <=> $a->[0]
||
length($b->[0]) <=> length($a->[0])
||
$a->[0] cmp $b->[0]
}
map { [$wordhash{$bucket}{$_}{c},$_] }
keys %{$wordhash{$bucket}};
#
# Calculate the score and probability for each word
#
foreach my $word (@keys) {
my $max = 0;
my $max_bucket = '';
my $total = 0;
foreach my $x (@buckets) {
if (defined($wordhash{$x}{$word}) && exists ($wordhash{$x}{$word}{c}) && $wordhash{$x}{$word}{c} > 0) {
$wordhash{$bucket}{$word}{b}++;
my $prob = exp(log($wordhash{$x}{$word}{c}/$wordcounts{$x}));
$total += $prob;
if ($prob > $max) {
$max = $prob;
$max_bucket = $bucket;
}
} else {
$total+= (0.10 / $globalcount{wordcount});
}
}
if (defined($wordhash{$bucket}{$word}{c}) && $wordhash{$bucket}{$word}{c} > 0) {
my $prob = exp(log($wordhash{$bucket}{$word}{c}/$wordcounts{$bucket}));
my $n = ($total > 0)?$prob / $total:0;
my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0;
$wordhash{$bucket}{$word}{s}=$score;
$wordhash{$bucket}{$word}{p}=$n;
}
}
#
# Sort by probability
#
@keys = map { $_->[1] }
sort {
$b->[0] <=> $a->[0]
||
length($b->[0]) <=> length($a->[0])
||
$a->[0] cmp $b->[0]
}
map { [$wordhash{$bucket}{$_}{p},$_] }
keys %{$wordhash{$bucket}};
for my $i ( 0 .. $#keys ) {
if (defined $wordhash{$bucket}{$keys[$i]}{c} ) {
print CSV join ( $components{core}{config}->parameter('csv_separator'),
wrap_in_quotes($components{core}{config}->parameter('csv_quote'),
(
$bucket,
$keys[$i],
),(
$wordhash{$bucket}{$keys[$i]}{b},
$wordhash{$bucket}{$keys[$i]}{c},
sprintf("%.8f",($wordcounts{$bucket}?$wordhash{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) ,
sprintf("%.8f",($globalcount{wordcount}?$wordhash{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) ,
sprintf("%.10f",$wordhash{$bucket}{$keys[$i]}{s}),
$wordhash{$bucket}{$keys[$i]}{p})
));
print CSV "\n";
}
}
}
close CSV;
#
# Cleanup - Get rid of the popfile.pid file created by the configuration
# module.
#
unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid');
# All Done
} else {
print "$0 is compatible only with Popfile version 0.19.0 or above\n";
}
#
# Routine to wrap array values in quotes
#
sub wrap_in_quotes {
my ($default_quote, @list) = @_;
my @newlist;
for (@list) {
push @newlist,$default_quote . $_ . $default_quote;
}
return @newlist;
}