#!/usr/bin/perl
# ---------------------------------------------------------------------------------------------
#
# remove_ignorewords.pl --- Removes all ignore words from all corpus files
#
#
# This program authored by Scott W Leighton (helphand@pacbell.net)
# for use with Popfile and it's components, which are Copyrighted
# by John Graham-Cumming. The author hereby contributes this code
# to the Popfile project under the terms of the Popfile License
# Agreement.    /Scott W Leighton/  May 21, 2003
#
#
# Popfile
# Copyright (c) 2001-2003 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------
#
# Usage: remove_ignorewords.pl [stopwords-file]
#
# Reads popfile.cfg from the current directory to locate the corpus
# directory (config key "corpus", then "bayes_corpus", defaulting to
# "corpus"), loads one ignore word per line from the stopwords file
# (first argument, default "stopwords"), and rewrites the "table" file of
# every bucket that contains at least one of those words.
#
# Exits via die when popfile.cfg cannot be opened, the corpus directory is
# missing or unreadable, or a bucket table reports a corpus version other
# than 1.  A missing stopwords file is reported and is not an error.

use strict;
use warnings;

my $file = $ARGV[0] || 'stopwords';
my %stopwords = ();

my $fn = 'popfile.cfg';
my %cfg;

# --- Load popfile.cfg: each line is "<key> <value>"; a later duplicate wins.
open my $config_fh, '<', $fn
    or die "Unable to open popfile.cfg: $!";

while ( my $line = <$config_fh> ) {
    $line =~ s/(\015|\012)//g;
    if ( $line =~ /(\S+) (.+)/ ) {
        # Copy the captures before anything else can clobber $1/$2.
        my ( $key, $value ) = ( $1, $2 );
        if ( defined $cfg{$key} ) {
            print "Popfile.cfg entry $key with value $value is duplicated (last occurance is the one used, you should remove all duplicate $key entries)\n";
        }
        $cfg{$key} = $value;
    }
}
close $config_fh;

# --- Load the ignore words.  Three-arg open keeps characters in a
# user-supplied file name from being interpreted as an open mode.
my $stops_fh;
if ( !open $stops_fh, '<', $file ) {
    print "No action taken, $file file not found\n";
    exit 0;
}

while ( my $word = <$stops_fh> ) {
    $word =~ s/[\r\n]//g;
    next if $word eq '';    # a blank line can never match a corpus word
    $stopwords{$word} = 1;
}
close $stops_fh;

my $corpus = $cfg{corpus} || $cfg{bayes_corpus} || "corpus";

die "corpus dir '$corpus' does not exist\n" unless -d $corpus;

# --- Enumerate buckets with readdir rather than glob("$corpus/*"): glob
# splits its pattern on whitespace and treats metacharacters in the corpus
# path specially, and splitting the resulting path on "/" only worked when
# the corpus path was a single component.  Dot entries are skipped to match
# what the "*" pattern used to return.
opendir my $corpus_dh, $corpus
    or die "Unable to read corpus dir '$corpus': $!\n";
my @buckets = sort grep { !/^\./ } readdir $corpus_dh;
closedir $corpus_dh;

BUCKET:
foreach my $bucket (@buckets) {
    my $table = "$corpus/$bucket/table";

    # Entries without a readable table (plain files, empty directories)
    # are skipped silently, as before.
    open my $in_fh, '<', $table or next BUCKET;

    # word => count for this bucket only; nothing is retained across buckets.
    my %count_of;
    while ( my $line = <$in_fh> ) {
        if ( $line =~ /__CORPUS__ __VERSION__ (\d+)/ ) {
            if ( $1 != 1 ) {
                die "Incompatible corpus version for bucket $bucket\n";
            }
            next;
        }
        $count_of{$1} = $2 if ( $line =~ /([^\s]+) (\d+)/ );
    }
    close $in_fh;

    # Sorted so the report is reproducible from run to run.
    my $removed = 0;
    foreach my $ignore ( sort keys %stopwords ) {
        next unless exists $count_of{$ignore};
        $removed++;
        delete $count_of{$ignore};
        print "removed '$ignore' from $bucket\n";
    }

    # Leave untouched tables exactly as they are on disk.
    next BUCKET unless $removed;

    my $out_fh;
    if ( !open $out_fh, '>', $table ) {
        # Best effort per bucket: report the failure and carry on, but do
        # not claim the corpus was modified.
        warn "Unable to rewrite $table: $!\n";
        next BUCKET;
    }

    print {$out_fh} "__CORPUS__ __VERSION__ 1\n";
    foreach my $word ( sort keys %count_of ) {
        print {$out_fh} "$word $count_of{$word}\n" if ( $count_of{$word} > 0 );
    }

    # Buffered write errors only surface at close.
    if ( !close $out_fh ) {
        warn "Error while writing $table: $!\n";
        next BUCKET;
    }

    print "Corpus $bucket was modified, $removed ignore words removed\n";
}