#!/usr/bin/perl -w

# Filter the phrase table(s) named in a decoder configuration file down to
# the entries whose source-side phrase occurs in a given input file, cache
# the result in a directory, and then run the decoder on the input using the
# filtered configuration.
#
# Usage: SCRIPT DIR DECODER CONFIG INPUT [decoder parameters ...] [-norun]
#
#   DIR      cache directory; created and filled when it does not exist,
#            otherwise reused after checking it was built for the same
#            CONFIG and INPUT
#   DECODER  decoder command to run
#   CONFIG   decoder configuration file (copied to DIR/pharaoh.ini with the
#            phrase table paths rewritten)
#   INPUT    text to decode, one sentence per line, words separated by spaces
#   -norun   only build (or validate) the cache, do not run the decoder

use strict;
use warnings;

die "usage: $0 dir decoder config input [decoder parameters] [-norun]\n"
    if @ARGV < 4;

my ($dir, $pharaoh, $config, $input) = @ARGV[0 .. 3];

# Longest phrase (in words) that is looked up in the phrase table.
my $MAX_LENGTH = 10;

# Everything after the four fixed arguments is handed to the decoder verbatim.
my $parameters = join('', map { " $_" } @ARGV[4 .. $#ARGV]);

# "-norun" is an option for this script, not for the decoder: strip it out.
my $norun = 0;
if ($parameters =~ s/-norun//) {
    $norun = 1;
}

# Set when the configuration uses a single "[ttable-file]" section instead of
# the separate f2n / n2f tables.
my $v2 = 0;

if (! -e $dir) {
    print STDERR "mkdir $dir\n";
    mkdir($dir) or die "cannot create directory $dir: $!\n";

    # Copy the configuration, replacing the line that follows each phrase
    # table section header with the path of the filtered table, and remember
    # the original path.
    my ($f2n, $n2f, $table);
    open(my $ini_in, '<', $config)
        or die "cannot read $config: $!\n";
    open(my $ini_out, '>', "$dir/pharaoh.ini")
        or die "cannot write $dir/pharaoh.ini: $!\n";
    while (my $line = <$ini_in>) {
        print {$ini_out} $line;
        if ($line =~ /ttable-file-f2n/) {
            print {$ini_out} "$dir/phrase-table.f2n\n";
            $f2n = read_chomped_line($ini_in);
        }
        elsif ($line =~ /ttable-file-n2f/) {
            print {$ini_out} "$dir/phrase-table.n2f\n";
            $n2f = read_chomped_line($ini_in);
        }
        elsif ($line =~ /ttable-file\]/) {
            print {$ini_out} "$dir/phrase-table\n";
            $table = read_chomped_line($ini_in);
            $v2 = 1;
        }
    }
    close($ini_in);
    close($ini_out) or die "cannot write $dir/pharaoh.ini: $!\n";

    my $phrase_used = collect_input_phrases($input, $MAX_LENGTH);

    # Each table carries its own output suffix and the position of the
    # source-side ("foreign") phrase, so nothing has to be re-derived by
    # comparing path strings later on.
    my @tables = $v2
        ? ( { path => $table, suffix => '',     foreign_index => 0 } )
        : ( { path => $n2f,   suffix => '.n2f', foreign_index => 0 },
            { path => $f2n,   suffix => '.f2n', foreign_index => 1 } );

    foreach my $spec (@tables) {
        my $path = $spec->{path};
        die "no phrase table path found in $config for phrase-table$spec->{suffix}\n"
            unless defined $path && length $path;

        # Fall back to the gzipped table when only that one exists.
        my $table_in;
        if (! -e $path && -e "$path.gz") {
            open($table_in, '-|', 'zcat', "$path.gz")
                or die "cannot run zcat on $path.gz: $!\n";
        }
        else {
            open($table_in, '<', $path)
                or die "cannot read $path: $!\n";
        }

        my $out_path = "$dir/phrase-table$spec->{suffix}";
        open(my $table_out, '>', $out_path)
            or die "cannot write $out_path: $!\n";

        my ($used, $total) = (0, 0);
        while (my $entry = <$table_in>) {
            $total++;
            my @field   = split(/ \|\|\| /, $entry);
            my $foreign = $field[ $spec->{foreign_index} ];
            next unless defined $foreign;    # malformed entry: count, drop
            $foreign =~ s/ $// if $v2;
            if (defined $phrase_used->{$foreign}) {
                print {$table_out} $entry;
                $used++;
            }
        }
        close($table_in);
        close($table_out) or die "cannot write $out_path: $!\n";

        # An empty table must not abort the run with a division by zero.
        my $percent = $total ? 100 * $used / $total : 0;
        printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",
            $percent, '%';
    }

    # Record what this cache was built from so a later run can validate it.
    open(my $info_out, '>', "$dir/info")
        or die "cannot write $dir/info: $!\n";
    print {$info_out} "$config\n$input\n";
    close($info_out) or die "cannot write $dir/info: $!\n";
}
else {
    open(my $info_in, '<', "$dir/info")
        or die "cannot read $dir/info: $!\n";
    my @INFO = <$info_in>;
    close($info_in);
    chomp(@INFO);

    # A truncated info file is treated as a mismatch, not as an undef warning.
    for my $i (0, 1) {
        $INFO[$i] = '' unless defined $INFO[$i];
    }

    # The cache is also accepted when it was built for INPUT without a
    # trailing ".tagged".
    if ($INFO[0] ne $config
        || ($INFO[1] ne $input && $INFO[1] . ".tagged" ne $input)) {
        print STDERR "directory does not match parameters: ($INFO[0] ne $config || $INFO[1] ne $input)\n";
        exit(1);
    }
    print STDERR "reusing cached filtered files\n";
}

exit if $norun;

# The decoder parameters are one free-form string, so this goes through the
# shell; NOTE(review): paths containing shell metacharacters are not escaped.
print STDERR "$pharaoh -f $dir/pharaoh.ini $parameters < $input\n";
print `$pharaoh -f $dir/pharaoh.ini $parameters < $input`;

# Read the next line from the given filehandle and return it without its
# trailing newline, or undef at end of file.
sub read_chomped_line {
    my ($fh) = @_;
    my $line = <$fh>;
    chomp($line) if defined $line;
    return $line;
}

# Return a reference to a hash whose keys are all phrases of up to
# $max_length consecutive space-separated words found on any line of the
# file $path; the values count the occurrences.
sub collect_input_phrases {
    my ($path, $max_length) = @_;
    my %phrase_used;
    open(my $in, '<', $path) or die "cannot read $path: $!\n";
    while (my $line = <$in>) {
        chomp($line);
        my @word = split(/ +/, $line);
        for my $start (0 .. $#word) {
            my $last = $start + $max_length - 1;
            $last = $#word if $last > $#word;
            for my $end ($start .. $last) {
                $phrase_used{ join(' ', @word[$start .. $end]) }++;
            }
        }
    }
    close($in);
    return \%phrase_used;
}