#!/usr/bin/perl -w

use strict;

# Positional arguments:
#   $dir     - directory holding (or to hold) the filtered model files
#   $pharaoh - decoder command to run
#   $config  - original decoder configuration (ini) file
#   $input   - input text used to select the phrases to keep
my ($dir, $pharaoh, $config, $input) = @ARGV[0 .. 3];

# Longest phrase (in words) collected from the input text.
my $MAX_LENGTH = 10;

# Everything after the fourth argument is handed on to the decoder; each
# argument is prefixed with a single blank.
my $parameters = join("", map { " ".$_ } @ARGV[4 .. $#ARGV]);

# "-norun" is consumed here (first occurrence only) and never reaches the
# decoder; it requests that the filtered files are prepared without decoding.
my $norun = ($parameters =~ s/-norun//) ? 1 : 0;

# Opens a phrase or distortion table for reading and returns the handle.
# Files ending in ".gz" are read through zcat; if $file does not exist but
# "$file.gz" does, the compressed file is used instead. Surrounding
# whitespace in the name is ignored. Dies when the table cannot be opened.
sub open_table {
    my ($file) = @_;
    $file =~ s/^\s+|\s+$//g;
    my $fh;
    if ($file =~ /\.gz$/) {
        # list form: the file name is never interpreted by a shell
        open($fh, "-|", "zcat", $file)
            or die "cannot run zcat on $file: $!\n";
    }
    elsif (! -e $file && -e "$file.gz") {
        open($fh, "-|", "zcat", "$file.gz")
            or die "cannot run zcat on $file.gz: $!\n";
    }
    else {
        open($fh, "<", $file) or die "cannot open $file: $!\n";
    }
    return $fh;
}

# Copies to $out_path every entry of the table $in_path whose foreign phrase
# is a key of %$phrase_used. The foreign phrase is field $foreign_field
# (0-based) of the " ||| "-separated entry; when $strip_space is true one
# trailing blank is removed from it before the lookup.
# Returns (number of entries kept, number of entries read).
sub filter_table {
    my ($in_path, $out_path, $foreign_field, $strip_space, $phrase_used) = @_;
    my ($used, $total) = (0, 0);
    my $in = open_table($in_path);
    open(my $out, ">", $out_path) or die "cannot write $out_path: $!\n";
    while (my $entry = <$in>) {
        $total++;
        my $foreign = (split(/ \|\|\| /, $entry))[$foreign_field];
        next unless defined $foreign;    # blank or malformed entry
        $foreign =~ s/ $// if $strip_space;
        if (exists $phrase_used->{$foreign}) {
            print $out $entry;
            $used++;
        }
    }
    # for a zcat pipe a failed close means the decompressor reported an error
    close($in) or print STDERR "WARNING: problem while reading $in_path\n";
    close($out) or die "cannot finish writing $out_path: $!\n";
    return ($used, $total);
}

# Set to 1 when the config has a single [ttable-file] section instead of the
# separate ttable-file-f2n / ttable-file-n2f sections.
my $v2 = 0;

# "$dir/info" is written last, so a directory without it is incomplete.
if (-e $dir && ! -e "$dir/info") {
    print STDERR "previous filter run crashed. delete $dir!\n";
    exit(1);
}
if (! -e $dir) {
    # Open both inputs before creating $dir, so that a mistyped file name
    # does not leave a half-built directory behind.
    open(my $ini, "<", $config)
        or die "cannot open config file $config: $!\n";
    open(my $in, "<", $input)
        or die "cannot open input file $input: $!\n";

    print STDERR "mkdir $dir\n";
    system("mkdir", "-p", $dir) == 0
        or die "cannot create directory $dir\n";

    # Copy the config to $dir/pharaoh.ini, replacing the table and
    # distortion file names by their filtered counterparts in $dir and
    # remembering the original locations.
    my ($f2n,$n2f,$table,@DISTORTION,@DISTORTION_OUT);
    open(my $ini_out, ">", "$dir/pharaoh.ini")
        or die "cannot write $dir/pharaoh.ini: $!\n";
    while (my $line = <$ini>) {
        print $ini_out $line;
        if ($line =~ /ttable-file-f2n/) {
            print $ini_out "$dir/phrase-table.f2n\n";
            $f2n = <$ini>;
        }
        elsif ($line =~ /ttable-file-n2f/) {
            print $ini_out "$dir/phrase-table.n2f\n";
            $n2f = <$ini>;
        }
        elsif ($line =~ /ttable-file\]/) {
            print $ini_out "$dir/phrase-table\n";
            $table = <$ini>;
            $v2 = 1;
        }
        elsif ($line =~ /distortion-file/) {
            # every following line that does not start with '#' or
            # whitespace names one distortion file
            my $distortion = <$ini>;
            while (defined $distortion && $distortion =~ /^[^\#\s]/) {
                my $out = $distortion;
                $out =~ s/^.*\/+([^\/]+)/$1/g;    # drop the directory part
                print $ini_out "$dir/$out";
                chomp($distortion);
                chomp($out);
                push @DISTORTION,$distortion;
                push @DISTORTION_OUT,$out;
                $distortion = <$ini>;
            }
            # the line that ended the list is passed through unchanged
            print $ini_out $distortion if defined $distortion;
        }
    }
    close($ini);
    close($ini_out) or die "cannot finish writing $dir/pharaoh.ini: $!\n";

    # The table names were read as raw lines; remove the line ending (and
    # any other trailing whitespace) from them.
    foreach my $path ($f2n, $n2f, $table) {
        $path =~ s/\s+$// if defined $path;
    }

    # Files given on the command line take precedence over the config.
    if ($parameters =~ /^(.*)-distortion-file +(\S.*?) +(\-.+)$/ ||
        $parameters =~ /^(.*)-distortion-file +(\S.*)()$/) {
        my $files = $2;
        @DISTORTION = ();
        @DISTORTION_OUT = ();
        foreach my $distortion (split(/ +/,$files)) {
            my $out = $distortion;
            $out =~ s/^.*\/+([^\/]+)/$1/g;
            push @DISTORTION,$distortion;
            push @DISTORTION_OUT,$out;
        }
    }
    if ($parameters =~ /^(.*)-ttable-file +(\S.*?)( +\-.+)$/ ||
        $parameters =~ /^(.*)-ttable-file +(\S+)()$/) {
        $table = $2;
    }

    # Collect every phrase of up to $MAX_LENGTH words occurring in the input.
    my %PHRASE_USED;
    while (my $line = <$in>) {
        chomp($line);
        my @WORD = split(/ +/,$line);
        for my $first (0 .. $#WORD) {
            my $last = $first + $MAX_LENGTH - 1;
            $last = $#WORD if $last > $#WORD;
            for my $end ($first .. $last) {
                $PHRASE_USED{ join(" ", @WORD[$first .. $end]) }++;
            }
        }
    }
    close($in);

    # Filter the phrase table(s). Each entry lists: source file, name of the
    # filtered file in $dir, and the position of the foreign phrase in an
    # entry (first field, except in the f2n table where it comes second).
    my @TABLES = $v2
        ? ([$table, "phrase-table", 0])
        : ([$n2f, "phrase-table.n2f", 0], [$f2n, "phrase-table.f2n", 1]);
    foreach my $spec (@TABLES) {
        my ($file, $name, $foreign_field) = @$spec;
        die "no source file for $name found in $config\n"
            unless defined $file && $file ne "";
        my ($used,$total) =
            filter_table($file, "$dir/$name", $foreign_field, $v2, \%PHRASE_USED);
        my $percent = $total ? 100*$used/$total : 0;
        printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",$percent,'%';
    }

    # filter distortion files
    for my $i (0 .. $#DISTORTION) {
        my $file = $DISTORTION[$i];
        my $out = $DISTORTION_OUT[$i];
        next if -e "$dir/$out";    # already produced
        print STDERR "processing $file -> $dir/$out\n";
        my ($used,$total) =
            filter_table($file, "$dir/$out", 0, 1, \%PHRASE_USED);
        my $percent = $total ? 100*$used/$total : 0;
        printf STDERR "$used of $total phrases pairs used from %s (%.2f%s) - note: max length $MAX_LENGTH\n",$out,$percent,'%';
    }

    # Written last: its presence marks the directory as complete.
    open(my $info, ">", "$dir/info") or die "cannot write $dir/info: $!\n";
    print $info "$config\n$input\n";
    close($info) or die "cannot finish writing $dir/info: $!\n";
}
else {
    # The directory is complete; check that it was built from the same
    # config and input (an input with an added ".tagged" suffix is accepted).
    my @INFO;
    if (open(my $info, "<", "$dir/info")) {
        @INFO = <$info>;
        close($info);
        chomp(@INFO);
    }
    else {
        print STDERR "WARNING: cannot read $dir/info: $!\n";
    }
    my $old_config = defined $INFO[0] ? $INFO[0] : "";
    my $old_input = defined $INFO[1] ? $INFO[1] : "";
    if ($old_config ne $config ||
        ($old_input ne $input && $old_input.".tagged" ne $input)) {
        print STDERR "WARNING: directory does not match parameters: ($old_config ne $config || $old_input ne $input)\n";
    }
    print STDERR "reusing cached filtered files\n";
}

# With -norun only the filtered files are prepared; the decoder is not started.
exit if $norun;

# Redirect -distortion-file arguments to the filtered copies in $dir. The
# option may carry several blank-separated file names, which stay separate
# arguments after the rewrite.
if ($parameters =~ /^(.*)-distortion-file +(\S.*?)( +-.+)$/ ||
    $parameters =~ /^(.*)-distortion-file +(\S.*)()$/) {
    my ($pre,$files,$post) = ($1,$2,$3);
    my @filtered;
    foreach my $distortion (split(/ +/,$files)) {
        my $out = $distortion;
        $out =~ s/^.*\/+([^\/]+)/$1/g;    # drop the directory part
        push @filtered, "$dir/$out";
    }
    $parameters = "$pre -distortion-file ".join(" ",@filtered).$post;
}

# A -ttable-file argument is dropped altogether: the phrase table location
# comes from $dir/pharaoh.ini.
if ($parameters =~ /^(.*)-ttable-file +(\S+)( +-.+)$/ ||
    $parameters =~ /^(.*)-ttable-file +(\S+)()$/) {
    $parameters = $1.$3;
}

# Echo the command on STDERR, then run it and pass its output on to STDOUT.
print STDERR "$pharaoh -f $dir/pharaoh.ini $parameters < $input\n";
print `$pharaoh -f $dir/pharaoh.ini $parameters < $input`;
