#!/usr/bin/perl -w

use strict;

# Locations of the external tools this script shells out to.
my $___CARMEL = "/projects/nlp/koehn/bin/carmel";
my $___PHARAOH = "/projects/nlp/koehn/bin/pharaoh.2004-05-10";

# All seven positional arguments must be present.  The decoder parameter
# string may be empty (""), but it has to be passed so that the output file
# stays in the seventh position; without it the result file is never written.
if (scalar(@ARGV) < 7) {
    die "usage: $0 config filtered-dir working-dir dev-f n-best-list-size parameters outfile\n";
}

my $___CONFIG = $ARGV[0];           # passed through to run-filtered-pharaoh.perl
my $___FILTERED_DIR = $ARGV[1];     # passed through to run-filtered-pharaoh.perl
my $___WORKING_DIR = $ARGV[2];      # created if missing; holds all intermediate files
my $___DEV_F = $ARGV[3];            # input sentences, one per line
my $___N_BEST_LIST_SIZE = $ARGV[4]; # handed to carmel and used in intermediate file names
my $___PARAMETERS = $ARGV[5];       # extra decoder options (may be empty)
my $___OUTFILE = $ARGV[6];          # consolidated n-best list written in step (6)

my $verbose = 4;

# Decoder options default to none; an example of a typical setting:
#my $pharaoh_parameters = "-dl 4 -b 0.1 -ttable-limit 100";
my $pharaoh_parameters = "";
$pharaoh_parameters = $___PARAMETERS if $___PARAMETERS;

### (1) run decoder ###
# Runs the decoder wrapper once over the whole input.  The wrapper is given
# "-l $___WORKING_DIR/sentence" (presumably the lattice file prefix -- the
# later steps read $___WORKING_DIR/sentence.NNNN and sentence.NNNN.state),
# and its standard output is stored in $___WORKING_DIR/one-best-translation.
print STDERR "NBEST (1) run beamdecoder to produce lattices @ ".`date`;
`mkdir -p $___WORKING_DIR`;
# Nothing below can work without the working directory, so stop right away.
die "could not create working directory $___WORKING_DIR\n" if $? != 0;
# Build the command once so that the logged line is exactly what is executed
# (the log used to name a different output file than the one actually written).
my $decoder_command = "run-filtered-pharaoh.perl $___FILTERED_DIR $___PHARAOH $___CONFIG $___DEV_F $pharaoh_parameters -l $___WORKING_DIR/sentence < $___DEV_F > $___WORKING_DIR/one-best-translation";
print STDERR "$decoder_command\n";
`$decoder_command`;

### (2) extract n-best list from lattice ###
# Runs carmel once per input sentence on $___WORKING_DIR/sentence.NNNN and
# stores its output in $___WORKING_DIR/sentence.best<N>.NNNN.carmel.
# NOTE(review): "-mk <N>" is assumed to request the N best paths -- confirm
# against the carmel documentation.
print STDERR "NBEST (2) extract n-best list from lattices @ ".`date`;
my $sentence_count = `cat $___DEV_F | wc -l`;
# wc terminates (and on some systems pads) its output with whitespace.  Keep
# only the digits, because this value is later interpolated into a log line
# and into a decoder command line.
$sentence_count = ($sentence_count =~ /(\d+)/) ? $1 : 0;
print STDERR "no sentences found in $___DEV_F\n" if $sentence_count == 0;
for my $sentence_number (0 .. $sentence_count-1) {
    # File names use the zero-padded, four-digit sentence number.
    my $sentence = sprintf("%04d",$sentence_number);
    `$___CARMEL -mk $___N_BEST_LIST_SIZE $___WORKING_DIR/sentence.$sentence > $___WORKING_DIR/sentence.best$___N_BEST_LIST_SIZE.$sentence.carmel`;
}

### (3) prepare for component scoring ###
# For every input sentence, converts the paths listed in the carmel output
# (sentence.best<N>.NNNN.carmel) into one line per hypothesis in
# $___WORKING_DIR/sentence.best<N>.NNNN, of the form
#   E <english> F <foreign> D <distortion> P <weight> ... T <rest of line>
# and remembers each hypothesis' translation and alignment in @TRANSLATION
# and @ALIGNMENT (indexed by sentence number) for step (6).
my(@TRANSLATION,@ALIGNMENT);
print STDERR "NBEST (3) prepare n-best list for component scoring @ ".`date`;
open(my $dev_fh, '<', $___DEV_F) or die "cannot read $___DEV_F: $!\n";
for(my $sentence_number=0;$sentence_number<$sentence_count;$sentence_number++) {
    # Zero-padded id used in all per-sentence file names.
    my $sentence = sprintf("%04d",$sentence_number);
    print STDERR ".";
    print STDERR $sentence unless ($sentence_number % 100);

    my $nbest_file = "$___WORKING_DIR/sentence.best$___N_BEST_LIST_SIZE.$sentence";
    open(my $out_fh, '>', $nbest_file) or die "cannot write $nbest_file: $!\n";
    my $foreign = <$dev_fh>;
    $foreign = "" unless defined $foreign;
    # chomp, not chop: a final line without newline must keep its last character
    chomp($foreign);
    my @FOREIGN = split(/ /,$foreign);

    # get additional state information
    # (which foreign words are covered by transitions)
    # Each line of the state file is "<state> <coverage>", where the coverage
    # is compared character by character against the foreign word positions.
    my %WORDS_COVERED;
    my $state_file = "$___WORKING_DIR/sentence.$sentence.state";
    if (open(my $state_fh, '<', $state_file)) {
        while(<$state_fh>) {
            chomp;
            my ($state,$words_covered) = split;
            next unless defined $state;     # skip blank lines
            $WORDS_COVERED{$state} = $words_covered;
        }
        close($state_fh);
    }
    else {
        print STDERR "cannot read $state_file: $!\n";
    }

    # The first line of the lattice file names the final state; that state
    # covers every foreign word.
    my $final_state = "";
    my $lattice_file = "$___WORKING_DIR/sentence.$sentence";
    if (open(my $lattice_fh, '<', $lattice_file)) {
        my $first_line = <$lattice_fh>;
        close($lattice_fh);
        if (defined $first_line) {
            chomp($first_line);
            $final_state = $first_line;
        }
    }
    else {
        print STDERR "cannot read $lattice_file: $!\n";
    }
    $WORDS_COVERED{$final_state} = "1" x scalar(@FOREIGN);

    # use n-best list generated by carmel,
    # and prepare as input for rescoring by the beam search decoder
    my $carmel_file = "$nbest_file.carmel";
    my $nbest_fh;
    if (!open($nbest_fh, '<', $carmel_file)) {
        # No hypotheses for this sentence: leave its n-best file empty.
        print STDERR "cannot read $carmel_file: $!\n";
        close($out_fh);
        next;
    }
    while(my $line = <$nbest_fh>) {
        chomp($line);
        last if $line eq "0";
        my $translation = "";
        my $alignment = "";
        my $out = "";
        # Coverage before the first transition: no foreign word translated yet.
        my $words_covered = "0" x scalar(@FOREIGN);
        my $previous_last_foreign = -1;
        my $first_english = 0;
        my $last_english;
        # Consume the path one transition at a time; each transition looks like
        #   ("<english>" : "<...>" / <weight> -> <state>) <rest>
        while($line =~ /^\("(.+?)" : ".+?" \/ ([\d\+\-\.lne]+) -> (\d+)\) (.+)/) {
            my ($english,$p,$state,$rest) = ($1,$2,$3,$4);
            $line = $rest;
            $english =~ s/\\\"/\"/g;
            my @ENGLISH = split(/ /,$english);
            $last_english = $first_english + scalar(@ENGLISH)-1;
            my $first_foreign = -1;
            my $last_foreign = -1;
            my $foreign = "";
            if (!defined($WORDS_COVERED{$state})) {
                # die (rather than a bare exit) so the caller sees a failure status
                die "could not find state $state\n";
            }
            # The foreign words translated by this transition are the positions
            # whose coverage differs between the previous and the new state.
            for(my $i=0;$i<=$#FOREIGN;$i++) {
                if (substr($words_covered,$i,1) ne substr($WORDS_COVERED{$state},$i,1)) {
                    $foreign .= " $FOREIGN[$i]";
                    $first_foreign = $i if $first_foreign == -1;
                    $last_foreign = $i;
                }
            }
            # drop the leading separator (safe even if no word was added)
            $foreign =~ s/^ //;
            my $distortion = $first_foreign - $previous_last_foreign -1;
            $previous_last_foreign = $last_foreign;
            $words_covered = $WORDS_COVERED{$state};
            $out .= "E $english F $foreign D $distortion P $p ";
            $translation .= " " unless $translation eq '';
            $translation .= "$english";
            $alignment .= "$first_english-$last_english,$first_foreign-$last_foreign ";
            $first_english = $last_english+1;
        }
        # Whatever remains after the last transition must be a bare number;
        # anything else means the line could not be parsed completely.
        if ($line !~ /^[\d\-\+\.lne]+$/) {
            print {$out_fh} "ERROR: !$line!\n"; last;
        }
        print {$out_fh} $out."T $line\n";
        push @{$TRANSLATION[$sentence_number]},$translation;
        push @{$ALIGNMENT[$sentence_number]}, $alignment;
    }
    close($nbest_fh);
    close($out_fh) or die "error writing $nbest_file: $!\n";
#    `rm $___WORKING_DIR/sentence.$sentence.state`;
#    `rm $___WORKING_DIR/sentence.$sentence`;
}
print STDERR "\n";
close($dev_fh);

### (4) rescore the n-best list using the beamdecoder to get component scores ###
# Calls the decoder wrapper again, this time with "-rd", the common prefix of
# the per-sentence n-best files and the number of sentences.  The command is
# assembled once, echoed to STDERR and then executed; its standard output is
# captured and discarded.
print STDERR "NBEST (4) score lattices with beamdecoder @ ".`date`;
my $rescore_command = join(" ",
    "run-filtered-pharaoh.perl",
    $___FILTERED_DIR,
    $___PHARAOH,
    $___CONFIG,
    $___DEV_F,
    $pharaoh_parameters,
    "-rd",
    "$___WORKING_DIR/sentence.best$___N_BEST_LIST_SIZE",
    $sentence_count);
print STDERR $rescore_command."\n";
qx{$rescore_command}; # if $$>1;

### (5) extract component scores ###
# Reads $___WORKING_DIR/sentence.best<N>.NNNN.rescore for every sentence and
# stores, per line of that file, the component scores pD, pLM, pTM and pWP
# (in that order) as one string in $SCORE[sentence][line].
print STDERR "NBEST (5) extract component scores @ ".`date`;
my @SCORE;
# pD is captured without its leading blank and the other components with it,
# so plain concatenation yields a space-separated list.
my @score_patterns = (
    [ 'pD',  qr/pD: ([^,]+),/ ],
    [ 'pLM', qr/pLM.?0?.?:( [^,]+),/ ],
    [ 'pTM', qr/pTM:( [^,]+),/ ],
    [ 'pWP', qr/pWP:( [^,]+),/ ],
);
for(my $sentence=0;$sentence<$sentence_count;$sentence++) {
    # Only the file name uses the zero-padded id; the loop counter stays numeric.
    my $sentence_id = sprintf("%04d",$sentence);
    my $rescore_file = "$___WORKING_DIR/sentence.best$___N_BEST_LIST_SIZE.$sentence_id.rescore";
    my $rescore_fh;
    if (!open($rescore_fh, '<', $rescore_file)) {
        print STDERR "cannot read $rescore_file: $!\n";
        next;
    }
    while(my $line = <$rescore_fh>) {
        my $scores = "";
        foreach my $component (@score_patterns) {
            my ($name,$pattern) = @$component;
            if ($line =~ $pattern) {
                $scores .= $1;
            }
            else {
                # A failed match leaves $1 holding the previous capture, so
                # never append it blindly: report the gap and add nothing.
                print STDERR "no $name score in $rescore_file line $.\n";
            }
        }
        # One entry per line keeps the index aligned with the hypothesis number.
        push @{$SCORE[$sentence]},$scores;
    }
    close($rescore_fh);
}

### (6) output ###
# Writes the consolidated n-best list to $___OUTFILE, one line per hypothesis:
#   <sentence> <rank> ||| <translation> ||| <alignment>||| <component scores>
# (the alignment string already ends in a blank).
print STDERR "NBEST (6) generate consolidated n-best list file @ ".`date`;
die "no output file given\n" unless defined $___OUTFILE && length $___OUTFILE;
open(my $nbest_out, '>', $___OUTFILE) or die "cannot write $___OUTFILE: $!\n";
for(my $sentence=0;$sentence<$sentence_count;$sentence++) {
    # A sentence without any hypothesis has no entry at all; dereferencing
    # the missing entry would abort the script under "use strict".
    my $translations = $TRANSLATION[$sentence];
    next unless defined $translations;
    for(my $i=0;$i<scalar(@$translations);$i++) {
        my $translation = $translations->[$i];
        my $alignment = $ALIGNMENT[$sentence][$i];
        my $score = $SCORE[$sentence][$i];
        if (!defined $score) {
            # rescoring produced no line for this hypothesis
            print STDERR "no component scores for sentence $sentence hypothesis $i\n";
            $score = "";
        }
        print {$nbest_out} "$sentence $i ||| $translation ||| $alignment||| $score\n";
    }
}
# Buffered write errors only surface when the handle is closed.
close($nbest_out) or die "error writing $___OUTFILE: $!\n";
