#!/usr/bin/perl -w

### Input:
###  k
###  lambdas, in the following order: distortion, LM, TM*, word penalty
###  prefix for temp files
###  options to pass to Pharaoh

### Output (stdout):
###  1 ||| the quick brown fox jumped over the lazy dogs ||| 1.23 1.23 1.23
###  1 ||| quick brown fox jumped over the lazy dogs ||| 1.23 1.23 1.23

my $___CARMEL = "carmel";
my $___PHARAOH = "pharaoh";

my $___N_BEST_LIST_SIZE = $ARGV[0];
my $___LAMBDA = $ARGV[1];
my $___PREFIX = $ARGV[2];
my $___PARAMETERS = $ARGV[3];
$___PHARAOH = $ARGV[4] if $#ARGV >= 4;

my $decoder_config;
if ($___LAMBDA =~ /(.+) CONFIG (.+)/) {
   $decoder_config = $1;
   $___LAMBDA = $2;
}
my @LAMBDA = split(/\s+/, $___LAMBDA);

my @DEV_F;
while (<STDIN>) {
  chop;
  if (!/^\s*$/) {
    push @DEV_F, $_;
  }
}
my $sentence_count = $#DEV_F+1;

my @NBEST_LIST;
my @SCORE;
&run_pharaoh_nbest(\@DEV_F, \@LAMBDA, \@NBEST_LIST, \@SCORE);
for (my $sentence=0; $sentence<$sentence_count; $sentence++) {
  for (my $translation=0; $translation<=$#{$SCORE[$sentence]}; $translation++) {
    print "$sentence ||| $NBEST_LIST[$sentence][$translation] ||| ", join(' ',@{$SCORE[$sentence][$translation]}), "\n";
  }
}

sub run_pharaoh_nbest {
    my $pharaoh_parameters = "-dl 4 -b 0.1 -ttable-limit 100";
    $pharaoh_parameters = $___PARAMETERS if $___PARAMETERS;

    my ($DEV_F, $LAMBDA, $NBEST_LIST, $SCORE) = @_;
    
    print STDERR "run beamdecoder to produce lattices\n";

    my $config;
    if ($decoder_config) {
	$config = sprintf($decoder_config,@{$LAMBDA});
    }
    else {
	$config = "-d $$LAMBDA[0] -lm $$LAMBDA[1] -tm";
	for (my $i=2; $i<=$#LAMBDA-1; $i++) {
	    $config .= " $$LAMBDA[$i]";
	}
	$config .= " -w $$LAMBDA[$#LAMBDA]";
    }

    my $info = $config;
    $info =~ s/ /_/g;

    print STDERR "$___PHARAOH $pharaoh_parameters $config -l $___PREFIX > $___PREFIX.trace.$info\n";
    open PHARAOH, "|$___PHARAOH $pharaoh_parameters $config -l $___PREFIX > $___PREFIX.trace.$info";
    for $f (@$DEV_F) {
      print PHARAOH $f, "\n";
    }
    close PHARAOH;
    
    print STDERR "extract n-best list from lattices\n";

    for(my $sentence=0;$sentence<$sentence_count;$sentence++) {
	my $sentence = sprintf("%04d",$sentence);
	`$___CARMEL -mk $___N_BEST_LIST_SIZE $___PREFIX.$sentence > $___PREFIX.best$___N_BEST_LIST_SIZE.$sentence.carmel`;
    }

    &get_n_best_list_using_carmel($DEV_F, $NBEST_LIST);

    # rescore the n-best list using the beamdecoder to get component scores
    print STDERR "score lattices with beamdecoder\n";
    print STDERR "$___PHARAOH $pharaoh_parameters -rd $___PREFIX.best$___N_BEST_LIST_SIZE $sentence_count\n";
    `$___PHARAOH $pharaoh_parameters $config -rd $___PREFIX.best$___N_BEST_LIST_SIZE $sentence_count`;

    &load_component_scores($SCORE);
}

sub get_n_best_list_using_carmel {
  my ($DEV_F,$NBEST_LIST) = @_;
  for(my $sentence=0;$sentence<$sentence_count;$sentence++) {
    my $sentence = sprintf("%04d",$sentence);
    print STDERR ".";
    print STDERR $sentence unless ($sentence % 100);

    open(OUT,">:raw","$___PREFIX.best$___N_BEST_LIST_SIZE.$sentence");
    my $foreign = ${$DEV_F}[$sentence];
    my @FOREIGN = split(/ /,$foreign);

    # get additional state information
    # (which foreign words are covered by transitions)
    my %WORDS_COVERED;
    open(STATE,"<:raw","$___PREFIX.$sentence.state");
    while(<STATE>) {
      chop;
      my ($state,$words_covered) = split;
      $WORDS_COVERED{$state} = $words_covered;
    }
    close(STATE);
    
    open FINAL, "<:raw", "$___PREFIX.$sentence";
    my $final = <FINAL>;
    close(FINAL);
    chop($final);

    my $words_covered = "";
    for(my $i=0;$i<=$#FOREIGN;$i++) { $words_covered .= "1"; }
    $WORDS_COVERED{$final} = $words_covered;
    
    # use n-best list generated by carmel,
    # and prepare as input for rescoring by the beam search decoder
    open(NBEST,"<:raw","$___PREFIX.best$___N_BEST_LIST_SIZE.$sentence.carmel");
    while(my $line = <NBEST>) {
      chop($line);
      last if $line eq "0";
      my $translation = "";
      my $out = "";
      my $words_covered = "";
      for(my $i=0;$i<=$#FOREIGN;$i++) { $words_covered .= "0"; }
      my $previous_last_foreign = -1;
      #while($line =~ /^\("(.*?)" : ".*?" \/ ([\d\+\-\.lne]+) -> (\d+)\) (.+)/) {
      while($line =~ /^\(\d+ -> (\d+) "(.+?)" : ".+?" \/ ([\d\+\-\.lne]+)\) (.+)/) {
	$line = $4;
	#my ($english,$p,$state) = ($1,$2,$3);    
	my ($english,$p,$state) = ($2,$3,$1);    
	$english =~ s/\\\"/\"/g;
	my $first_foreign = -1;
	my $last_foreign = -1;
	my $foreign = "";
	if (!defined($WORDS_COVERED{$state})) { 
	  print STDERR "could not find state $state\n"; exit; 
	}
	for(my $i=0;$i<=$#FOREIGN;$i++) { 
	  if (substr($words_covered,$i,1) ne substr($WORDS_COVERED{$state},$i,1)) {
	    $foreign .= " $FOREIGN[$i]";
	    $first_foreign = $i if $first_foreign == -1;
	    $last_foreign = $i;
	  }
	}
	if ($foreign eq "") {
	    print STDERR "no foreign words mapped (sentence $sentence:".scalar(@{$$NBEST_LIST[$sentence]}).")? 0..$#FOREIGN:\n$words_covered\n$WORDS_COVERED{$state}\n";
	}
	$foreign = substr($foreign,1);
	my $distortion = $first_foreign - $previous_last_foreign -1;
	$previous_last_foreign = $last_foreign;
	$words_covered = $WORDS_COVERED{$state};
	$out .= "E $english F $foreign D $distortion P $p ";
	$translation .= " " unless $translation eq '';
	$translation .= "$english";
      }
      if ($line !~ /^[\d\-\+\.lne]+$/) {
	print OUT "ERROR: !$line!\n"; last;
      }
      print OUT $out."T $line\n";
      push @{$$NBEST_LIST[$sentence]},$translation;
    }
    close(NBEST);
    close(OUT);
    `rm $___PREFIX.$sentence.state`;
    `rm $___PREFIX.$sentence`;
  }
  print STDERR "\n";
}

# munge output of decoder run in rescoring mode
# SCORE is array of lists of component scores (ln). Order matters.

sub load_component_scores {
  my ($SCORE) = @_;
  for(my $sentence=0;$sentence<$sentence_count;$sentence++) {
    $sentence = sprintf("%04d",$sentence);
    open(RESCORE,"<:raw","$___PREFIX.best$___N_BEST_LIST_SIZE.$sentence.rescore");
    while(<RESCORE>) {
      my $scores = "";
      /pD: ([^,]+),/;
      $scores .= $1;
      /pLM.?0?.?:( [^,]+),/;
      $scores .= $1;
      /pTM:( [^,]+),/;
      $scores .= $1;
      /pWP:( [^,]+),/;
      $scores .= $1;
      my @SCORES = split(/ /,$scores);
      push @{${$SCORE}[$sentence]},\@SCORES;
    }
    close(RESCORE);
    `rm $___PREFIX.best$___N_BEST_LIST_SIZE.$sentence`;
    `rm $___PREFIX.best$___N_BEST_LIST_SIZE.$sentence.rescore`;
    `rm $___PREFIX.best$___N_BEST_LIST_SIZE.$sentence.carmel`;
  }
  print STDERR "\n";
}


