00001 #include <iostream>
00002 #include <fstream>
00003 #include <sstream>
00004 #include <iomanip>
00005 #include <vector>
00006 #include <map>
00007 #include <cstdlib>
00008 #include <cmath>
00009 #include <algorithm>
00010 #include <cstdio>
00011 #include "moses/TrellisPathList.h"
00012 #include "moses/TrellisPath.h"
00013
00014 #include "moses/Util.h"
00015 #include "mbr.h"
00016
00017 using namespace std ;
00018 using namespace Moses;
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
// Maximum n-gram order used by the sentence-level BLEU approximation below.
int BLEU_ORDER = 4;
// Additive smoothing constant applied to n-gram precision counts of order > 1
// (order-1 counts are left unsmoothed; see calculate_score).
int SMOOTH = 1;
// NOTE(review): not referenced anywhere in this file — presumably a
// convergence/stopping threshold used by code elsewhere; confirm before removing.
float min_interval = 1e-4;
00038 void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
00039 {
00040 vector< const Factor* > ngram;
00041 for (int k = 0; k < BLEU_ORDER; k++) {
00042 for(int i =0; i < max((int)sentence.size()-k,0); i++) {
00043 for ( int j = i; j<= i+k; j++) {
00044 ngram.push_back(sentence[j]);
00045 }
00046 ++allngrams[ngram];
00047 ngram.clear();
00048 }
00049 }
00050 }
00051
00052 float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
00053 {
00054 int comps_n = 2*BLEU_ORDER+1;
00055 vector<int> comps(comps_n);
00056 float logbleu = 0.0, brevity;
00057
00058 int hyp_length = sents[hyp].size();
00059
00060 for (int i =0; i<BLEU_ORDER; i++) {
00061 comps[2*i] = 0;
00062 comps[2*i+1] = max(hyp_length-i,0);
00063 }
00064
00065 map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
00066 map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
00067
00068 for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
00069 it != hyp_ngrams.end(); it++) {
00070 map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
00071 if(ref_it != ref_ngrams.end()) {
00072 comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
00073 }
00074 }
00075 comps[comps_n-1] = sents[ref].size();
00076
00077 for (int i=0; i<BLEU_ORDER; i++) {
00078 if (comps[0] == 0)
00079 return 0.0;
00080 if ( i > 0 )
00081 logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
00082 else
00083 logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
00084 }
00085 logbleu /= BLEU_ORDER;
00086 brevity = 1.0-(float)comps[comps_n-1]/comps[1];
00087 if (brevity < 0.0)
00088 logbleu += brevity;
00089 return exp(logbleu);
00090 }
00091
00092 const TrellisPath doMBR(const TrellisPathList& nBestList, AllOptions const& opts)
00093 {
00094 float marginal = 0;
00095 float mbr_scale = opts.mbr.scale;
00096 vector<float> joint_prob_vec;
00097 vector< vector<const Factor*> > translations;
00098 float joint_prob;
00099 vector< map < vector <const Factor *>, int > > ngram_stats;
00100
00101 TrellisPathList::const_iterator iter;
00102
00103
00104 float maxScore = -1e20;
00105 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00106 const TrellisPath &path = **iter;
00107 float score = mbr_scale * path.GetScoreBreakdown()->GetWeightedScore();
00108 if (maxScore < score) maxScore = score;
00109 }
00110
00111 vector<FactorType> const& oFactors = opts.output.factor_order;
00112 UTIL_THROW_IF2(oFactors.size() != 1, "Need exactly one output factor!");
00113 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00114 const TrellisPath &path = **iter;
00115 joint_prob = UntransformScore(mbr_scale * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
00116 marginal += joint_prob;
00117 joint_prob_vec.push_back(joint_prob);
00118
00119
00120 vector<const Factor*> translation;
00121 GetOutputFactors(path, oFactors[0], translation);
00122
00123
00124 map < vector < const Factor *>, int > counts;
00125 extract_ngrams(translation,counts);
00126
00127 ngram_stats.push_back(counts);
00128 translations.push_back(translation);
00129 }
00130
00131 vector<float> mbr_loss;
00132 float bleu, weightedLoss;
00133 float weightedLossCumul = 0;
00134 float minMBRLoss = 1000000;
00135 int minMBRLossIdx = -1;
00136
00137
00138 iter = nBestList.begin();
00139 for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
00140 weightedLossCumul = 0;
00141 for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
00142 if ( i != j) {
00143 bleu = calculate_score(translations, j, i,ngram_stats );
00144 weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
00145 weightedLossCumul += weightedLoss;
00146 if (weightedLossCumul > minMBRLoss)
00147 break;
00148 }
00149 }
00150 if (weightedLossCumul < minMBRLoss) {
00151 minMBRLoss = weightedLossCumul;
00152 minMBRLossIdx = i;
00153 }
00154 iter++;
00155 }
00156
00157 return nBestList.at(minMBRLossIdx);
00158
00159 }
00160
00161 void
00162 GetOutputFactors(const TrellisPath &path, FactorType const oFactor,
00163 vector <const Factor*> &translation)
00164 {
00165 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00166
00167
00168 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00169 const Hypothesis &edge = *edges[currEdge];
00170 const Phrase &phrase = edge.GetCurrTargetPhrase();
00171 size_t size = phrase.GetSize();
00172 for (size_t pos = 0 ; pos < size ; pos++) {
00173 const Factor *factor = phrase.GetFactor(pos, oFactor);
00174 translation.push_back(factor);
00175 }
00176 }
00177 }
00178