00001 #include <iostream>
00002 #include <fstream>
00003 #include <sstream>
00004 #include <iomanip>
00005 #include <vector>
00006 #include <map>
00007 #include <stdlib.h>
00008 #include <math.h>
00009 #include <algorithm>
00010 #include <stdio.h>
00011 #include "moses/TrellisPathList.h"
00012 #include "moses/TrellisPath.h"
00013 #include "moses/StaticData.h"
00014 #include "moses/Util.h"
00015
00016 #include "mbr.h"
00017
00018 using namespace std ;
00019 using namespace Moses;
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
// Maximum n-gram order used when computing sentence-level BLEU (standard 4).
int BLEU_ORDER = 4;
// Smoothing constant added to matched/total counts for n-gram orders > 1,
// so a single zero-count order does not zero out the whole score.
int SMOOTH = 1;
// When non-zero, calculate_score dumps its per-component statistics to stderr.
int DEBUG = 0;
// NOTE(review): not referenced anywhere in this file — presumably a
// convergence/pruning threshold read by code elsewhere; confirm before removing.
float min_interval = 1e-4;
00040 void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
00041 {
00042 vector< const Factor* > ngram;
00043 for (int k = 0; k < BLEU_ORDER; k++) {
00044 for(int i =0; i < max((int)sentence.size()-k,0); i++) {
00045 for ( int j = i; j<= i+k; j++) {
00046 ngram.push_back(sentence[j]);
00047 }
00048 ++allngrams[ngram];
00049 ngram.clear();
00050 }
00051 }
00052 }
00053
00054 float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
00055 {
00056 int comps_n = 2*BLEU_ORDER+1;
00057 vector<int> comps(comps_n);
00058 float logbleu = 0.0, brevity;
00059
00060 int hyp_length = sents[hyp].size();
00061
00062 for (int i =0; i<BLEU_ORDER; i++) {
00063 comps[2*i] = 0;
00064 comps[2*i+1] = max(hyp_length-i,0);
00065 }
00066
00067 map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
00068 map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
00069
00070 for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
00071 it != hyp_ngrams.end(); it++) {
00072 map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
00073 if(ref_it != ref_ngrams.end()) {
00074 comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
00075 }
00076 }
00077 comps[comps_n-1] = sents[ref].size();
00078
00079 if (DEBUG) {
00080 for ( int i = 0; i < comps_n; i++)
00081 cerr << "Comp " << i << " : " << comps[i];
00082 }
00083
00084 for (int i=0; i<BLEU_ORDER; i++) {
00085 if (comps[0] == 0)
00086 return 0.0;
00087 if ( i > 0 )
00088 logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
00089 else
00090 logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
00091 }
00092 logbleu /= BLEU_ORDER;
00093 brevity = 1.0-(float)comps[comps_n-1]/comps[1];
00094 if (brevity < 0.0)
00095 logbleu += brevity;
00096 return exp(logbleu);
00097 }
00098
00099 vector<const Factor*> doMBR(const TrellisPathList& nBestList)
00100 {
00101
00102 float marginal = 0;
00103
00104 vector<float> joint_prob_vec;
00105 vector< vector<const Factor*> > translations;
00106 float joint_prob;
00107 vector< map < vector <const Factor *>, int > > ngram_stats;
00108
00109 TrellisPathList::const_iterator iter;
00110
00111 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00112 const TrellisPath &path = **iter;
00113 joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore());
00114 marginal += joint_prob;
00115 joint_prob_vec.push_back(joint_prob);
00116
00117 map < vector < const Factor *>, int > counts;
00118 vector<const Factor*> translation;
00119 GetOutputFactors(path, translation);
00120
00121
00122 extract_ngrams(translation,counts);
00123 ngram_stats.push_back(counts);
00124 translations.push_back(translation);
00125 }
00126
00127 vector<float> mbr_loss;
00128 float bleu, weightedLoss;
00129 float weightedLossCumul = 0;
00130 float minMBRLoss = 1000000;
00131 int minMBRLossIdx = -1;
00132
00133
00134 for (size_t i = 0; i < nBestList.GetSize(); i++) {
00135 weightedLossCumul = 0;
00136 for (size_t j = 0; j < nBestList.GetSize(); j++) {
00137 if ( i != j) {
00138 bleu = calculate_score(translations, j, i,ngram_stats );
00139 weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
00140 weightedLossCumul += weightedLoss;
00141 if (weightedLossCumul > minMBRLoss)
00142 break;
00143 }
00144 }
00145 if (weightedLossCumul < minMBRLoss) {
00146 minMBRLoss = weightedLossCumul;
00147 minMBRLossIdx = i;
00148 }
00149 }
00150
00151 return translations[minMBRLossIdx];
00152 }
00153
00154 void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
00155 {
00156 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00157 const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
00158 assert (outputFactorOrder.size() == 1);
00159
00160
00161 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00162 const Hypothesis &edge = *edges[currEdge];
00163 const Phrase &phrase = edge.GetCurrTargetPhrase();
00164 size_t size = phrase.GetSize();
00165 for (size_t pos = 0 ; pos < size ; pos++) {
00166
00167 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
00168 translation.push_back(factor);
00169 }
00170 }
00171 }
00172