00001 #include <iostream>
00002 #include <fstream>
00003 #include <sstream>
00004 #include <iomanip>
00005 #include <vector>
00006 #include <map>
00007 #include <cstdlib>
00008 #include <cmath>
00009 #include <algorithm>
00010 #include <cstdio>
00011 #include "moses/TrellisPathList.h"
00012 #include "moses/TrellisPath.h"
00013
00014 #include "moses/Util.h"
00015 #include "mbr.h"
00016
00017 using namespace std ;
00018 using namespace Moses;
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
// Maximum n-gram order used by the sentence-level BLEU approximation below.
int BLEU_ORDER = 4;
// Additive smoothing constant applied to n-gram precision counts of order > 1
// (order-1 counts are left unsmoothed; see calculate_score).
int SMOOTH = 1;
// NOTE(review): not referenced anywhere in this file — presumably a
// convergence/stopping threshold used by code elsewhere; confirm before removing.
float min_interval = 1e-4;
00038 void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
00039 {
00040 vector< const Factor* > ngram;
00041 for (int k = 0; k < BLEU_ORDER; k++) {
00042 for(int i =0; i < max((int)sentence.size()-k,0); i++) {
00043 for ( int j = i; j<= i+k; j++) {
00044 ngram.push_back(sentence[j]);
00045 }
00046 ++allngrams[ngram];
00047 ngram.clear();
00048 }
00049 }
00050 }
00051
00052 float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
00053 {
00054 int comps_n = 2*BLEU_ORDER+1;
00055 vector<int> comps(comps_n);
00056 float logbleu = 0.0, brevity;
00057
00058 int hyp_length = sents[hyp].size();
00059
00060 for (int i =0; i<BLEU_ORDER; i++) {
00061 comps[2*i] = 0;
00062 comps[2*i+1] = max(hyp_length-i,0);
00063 }
00064
00065 map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
00066 map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
00067
00068 for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
00069 it != hyp_ngrams.end(); it++) {
00070 map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
00071 if(ref_it != ref_ngrams.end()) {
00072 comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
00073 }
00074 }
00075 comps[comps_n-1] = sents[ref].size();
00076
00077 for (int i=0; i<BLEU_ORDER; i++) {
00078 if (comps[0] == 0)
00079 return 0.0;
00080 if ( i > 0 )
00081 logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
00082 else
00083 logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
00084 }
00085 logbleu /= BLEU_ORDER;
00086 brevity = 1.0-(float)comps[comps_n-1]/comps[1];
00087 if (brevity < 0.0)
00088 logbleu += brevity;
00089 return exp(logbleu);
00090 }
00091
00092 const TrellisPath doMBR(const TrellisPathList& nBestList, AllOptions const& opts)
00093 {
00094 float marginal = 0;
00095 float mbr_scale = opts.mbr.scale;
00096 vector<float> joint_prob_vec;
00097 vector< vector<const Factor*> > translations;
00098 float joint_prob;
00099 vector< map < vector <const Factor *>, int > > ngram_stats;
00100
00101 TrellisPathList::const_iterator iter;
00102
00103
00104 float maxScore = -1e20;
00105 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00106 const TrellisPath &path = **iter;
00107 float score = mbr_scale * path.GetScoreBreakdown()->GetWeightedScore();
00108 if (maxScore < score) maxScore = score;
00109 }
00110
00111 vector<FactorType> const& oFactors = opts.output.factor_order;
00112 UTIL_THROW_IF2(oFactors.size() != 1, "Need exactly one output factor!");
00113 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00114 const TrellisPath &path = **iter;
00115 joint_prob = UntransformScore(mbr_scale * path.GetScoreBreakdown()->GetWeightedScore() - maxScore);
00116 marginal += joint_prob;
00117 joint_prob_vec.push_back(joint_prob);
00118
00119
00120 vector<const Factor*> translation;
00121 GetOutputFactors(path, oFactors[0], translation);
00122
00123
00124 map < vector < const Factor *>, int > counts;
00125 extract_ngrams(translation,counts);
00126
00127 ngram_stats.push_back(counts);
00128 translations.push_back(translation);
00129 }
00130
00131 vector<float> mbr_loss;
00132 float bleu, weightedLoss;
00133 float weightedLossCumul = 0;
00134 float minMBRLoss = 1000000;
00135 int minMBRLossIdx = -1;
00136
00137
00138 iter = nBestList.begin();
00139 for (unsigned int i = 0; i < nBestList.GetSize(); i++) {
00140 weightedLossCumul = 0;
00141 for (unsigned int j = 0; j < nBestList.GetSize(); j++) {
00142 if ( i != j) {
00143 bleu = calculate_score(translations, j, i,ngram_stats );
00144 weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
00145 weightedLossCumul += weightedLoss;
00146 if (weightedLossCumul > minMBRLoss)
00147 break;
00148 }
00149 }
00150 if (weightedLossCumul < minMBRLoss) {
00151 minMBRLoss = weightedLossCumul;
00152 minMBRLossIdx = i;
00153 }
00154 iter++;
00155 }
00156
00157 return nBestList.at(minMBRLossIdx);
00158
00159 }
00160
00161 void
00162 GetOutputFactors(const TrellisPath &path, FactorType const oFactor,
00163 vector <const Factor*> &translation)
00164 {
00165 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00166
00167
00168 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00169 const Hypothesis &edge = *edges[currEdge];
00170 const Phrase &phrase = edge.GetCurrTargetPhrase();
00171 size_t size = phrase.GetSize();
00172 for (size_t pos = 0 ; pos < size ; pos++) {
00173 const Factor *factor = phrase.GetFactor(pos, oFactor);
00174 translation.push_back(factor);
00175 }
00176 }
00177 }
00178