00001 #include <iostream>
00002 #include <fstream>
00003 #include <sstream>
00004 #include <iomanip>
00005 #include <vector>
00006 #include <map>
00007 #include <stdlib.h>
00008 #include <math.h>
00009 #include <algorithm>
00010 #include <stdio.h>
00011 #include "moses/TrellisPathList.h"
00012 #include "moses/TrellisPath.h"
00013 #include "moses/StaticData.h"
00014 #include "moses/Util.h"
00015
00016 #include "mbr.h"
00017
00018 using namespace std ;
00019 using namespace Moses;
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
// Maximum n-gram order used when computing sentence-level BLEU (standard 4).
int BLEU_ORDER = 4;
// Smoothing constant added to matched/total counts for n-gram orders > 1,
// so a single zero-count order does not zero out the whole score.
int SMOOTH = 1;
// When non-zero, calculate_score dumps its per-component statistics to stderr.
int DEBUG = 0;
// NOTE(review): not referenced anywhere in this file — presumably a
// convergence/pruning threshold read by code elsewhere; confirm before removing.
float min_interval = 1e-4;
00040 void extract_ngrams(const vector<const Factor* >& sentence, map < vector < const Factor* >, int > & allngrams)
00041 {
00042 vector< const Factor* > ngram;
00043 for (int k = 0; k < BLEU_ORDER; k++) {
00044 for(int i =0; i < max((int)sentence.size()-k,0); i++) {
00045 for ( int j = i; j<= i+k; j++) {
00046 ngram.push_back(sentence[j]);
00047 }
00048 ++allngrams[ngram];
00049 ngram.clear();
00050 }
00051 }
00052 }
00053
00054 float calculate_score(const vector< vector<const Factor*> > & sents, int ref, int hyp, vector < map < vector < const Factor *>, int > > & ngram_stats )
00055 {
00056 int comps_n = 2*BLEU_ORDER+1;
00057 vector<int> comps(comps_n);
00058 float logbleu = 0.0, brevity;
00059
00060 int hyp_length = sents[hyp].size();
00061
00062 for (int i =0; i<BLEU_ORDER; i++) {
00063 comps[2*i] = 0;
00064 comps[2*i+1] = max(hyp_length-i,0);
00065 }
00066
00067 map< vector < const Factor * > ,int > & hyp_ngrams = ngram_stats[hyp] ;
00068 map< vector < const Factor * >, int > & ref_ngrams = ngram_stats[ref] ;
00069
00070 for (map< vector< const Factor * >, int >::iterator it = hyp_ngrams.begin();
00071 it != hyp_ngrams.end(); it++) {
00072 map< vector< const Factor * >, int >::iterator ref_it = ref_ngrams.find(it->first);
00073 if(ref_it != ref_ngrams.end()) {
00074 comps[2* (it->first.size()-1)] += min(ref_it->second,it->second);
00075 }
00076 }
00077 comps[comps_n-1] = sents[ref].size();
00078
00079 if (DEBUG) {
00080 for ( int i = 0; i < comps_n; i++)
00081 cerr << "Comp " << i << " : " << comps[i];
00082 }
00083
00084 for (int i=0; i<BLEU_ORDER; i++) {
00085 if (comps[0] == 0)
00086 return 0.0;
00087 if ( i > 0 )
00088 logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
00089 else
00090 logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
00091 }
00092 logbleu /= BLEU_ORDER;
00093 brevity = 1.0-(float)comps[comps_n-1]/comps[1];
00094 if (brevity < 0.0)
00095 logbleu += brevity;
00096 return exp(logbleu);
00097 }
00098
00099 vector<const Factor*> doMBR(const TrellisPathList& nBestList)
00100 {
00101
00102 float marginal = 0;
00103
00104 vector<float> joint_prob_vec;
00105 vector< vector<const Factor*> > translations;
00106 float joint_prob;
00107 vector< map < vector <const Factor *>, int > > ngram_stats;
00108
00109 TrellisPathList::const_iterator iter;
00110
00111 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00112 const TrellisPath &path = **iter;
00113 joint_prob = UntransformScore(StaticData::Instance().GetMBRScale() * path.GetScoreBreakdown().GetWeightedScore());
00114 marginal += joint_prob;
00115 joint_prob_vec.push_back(joint_prob);
00116
00117 map < vector < const Factor *>, int > counts;
00118 vector<const Factor*> translation;
00119 GetOutputFactors(path, translation);
00120
00121
00122 extract_ngrams(translation,counts);
00123 ngram_stats.push_back(counts);
00124 translations.push_back(translation);
00125 }
00126
00127 vector<float> mbr_loss;
00128 float bleu, weightedLoss;
00129 float weightedLossCumul = 0;
00130 float minMBRLoss = 1000000;
00131 int minMBRLossIdx = -1;
00132
00133
00134 for (size_t i = 0; i < nBestList.GetSize(); i++) {
00135 weightedLossCumul = 0;
00136 for (size_t j = 0; j < nBestList.GetSize(); j++) {
00137 if ( i != j) {
00138 bleu = calculate_score(translations, j, i,ngram_stats );
00139 weightedLoss = ( 1 - bleu) * ( joint_prob_vec[j]/marginal);
00140 weightedLossCumul += weightedLoss;
00141 if (weightedLossCumul > minMBRLoss)
00142 break;
00143 }
00144 }
00145 if (weightedLossCumul < minMBRLoss) {
00146 minMBRLoss = weightedLossCumul;
00147 minMBRLossIdx = i;
00148 }
00149 }
00150
00151 return translations[minMBRLossIdx];
00152 }
00153
00154 void GetOutputFactors(const TrellisPath &path, vector <const Factor*> &translation)
00155 {
00156 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00157 const std::vector<FactorType>& outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
00158 assert (outputFactorOrder.size() == 1);
00159
00160
00161 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00162 const Hypothesis &edge = *edges[currEdge];
00163 const Phrase &phrase = edge.GetCurrTargetPhrase();
00164 size_t size = phrase.GetSize();
00165 for (size_t pos = 0 ; pos < size ; pos++) {
00166
00167 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
00168 translation.push_back(factor);
00169 }
00170 }
00171 }
00172