00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include "LatticeMBR.h"
00011 #include "StaticData.h"
00012 #include <algorithm>
00013 #include <set>
00014
00015 using namespace std;
00016
00017 size_t bleu_order = 4;
00018 float UNKNGRAMLOGPROB = -20;
00019 void GetOutputWords(const TrellisPath &path, vector <Word> &translation)
00020 {
00021 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00022
00023
00024 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00025 const Hypothesis &edge = *edges[currEdge];
00026 const Phrase &phrase = edge.GetCurrTargetPhrase();
00027 size_t size = phrase.GetSize();
00028 for (size_t pos = 0 ; pos < size ; pos++) {
00029 translation.push_back(phrase.GetWord(pos));
00030 }
00031 }
00032 }
00033
00034
00035 void extract_ngrams(const vector<Word >& sentence, map < Phrase, int > & allngrams)
00036 {
00037 for (int k = 0; k < (int)bleu_order; k++) {
00038 for(int i =0; i < max((int)sentence.size()-k,0); i++) {
00039 Phrase ngram( k+1);
00040 for ( int j = i; j<= i+k; j++) {
00041 ngram.AddWord(sentence[j]);
00042 }
00043 ++allngrams[ngram];
00044 }
00045 }
00046 }
00047
00048
00049
00050 void NgramScores::addScore(const Hypothesis* node, const Phrase& ngram, float score)
00051 {
00052 set<Phrase>::const_iterator ngramIter = m_ngrams.find(ngram);
00053 if (ngramIter == m_ngrams.end()) {
00054 ngramIter = m_ngrams.insert(ngram).first;
00055 }
00056 map<const Phrase*,float>& ngramScores = m_scores[node];
00057 map<const Phrase*,float>::iterator scoreIter = ngramScores.find(&(*ngramIter));
00058 if (scoreIter == ngramScores.end()) {
00059 ngramScores[&(*ngramIter)] = score;
00060 } else {
00061 ngramScores[&(*ngramIter)] = log_sum(score,scoreIter->second);
00062 }
00063 }
00064
00065 NgramScores::NodeScoreIterator NgramScores::nodeBegin(const Hypothesis* node)
00066 {
00067 return m_scores[node].begin();
00068 }
00069
00070
00071 NgramScores::NodeScoreIterator NgramScores::nodeEnd(const Hypothesis* node)
00072 {
00073 return m_scores[node].end();
00074 }
00075
00076 LatticeMBRSolution::LatticeMBRSolution(const TrellisPath& path, bool isMap) :
00077 m_score(0.0f)
00078 {
00079 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00080
00081 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00082 const Hypothesis &edge = *edges[currEdge];
00083 const Phrase &phrase = edge.GetCurrTargetPhrase();
00084 size_t size = phrase.GetSize();
00085 for (size_t pos = 0 ; pos < size ; pos++) {
00086 m_words.push_back(phrase.GetWord(pos));
00087 }
00088 }
00089 if (isMap) {
00090 m_mapScore = path.GetTotalScore();
00091 } else {
00092 m_mapScore = 0;
00093 }
00094 }
00095
00096
00097 void LatticeMBRSolution::CalcScore(map<Phrase, float>& finalNgramScores, const vector<float>& thetas, float mapWeight)
00098 {
00099 m_ngramScores.assign(thetas.size()-1, -10000);
00100
00101 map < Phrase, int > counts;
00102 extract_ngrams(m_words,counts);
00103
00104
00105 m_score = thetas[0] * m_words.size();
00106
00107
00108 for (map < Phrase, int >::iterator ngrams = counts.begin(); ngrams != counts.end(); ++ngrams) {
00109 float ngramPosterior = UNKNGRAMLOGPROB;
00110 map<Phrase,float>::const_iterator ngramPosteriorIt = finalNgramScores.find(ngrams->first);
00111 if (ngramPosteriorIt != finalNgramScores.end()) {
00112 ngramPosterior = ngramPosteriorIt->second;
00113 }
00114 size_t ngramSize = ngrams->first.GetSize();
00115 m_ngramScores[ngramSize-1] = log_sum(log((float)ngrams->second) + ngramPosterior,m_ngramScores[ngramSize-1]);
00116 }
00117
00118
00119 for (size_t i = 0; i < m_ngramScores.size(); ++i) {
00120 m_ngramScores[i] = exp(m_ngramScores[i]);
00121 m_score += thetas[i+1] * m_ngramScores[i];
00122 }
00123
00124
00125
00126 m_score += m_mapScore*mapWeight;
00127 }
00128
00129
00130 void pruneLatticeFB(Lattice & connectedHyp, map < const Hypothesis*, set <const Hypothesis* > > & outgoingHyps, map<const Hypothesis*, vector<Edge> >& incomingEdges,
00131 const vector< float> & estimatedScores, const Hypothesis* bestHypo, size_t edgeDensity, float scale)
00132 {
00133
00134
00135 VERBOSE(2,"Pruning lattice to edge density " << edgeDensity << endl);
00136 const Hypothesis* emptyHyp = connectedHyp.at(0);
00137 while (emptyHyp->GetId() != 0) {
00138 emptyHyp = emptyHyp->GetPrevHypo();
00139 }
00140 connectedHyp.push_back(emptyHyp);
00141
00142
00143 for (size_t i = 0; i < connectedHyp.size(); ++i) {
00144 if (connectedHyp[i]->GetId() > 0 && connectedHyp[i]->GetPrevHypo()->GetId() == 0)
00145 outgoingHyps[emptyHyp].insert(connectedHyp[i]);
00146 }
00147
00148
00149 multimap<float, const Hypothesis*> sortHypsByVal;
00150 for (size_t i =0; i < estimatedScores.size(); ++i) {
00151 sortHypsByVal.insert(make_pair(estimatedScores[i], connectedHyp[i]));
00152 }
00153
00154 multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end();
00155 float bestScore = it->first;
00156
00157 sortHypsByVal.insert(make_pair(bestScore, emptyHyp));
00158
00159
00160 IFVERBOSE(3) {
00161 for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
00162 const Hypothesis* currHyp = it->second;
00163 cerr << "Hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl;
00164 }
00165 }
00166
00167
00168 set <const Hypothesis*> survivingHyps;
00169
00170 VERBOSE(2, "BEST HYPO TARGET LENGTH : " << bestHypo->GetSize() << endl)
00171 size_t numEdgesTotal = edgeDensity * bestHypo->GetSize();
00172 size_t numEdgesCreated = 0;
00173 VERBOSE(2, "Target edge count: " << numEdgesTotal << endl);
00174
00175 float prevScore = -999999;
00176
00177
00178 for (multimap<float, const Hypothesis*>::const_iterator it = --sortHypsByVal.end(); it != --sortHypsByVal.begin(); --it) {
00179 float currEstimatedScore = it->first;
00180 const Hypothesis* currHyp = it->second;
00181
00182 if (numEdgesCreated >= numEdgesTotal && prevScore > currEstimatedScore)
00183 break;
00184
00185 prevScore = currEstimatedScore;
00186 VERBOSE(3, "Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
00187 VERBOSE(3, "Considering hyp " << currHyp->GetId() << ", estimated score: " << it->first << endl)
00188
00189 survivingHyps.insert(currHyp);
00190
00191
00192 if (survivingHyps.find(currHyp->GetPrevHypo()) != survivingHyps.end()) {
00193 vector <Edge>& edges = incomingEdges[currHyp];
00194 Edge winningEdge(currHyp->GetPrevHypo(),currHyp,scale*(currHyp->GetScore() - currHyp->GetPrevHypo()->GetScore()),currHyp->GetCurrTargetPhrase());
00195 edges.push_back(winningEdge);
00196 ++numEdgesCreated;
00197 }
00198
00199
00200 const ArcList *arcList = currHyp->GetArcList();
00201 if (arcList != NULL) {
00202 ArcList::const_iterator iterArcList;
00203 for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
00204 const Hypothesis *loserHypo = *iterArcList;
00205 const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
00206 if (survivingHyps.find(loserPrevHypo) != survivingHyps.end()) {
00207 double arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
00208 Edge losingEdge(loserPrevHypo, currHyp, arcScore*scale, loserHypo->GetCurrTargetPhrase());
00209 vector <Edge>& edges = incomingEdges[currHyp];
00210 edges.push_back(losingEdge);
00211 ++numEdgesCreated;
00212 }
00213 }
00214 }
00215
00216
00217 map < const Hypothesis*, set < const Hypothesis* > >::const_iterator outgoingIt = outgoingHyps.find(currHyp);
00218
00219 if (outgoingIt != outgoingHyps.end()) {
00220 const set<const Hypothesis*> & outHyps = outgoingIt->second;
00221 for (set<const Hypothesis*>::const_iterator outHypIts = outHyps.begin(); outHypIts != outHyps.end(); ++outHypIts) {
00222 const Hypothesis* succHyp = *outHypIts;
00223
00224 if (survivingHyps.find(succHyp) == survivingHyps.end())
00225 continue;
00226
00227
00228 if (succHyp->GetPrevHypo() == currHyp) {
00229 vector <Edge>& succEdges = incomingEdges[succHyp];
00230 Edge succWinningEdge(currHyp, succHyp, scale*(succHyp->GetScore() - currHyp->GetScore()), succHyp->GetCurrTargetPhrase());
00231 succEdges.push_back(succWinningEdge);
00232 survivingHyps.insert(succHyp);
00233 ++numEdgesCreated;
00234 }
00235
00236
00237 const ArcList *arcList = succHyp->GetArcList();
00238 if (arcList != NULL) {
00239 ArcList::const_iterator iterArcList;
00240
00241 for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
00242 const Hypothesis *loserHypo = *iterArcList;
00243 const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
00244 if (loserPrevHypo == currHyp) {
00245 vector <Edge>& succEdges = incomingEdges[succHyp];
00246 double arcScore = loserHypo->GetScore() - currHyp->GetScore();
00247 Edge losingEdge(currHyp, succHyp,scale* arcScore, loserHypo->GetCurrTargetPhrase());
00248 succEdges.push_back(losingEdge);
00249 ++numEdgesCreated;
00250 }
00251 }
00252 }
00253 }
00254 }
00255 }
00256
00257 connectedHyp.clear();
00258 for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
00259 connectedHyp.push_back(*it);
00260 }
00261
00262 VERBOSE(2, "Done! Num edges created : "<< numEdgesCreated << ", numEdges wanted " << numEdgesTotal << endl)
00263
00264 IFVERBOSE(3) {
00265 cerr << "Surviving hyps: " ;
00266 for (set <const Hypothesis*>::iterator it = survivingHyps.begin(); it != survivingHyps.end(); ++it) {
00267 cerr << (*it)->GetId() << " ";
00268 }
00269 cerr << endl;
00270 }
00271
00272
00273 }
00274
00275 void calcNgramExpectations(Lattice & connectedHyp, map<const Hypothesis*, vector<Edge> >& incomingEdges,
00276 map<Phrase, float>& finalNgramScores, bool posteriors)
00277 {
00278
00279 sort(connectedHyp.begin(),connectedHyp.end(),ascendingCoverageCmp);
00280
00281
00282
00283
00284
00285
00286
00287
00288
00289
00290
00291 map<const Hypothesis*, float> forwardScore;
00292 forwardScore[connectedHyp[0]] = 0.0f;
00293 set< const Hypothesis *> finalHyps;
00294
00295 NgramScores ngramScores;
00296
00297 for (size_t i = 1; i < connectedHyp.size(); ++i) {
00298 const Hypothesis* currHyp = connectedHyp[i];
00299 if (currHyp->GetWordsBitmap().IsComplete()) {
00300 finalHyps.insert(currHyp);
00301 }
00302
00303 VERBOSE(3, "Processing hyp: " << currHyp->GetId() << ", num words cov= " << currHyp->GetWordsBitmap().GetNumWordsCovered() << endl)
00304
00305 vector <Edge> & edges = incomingEdges[currHyp];
00306 for (size_t e = 0; e < edges.size(); ++e) {
00307 const Edge& edge = edges[e];
00308 if (forwardScore.find(currHyp) == forwardScore.end()) {
00309 forwardScore[currHyp] = forwardScore[edge.GetTailNode()] + edge.GetScore();
00310 VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] = fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
00311 } else {
00312 forwardScore[currHyp] = log_sum(forwardScore[currHyp], forwardScore[edge.GetTailNode()] + edge.GetScore());
00313 VERBOSE(3, "Fwd score["<<currHyp->GetId()<<"] += fwdScore["<<edge.GetTailNode()->GetId() << "] + edge Score: " << edge.GetScore() << endl)
00314 }
00315 }
00316
00317
00318 for (size_t j =0 ; j < edges.size(); ++j) {
00319 Edge& edge = edges[j];
00320 const NgramHistory & incomingPhrases = edge.GetNgrams(incomingEdges);
00321
00322
00323 for (NgramHistory::const_iterator it = incomingPhrases.begin(); it != incomingPhrases.end(); ++it) {
00324 const Phrase& ngram = it->first;
00325 const PathCounts& pathCounts = it->second;
00326 VERBOSE(4, "Calculating score for: " << it->first << endl)
00327
00328 for (PathCounts::const_iterator pathCountIt = pathCounts.begin(); pathCountIt != pathCounts.end(); ++pathCountIt) {
00329
00330 const Path& path = pathCountIt->first;
00331
00332 float score = forwardScore[path[0]->GetTailNode()];
00333 for (size_t i = 0; i < path.size(); ++i) {
00334 score += path[i]->GetScore();
00335 }
00336
00337
00338 size_t count = posteriors ? 1 : pathCountIt->second;
00339 for (size_t k = 0; k < count; ++k) {
00340 ngramScores.addScore(currHyp,ngram,score);
00341 }
00342 }
00343 }
00344
00345
00346 for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(edge.GetTailNode());
00347 it != ngramScores.nodeEnd(edge.GetTailNode()); ++it) {
00348 const Phrase & currNgram = *(it->first);
00349 float currNgramScore = it->second;
00350 VERBOSE(4, "Calculating score for: " << currNgram << endl)
00351
00352
00353 if (!posteriors || incomingPhrases.find(currNgram) == incomingPhrases.end()) {
00354 float score = edge.GetScore() + currNgramScore;
00355 ngramScores.addScore(currHyp,currNgram,score);
00356 }
00357 }
00358
00359 }
00360 }
00361
00362 float Z = 9999999;
00363
00364
00365 for (set< const Hypothesis *>::iterator finalHyp = finalHyps.begin(); finalHyp != finalHyps.end(); ++finalHyp) {
00366 const Hypothesis* hyp = *finalHyp;
00367
00368 for (NgramScores::NodeScoreIterator it = ngramScores.nodeBegin(hyp); it != ngramScores.nodeEnd(hyp); ++it) {
00369 const Phrase& ngram = *(it->first);
00370 if (finalNgramScores.find(ngram) == finalNgramScores.end()) {
00371 finalNgramScores[ngram] = it->second;
00372 } else {
00373 finalNgramScores[ngram] = log_sum(it->second, finalNgramScores[ngram]);
00374 }
00375 }
00376
00377 if (Z == 9999999) {
00378 Z = forwardScore[hyp];
00379 } else {
00380 Z = log_sum(Z, forwardScore[hyp]);
00381 }
00382 }
00383
00384
00385
00386 for (map<Phrase, float>::iterator finalScoresIt = finalNgramScores.begin(); finalScoresIt != finalNgramScores.end(); ++finalScoresIt) {
00387 finalScoresIt->second = finalScoresIt->second - Z;
00388 IFVERBOSE(2) {
00389 VERBOSE(2,finalScoresIt->first << " [" << finalScoresIt->second << "]" << endl);
00390 }
00391 }
00392
00393 }
00394
00395 const NgramHistory& Edge::GetNgrams(map<const Hypothesis*, vector<Edge> > & incomingEdges)
00396 {
00397
00398 if (m_ngrams.size() > 0)
00399 return m_ngrams;
00400
00401 const Phrase& currPhrase = GetWords();
00402
00403 for (size_t start = 0; start < currPhrase.GetSize(); ++start) {
00404 for (size_t end = start; end < start + bleu_order; ++end) {
00405 if (end < currPhrase.GetSize()) {
00406 Phrase edgeNgram(end-start+1);
00407 for (size_t index = start; index <= end; ++index) {
00408 edgeNgram.AddWord(currPhrase.GetWord(index));
00409 }
00410
00411 vector<const Edge*> edgeHistory;
00412 edgeHistory.push_back(this);
00413 storeNgramHistory(edgeNgram, edgeHistory);
00414 } else {
00415 break;
00416 }
00417 }
00418 }
00419
00420 map<const Hypothesis*, vector<Edge> >::iterator it = incomingEdges.find(m_tailNode);
00421 if (it != incomingEdges.end()) {
00422 vector<Edge> & inEdges = it->second;
00423
00424 for (vector<Edge>::iterator edge = inEdges.begin(); edge != inEdges.end(); ++edge) {
00425 const NgramHistory & edgeIncomingNgrams = edge->GetNgrams(incomingEdges);
00426 for (NgramHistory::const_iterator edgeInNgramHist = edgeIncomingNgrams.begin(); edgeInNgramHist != edgeIncomingNgrams.end(); ++edgeInNgramHist) {
00427 const Phrase& edgeIncomingNgram = edgeInNgramHist->first;
00428 const PathCounts & edgeIncomingNgramPaths = edgeInNgramHist->second;
00429 size_t back = min(edgeIncomingNgram.GetSize(), edge->GetWordsSize());
00430 const Phrase& edgeWords = edge->GetWords();
00431 IFVERBOSE(3) {
00432 cerr << "Edge: "<< *edge <<endl;
00433 cerr << "edgeWords: " << edgeWords << endl;
00434 cerr << "edgeInNgram: " << edgeIncomingNgram << endl;
00435 }
00436
00437 Phrase edgeSuffix(ARRAY_SIZE_INCR);
00438 Phrase ngramSuffix(ARRAY_SIZE_INCR);
00439 GetPhraseSuffix(edgeWords,back,edgeSuffix);
00440 GetPhraseSuffix(edgeIncomingNgram,back,ngramSuffix);
00441
00442 if (ngramSuffix == edgeSuffix) {
00443 size_t edgeInNgramSize = edgeIncomingNgram.GetSize();
00444
00445 for (size_t i = 0; i < GetWordsSize() && i + edgeInNgramSize < bleu_order ; ++i) {
00446 Phrase newNgram(edgeIncomingNgram);
00447 for (size_t j = 0; j <= i ; ++j) {
00448 newNgram.AddWord(GetWords().GetWord(j));
00449 }
00450 VERBOSE(3, "Inserting New Phrase : " << newNgram << endl)
00451
00452 for (PathCounts::const_iterator pathIt = edgeIncomingNgramPaths.begin(); pathIt != edgeIncomingNgramPaths.end(); ++pathIt) {
00453 Path newNgramPath = pathIt->first;
00454 newNgramPath.push_back(this);
00455 storeNgramHistory(newNgram, newNgramPath, pathIt->second);
00456 }
00457 }
00458 }
00459 }
00460 }
00461 }
00462 return m_ngrams;
00463 }
00464
00465
00466 void Edge::GetPhraseSuffix(const Phrase& origPhrase, size_t lastN, Phrase& targetPhrase) const
00467 {
00468 size_t origSize = origPhrase.GetSize();
00469 size_t startIndex = origSize - lastN;
00470 for (size_t index = startIndex; index < origPhrase.GetSize(); ++index) {
00471 targetPhrase.AddWord(origPhrase.GetWord(index));
00472 }
00473 }
00474
00475 bool Edge::operator< (const Edge& compare ) const
00476 {
00477 if (m_headNode->GetId() < compare.m_headNode->GetId())
00478 return true;
00479 if (compare.m_headNode->GetId() < m_headNode->GetId())
00480 return false;
00481 if (m_tailNode->GetId() < compare.m_tailNode->GetId())
00482 return true;
00483 if (compare.m_tailNode->GetId() < m_tailNode->GetId())
00484 return false;
00485 return GetScore() < compare.GetScore();
00486 }
00487
00488 ostream& operator<< (ostream& out, const Edge& edge)
00489 {
00490 out << "Head: " << edge.m_headNode->GetId() << ", Tail: " << edge.m_tailNode->GetId() << ", Score: " << edge.m_score << ", Phrase: " << edge.m_targetPhrase << endl;
00491 return out;
00492 }
00493
00494 bool ascendingCoverageCmp(const Hypothesis* a, const Hypothesis* b)
00495 {
00496 return a->GetWordsBitmap().GetNumWordsCovered() < b->GetWordsBitmap().GetNumWordsCovered();
00497 }
00498
00499 void getLatticeMBRNBest(Manager& manager, TrellisPathList& nBestList,
00500 vector<LatticeMBRSolution>& solutions, size_t n)
00501 {
00502 const StaticData& staticData = StaticData::Instance();
00503 std::map < int, bool > connected;
00504 std::vector< const Hypothesis *> connectedList;
00505 map<Phrase, float> ngramPosteriors;
00506 std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
00507 map<const Hypothesis*, vector<Edge> > incomingEdges;
00508 vector< float> estimatedScores;
00509 manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
00510 pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
00511 calcNgramExpectations(connectedList, incomingEdges, ngramPosteriors,true);
00512
00513 vector<float> mbrThetas = staticData.GetLatticeMBRThetas();
00514 float p = staticData.GetLatticeMBRPrecision();
00515 float r = staticData.GetLatticeMBRPRatio();
00516 float mapWeight = staticData.GetLatticeMBRMapWeight();
00517 if (mbrThetas.size() == 0) {
00518 mbrThetas.push_back(-1);
00519 mbrThetas.push_back(1/(bleu_order*p));
00520 for (size_t i = 2; i <= bleu_order; ++i) {
00521 mbrThetas.push_back(mbrThetas[i-1] / r);
00522 }
00523 }
00524 IFVERBOSE(2) {
00525 VERBOSE(2,"Thetas: ");
00526 for (size_t i = 0; i < mbrThetas.size(); ++i) {
00527 VERBOSE(2,mbrThetas[i] << " ");
00528 }
00529 VERBOSE(2,endl);
00530 }
00531 TrellisPathList::const_iterator iter;
00532 size_t ctr = 0;
00533 LatticeMBRSolutionComparator comparator;
00534 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter, ++ctr) {
00535 const TrellisPath &path = **iter;
00536 solutions.push_back(LatticeMBRSolution(path,iter==nBestList.begin()));
00537 solutions.back().CalcScore(ngramPosteriors,mbrThetas,mapWeight);
00538 sort(solutions.begin(), solutions.end(), comparator);
00539 while (solutions.size() > n) {
00540 solutions.pop_back();
00541 }
00542 }
00543 VERBOSE(2,"LMBR Score: " << solutions[0].GetScore() << endl);
00544 }
00545
00546 vector<Word> doLatticeMBR(Manager& manager, TrellisPathList& nBestList)
00547 {
00548
00549 vector<LatticeMBRSolution> solutions;
00550 getLatticeMBRNBest(manager, nBestList, solutions,1);
00551 return solutions.at(0).GetWords();
00552 }
00553
00554 const TrellisPath doConsensusDecoding(Manager& manager, TrellisPathList& nBestList)
00555 {
00556 static const int BLEU_ORDER = 4;
00557 static const float SMOOTH = 1;
00558
00559
00560 const StaticData& staticData = StaticData::Instance();
00561 std::map < int, bool > connected;
00562 std::vector< const Hypothesis *> connectedList;
00563 map<Phrase, float> ngramExpectations;
00564 std::map < const Hypothesis*, set <const Hypothesis*> > outgoingHyps;
00565 map<const Hypothesis*, vector<Edge> > incomingEdges;
00566 vector< float> estimatedScores;
00567 manager.GetForwardBackwardSearchGraph(&connected, &connectedList, &outgoingHyps, &estimatedScores);
00568 pruneLatticeFB(connectedList, outgoingHyps, incomingEdges, estimatedScores, manager.GetBestHypothesis(), staticData.GetLatticeMBRPruningFactor(),staticData.GetMBRScale());
00569 calcNgramExpectations(connectedList, incomingEdges, ngramExpectations,false);
00570
00571
00572
00573 float ref_length = 0.0f;
00574 for (map<Phrase,float>::const_iterator ref_iter = ngramExpectations.begin();
00575 ref_iter != ngramExpectations.end(); ++ref_iter) {
00576
00577
00578 if (ref_iter->first.GetSize() == 1) {
00579 ref_length += exp(ref_iter->second);
00580
00581 }
00582 }
00583
00584 VERBOSE(2,"REF Length: " << ref_length << endl);
00585
00586
00587 TrellisPathList::const_iterator iter;
00588 TrellisPathList::const_iterator best = nBestList.end();
00589 float bestScore = -100000;
00590
00591 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00592 const TrellisPath &path = **iter;
00593 vector<Word> words;
00594 map<Phrase,int> ngrams;
00595 GetOutputWords(path,words);
00596
00597
00598
00599
00600
00601 extract_ngrams(words,ngrams);
00602
00603 vector<float> comps(2*BLEU_ORDER+1);
00604 float logbleu = 0.0;
00605 float brevity = 0.0;
00606 int hyp_length = words.size();
00607 for (int i = 0; i < BLEU_ORDER; ++i) {
00608 comps[2*i] = 0.0;
00609 comps[2*i+1] = max(hyp_length-i,0);
00610 }
00611
00612 for (map<Phrase,int>::const_iterator hyp_iter = ngrams.begin();
00613 hyp_iter != ngrams.end(); ++hyp_iter) {
00614 map<Phrase,float>::const_iterator ref_iter = ngramExpectations.find(hyp_iter->first);
00615 if (ref_iter != ngramExpectations.end()) {
00616 comps[2*(hyp_iter->first.GetSize()-1)] += min(exp(ref_iter->second), (float)(hyp_iter->second));
00617 }
00618
00619 }
00620 comps[comps.size()-1] = ref_length;
00621
00622
00623
00624
00625
00626
00627 float score = 0.0f;
00628 if (comps[0] != 0) {
00629 for (int i=0; i<BLEU_ORDER; i++) {
00630 if ( i > 0 ) {
00631 logbleu += log((float)comps[2*i]+SMOOTH)-log((float)comps[2*i+1]+SMOOTH);
00632 } else {
00633 logbleu += log((float)comps[2*i])-log((float)comps[2*i+1]);
00634 }
00635 }
00636 logbleu /= BLEU_ORDER;
00637 brevity = 1.0-(float)comps[comps.size()-1]/comps[1];
00638 if (brevity < 0.0) {
00639 logbleu += brevity;
00640 }
00641 score = exp(logbleu);
00642 }
00643
00644
00645 if (score > bestScore) {
00646 bestScore = score;
00647 best = iter;
00648 VERBOSE(2,"NEW BEST: " << score << endl);
00649
00650
00651
00652
00653 }
00654 }
00655
00656 assert (best != nBestList.end());
00657 return **best;
00658
00659
00660
00661 }
00662
00663