00001 #include "BleuScorer.h"
00002
00003 #include <algorithm>
00004 #include <cassert>
00005 #include <cmath>
00006 #include <climits>
00007 #include <fstream>
00008 #include <iostream>
00009 #include <stdexcept>
00010
00011 #include "util/exception.hh"
00012 #include "Ngram.h"
00013 #include "Reference.h"
00014 #include "Util.h"
00015 #include "ScoreDataIterator.h"
00016 #include "FeatureDataIterator.h"
00017 #include "Vocabulary.h"
00018
00019 using namespace std;
00020
// File-local string constants used when parsing the scorer configuration.
namespace
{

// Configuration key looked up by the BleuScorer constructor.
const char KEY_REFLEN[] = "reflen";

// Values of KEY_REFLEN that the BleuScorer constructor recognises.
const char REFLEN_CLOSEST[] = "closest";
const char REFLEN_SHORTEST[] = "shortest";
const char REFLEN_AVERAGE[] = "average";

} // namespace
00031
00032 namespace MosesTuning
00033 {
00034
00035
00036 BleuScorer::BleuScorer(const string& config)
00037 : StatisticsBasedScorer("BLEU", config),
00038 m_ref_length_type(CLOSEST)
00039 {
00040 const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
00041 if (reflen == REFLEN_AVERAGE) {
00042 m_ref_length_type = AVERAGE;
00043 } else if (reflen == REFLEN_SHORTEST) {
00044 m_ref_length_type = SHORTEST;
00045 } else if (reflen == REFLEN_CLOSEST) {
00046 m_ref_length_type = CLOSEST;
00047 } else {
00048 UTIL_THROW2("Unknown reference length strategy: " + reflen);
00049 }
00050 }
00051
00052 BleuScorer::~BleuScorer() {}
00053
00054 size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
00055 unsigned int n, bool is_testing) const
00056 {
00057 assert(n > 0);
00058 vector<int> encoded_tokens;
00059
00060
00061
00062
00063
00064
00065 if (is_testing) {
00066 TokenizeAndEncodeTesting(line, encoded_tokens);
00067 } else {
00068 TokenizeAndEncode(line, encoded_tokens);
00069 }
00070 const size_t len = encoded_tokens.size();
00071 vector<int> ngram;
00072
00073 for (size_t k = 1; k <= n; ++k) {
00074
00075 if (k > len) {
00076 continue;
00077 }
00078 for (size_t i = 0; i < len - k + 1; ++i) {
00079 ngram.clear();
00080 ngram.reserve(len);
00081 for (size_t j = i; j < i+k && j < len; ++j) {
00082 ngram.push_back(encoded_tokens[j]);
00083 }
00084 counts.Add(ngram);
00085 }
00086 }
00087 return len;
00088 }
00089
00090 void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
00091 {
00092
00093 m_references.reset();
00094 mert::VocabularyFactory::GetVocabulary()->clear();
00095
00096
00097 for (size_t i = 0; i < referenceFiles.size(); ++i) {
00098 TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
00099
00100 ifstream ifs(referenceFiles[i].c_str());
00101 if (!OpenReferenceStream(&ifs, i)) {
00102 UTIL_THROW2("Cannot open " + referenceFiles[i]);
00103 }
00104 }
00105 }
00106
00107 bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id)
00108 {
00109 if (is == NULL) return false;
00110
00111 string line;
00112 size_t sid = 0;
00113 while (getline(*is, line)) {
00114
00115
00116 line = preprocessSentence(line);
00117 if (file_id == 0) {
00118 Reference* ref = new Reference;
00119 m_references.push_back(ref);
00120 }
00121 UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
00122
00123 ProcessReferenceLine(line, m_references[sid]);
00124
00125 if (sid > 0 && sid % 100 == 0) {
00126 TRACE_ERR(".");
00127 }
00128 ++sid;
00129 }
00130 return true;
00131 }
00132
00133 void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
00134 {
00135 NgramCounts counts;
00136 size_t length = CountNgrams(line, counts, kBleuNgramOrder);
00137
00138
00139 for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
00140 const NgramCounts::Key& ngram = ci->first;
00141 const NgramCounts::Value newcount = ci->second;
00142
00143 NgramCounts::Value oldcount = 0;
00144 ref->get_counts()->Lookup(ngram, &oldcount);
00145 if (newcount > oldcount) {
00146 ref->get_counts()->operator[](ngram) = newcount;
00147 }
00148 }
00149
00150 ref->push_back(length);
00151 }
00152
00153 bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const
00154 {
00155 for (vector<boost::shared_ptr<ifstream> >::iterator ifs=referenceStreams.begin(); ifs!=referenceStreams.end(); ++ifs) {
00156 if (!(*ifs)) return false;
00157 string line;
00158 if (!getline(**ifs, line)) return false;
00159 line = preprocessSentence(line);
00160 ProcessReferenceLine(line, &ref);
00161 }
00162 return true;
00163 }
00164
00165 void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00166 {
00167 UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
00168 CalcBleuStats(*(m_references[sid]), text, entry);
00169 }
00170
00171 void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
00172 {
00173 NgramCounts testcounts;
00174
00175 vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
00176 string sentence = preprocessSentence(text);
00177 const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder, true);
00178
00179 const int reference_len = CalcReferenceLength(ref, length);
00180 stats.push_back(reference_len);
00181
00182
00183 for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
00184 testcounts_it != testcounts.end(); ++testcounts_it) {
00185 const NgramCounts::Value guess = testcounts_it->second;
00186 const size_t len = testcounts_it->first.size();
00187 NgramCounts::Value correct = 0;
00188
00189 NgramCounts::Value v = 0;
00190 if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
00191 correct = min(v, guess);
00192 }
00193 stats[len * 2 - 2] += correct;
00194 stats[len * 2 - 1] += guess;
00195 }
00196 entry.set(stats);
00197 }
00198
00199 statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) const
00200 {
00201 UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
00202
00203 float logbleu = 0.0;
00204 for (std::size_t i = 0; i < kBleuNgramOrder; ++i) {
00205 if (comps[2*i] == 0) {
00206 return 0.0;
00207 }
00208 logbleu += log(comps[2*i]) - log(comps[2*i+1]);
00209
00210 }
00211 logbleu /= kBleuNgramOrder;
00212
00213 const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
00214 if (brevity < 0.0) {
00215 logbleu += brevity;
00216 }
00217 return exp(logbleu);
00218 }
00219
00220 int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
00221 {
00222 switch (m_ref_length_type) {
00223 case AVERAGE:
00224 return ref.CalcAverage();
00225 break;
00226 case CLOSEST:
00227 return ref.CalcClosest(length);
00228 break;
00229 case SHORTEST:
00230 return ref.CalcShortest();
00231 break;
00232 default:
00233 UTIL_THROW2("Unknown reference types");
00234 }
00235 }
00236
00237 void BleuScorer::DumpCounts(ostream* os,
00238 const NgramCounts& counts) const
00239 {
00240 for (NgramCounts::const_iterator it = counts.begin();
00241 it != counts.end(); ++it) {
00242 *os << "(";
00243 const NgramCounts::Key& keys = it->first;
00244 for (size_t i = 0; i < keys.size(); ++i) {
00245 if (i != 0) {
00246 *os << " ";
00247 }
00248 *os << keys[i];
00249 }
00250 *os << ") : " << it->second << ", ";
00251 }
00252 *os << endl;
00253 }
00254
00255 float smoothedSentenceBleu
00256 (const std::vector<float>& stats, float smoothing, bool smoothBP)
00257 {
00258 UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
00259
00260 float logbleu = 0.0;
00261 for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
00262 logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
00263 }
00264 logbleu /= kBleuNgramOrder;
00265 const float reflength = stats[(kBleuNgramOrder * 2)] +
00266 (smoothBP ? smoothing : 0.0f);
00267 const float brevity = 1.0 - reflength / stats[1];
00268
00269 if (brevity < 0.0) {
00270 logbleu += brevity;
00271 }
00272 return exp(logbleu);
00273 }
00274
00275 float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
00276 {
00277
00278 UTIL_THROW_IF(sent.size()!=bg.size(), util::Exception, "Error");
00279 UTIL_THROW_IF(sent.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
00280 std::vector<float> stats(sent.size());
00281
00282 for(size_t i=0; i<sent.size(); i++)
00283 stats[i] = sent[i]+bg[i];
00284
00285
00286 float logbleu = 0.0;
00287 for (std::size_t j = 0; j < kBleuNgramOrder; j++) {
00288 logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
00289 }
00290 logbleu /= kBleuNgramOrder;
00291 const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
00292
00293 if (brevity < 0.0) {
00294 logbleu += brevity;
00295 }
00296
00297
00298 return exp(logbleu) * stats[kBleuNgramOrder*2];
00299 }
00300
00301 vector<float> BleuScorer::ScoreNbestList(const string& scoreFile, const string& featureFile)
00302 {
00303 vector<string> scoreFiles;
00304 vector<string> featureFiles;
00305 scoreFiles.push_back(scoreFile);
00306 featureFiles.push_back(featureFile);
00307
00308 vector<FeatureDataIterator> featureDataIters;
00309 vector<ScoreDataIterator> scoreDataIters;
00310 for (size_t i = 0; i < featureFiles.size(); ++i) {
00311 featureDataIters.push_back(FeatureDataIterator(featureFiles[i]));
00312 scoreDataIters.push_back(ScoreDataIterator(scoreFiles[i]));
00313 }
00314
00315 vector<pair<size_t,size_t> > hypotheses;
00316 UTIL_THROW_IF2(featureDataIters[0] == FeatureDataIterator::end(),
00317 "At the end of feature data iterator");
00318 for (size_t i = 0; i < featureFiles.size(); ++i) {
00319 UTIL_THROW_IF2(featureDataIters[i] == FeatureDataIterator::end(),
00320 "Feature file " << i << " ended prematurely");
00321 UTIL_THROW_IF2(scoreDataIters[i] == ScoreDataIterator::end(),
00322 "Score file " << i << " ended prematurely");
00323 UTIL_THROW_IF2(featureDataIters[i]->size() != scoreDataIters[i]->size(),
00324 "Features and scores have different size");
00325 for (size_t j = 0; j < featureDataIters[i]->size(); ++j) {
00326 hypotheses.push_back(pair<size_t,size_t>(i,j));
00327 }
00328 }
00329
00330
00331 vector<float> bleuScores;
00332 for (size_t i=0; i < hypotheses.size(); ++i) {
00333 pair<size_t,size_t> translation = hypotheses[i];
00334 float bleu = smoothedSentenceBleu(scoreDataIters[translation.first]->operator[](translation.second));
00335 bleuScores.push_back(bleu);
00336 }
00337 return bleuScores;
00338 }
00339
00340
00341
00342 }