00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <algorithm>
00021 #include <cmath>
00022 #include <iterator>
00023
00024 #define BOOST_FILESYSTEM_VERSION 3
00025 #include <boost/filesystem.hpp>
00026 #include <boost/lexical_cast.hpp>
00027
00028 #include "util/exception.hh"
00029 #include "util/file_piece.hh"
00030
00031 #include "Scorer.h"
00032 #include "HopeFearDecoder.h"
00033
00034 using namespace std;
00035 namespace fs = boost::filesystem;
00036
00037 namespace MosesTuning
00038 {
00039
// Weighting applied to the hope hypothesis's BLEU term when deciding
// whether to rescale the model score ("safe hope", see
// NbestHopeFearDecoder::HopeFear).  Presumably tuned empirically — the
// value 5 is not derived from anything visible in this file.
static const ValType BLEU_RATIO = 5;
00041
// Read initial dense (and optionally sparse) feature weights from disk and
// build the starting MiraWeightVector.
//
// Dense file formats supported:
//   * deprecated Moses mert format: all values on one whitespace-separated
//     line (rejected when type == "hypergraph");
//   * 'name= value' format: one feature per line, value after the '='.
//
// Sparse weights are keyed by name and stored at SparseVector ids offset by
// the number of dense features.
//
// Returns {heap-allocated weight vector (ownership passes to caller),
//          number of dense features read}.
// Exits the process (code 3) on unreadable files or on sparse-without-dense.
std::pair<MiraWeightVector*,size_t>
InitialiseWeights(const string& denseInitFile, const string& sparseInitFile,
                  const string& type, bool verbose)
{

  vector<parameter_t> initParams;
  if(!denseInitFile.empty()) {
    ifstream opt(denseInitFile.c_str());
    string buffer;
    if (opt.fail()) {
      cerr << "could not open dense initfile: " << denseInitFile << endl;
      exit(3);
    }
    if (verbose) cerr << "Reading dense features:" << endl;
    parameter_t val;
    getline(opt,buffer);
    if (buffer.find_first_of("=") == buffer.npos) {
      // No '=' anywhere on the first line: deprecated one-line mert format.
      UTIL_THROW_IF(type == "hypergraph", util::Exception, "For hypergraph version, require dense features in 'name= value' format");
      cerr << "WARN: dense features in deprecated Moses mert format. Prefer 'name= value' format." << endl;
      istringstream strstrm(buffer);
      while(strstrm >> val) {
        initParams.push_back(val);
        if(verbose) cerr << val << endl;
      }
    } else {
      // 'name= value' format, one feature per line.
      vector<string> names;
      string last_name = "";
      size_t feature_ctr = 1;
      do {
        size_t equals = buffer.find_last_of("=");
        UTIL_THROW_IF(equals == buffer.npos, util::Exception, "Incorrect format in dense feature file: '"
                      << buffer << "'");
        string name = buffer.substr(0,equals);
        names.push_back(name);
        // NOTE: equals+2 skips both '=' and the following space, i.e. the
        // format is literally 'name= value'.
        initParams.push_back(boost::lexical_cast<ValType>(buffer.substr(equals+2)));

        // Consecutive repeats of the same feature name (multi-valued
        // features) are disambiguated by appending _1, _2, ... ; when the
        // second occurrence is seen, the first is retroactively renamed.
        if (name != last_name) feature_ctr = 1;
        last_name = name;
        if (feature_ctr>1) {
          stringstream namestr;
          namestr << names.back() << "_" << feature_ctr;
          names[names.size()-1] = namestr.str();
          if (feature_ctr == 2) {
            stringstream namestr;
            namestr << names[names.size()-2] << "_" << (feature_ctr-1);
            names[names.size()-2] = namestr.str();
          }
        }
        ++feature_ctr;

      } while(getline(opt,buffer));

      // Register each dense feature name with the SparseVector vocabulary;
      // ids are expected to come out as 0..n-1 in file order.
      for (size_t i = 0; i < names.size(); ++i) {
        size_t id = SparseVector::encode(names[i]);
        assert(id == i);
        if (verbose) cerr << names[i] << " " << initParams[i] << endl;
      }

    }

    opt.close();
  }
  size_t initDenseSize = initParams.size();

  if(!sparseInitFile.empty()) {
    if(initDenseSize==0) {
      cerr << "sparse initialization requires dense initialization" << endl;
      exit(3);
    }
    ifstream opt(sparseInitFile.c_str());
    if(opt.fail()) {
      cerr << "could not open sparse initfile: " << sparseInitFile << endl;
      exit(3);
    }
    int sparseCount=0;
    parameter_t val;
    std::string name;
    while(opt >> name >> val) {
      // Sparse ids live after the dense block; grow the vector with zeros
      // as needed so every id up to the largest seen is addressable.
      size_t id = SparseVector::encode(name) + initDenseSize;
      while(initParams.size()<=id) initParams.push_back(0.0);
      initParams[id] = val;
      sparseCount++;
    }
    cerr << "Found " << sparseCount << " initial sparse features" << endl;
    opt.close();
  }

  return pair<MiraWeightVector*,size_t>(new MiraWeightVector(initParams), initDenseSize);
}
00134
00135 ValType HopeFearDecoder::Evaluate(const AvgWeightVector& wv)
00136 {
00137 vector<ValType> stats(scorer_->NumberOfScores(),0);
00138 for(reset(); !finished(); next()) {
00139 vector<ValType> sent;
00140 MaxModel(wv,&sent);
00141 for(size_t i=0; i<sent.size(); i++) {
00142 stats[i]+=sent[i];
00143 }
00144 }
00145 return scorer_->calculateScore(stats);
00146 }
00147
00148 NbestHopeFearDecoder::NbestHopeFearDecoder(
00149 const vector<string>& featureFiles,
00150 const vector<string>& scoreFiles,
00151 bool streaming,
00152 bool no_shuffle,
00153 bool safe_hope,
00154 Scorer* scorer
00155 ) : safe_hope_(safe_hope)
00156 {
00157 scorer_ = scorer;
00158 if (streaming) {
00159 train_.reset(new StreamingHypPackEnumerator(featureFiles, scoreFiles));
00160 } else {
00161 train_.reset(new RandomAccessHypPackEnumerator(featureFiles, scoreFiles, no_shuffle));
00162 }
00163 }
00164
00165
// Advance the enumerator to the next sentence's hypothesis pack.
void NbestHopeFearDecoder::next()
{
  train_->next();
}
00170
// True once the enumerator has run past the last sentence.
bool NbestHopeFearDecoder::finished()
{
  return train_->finished();
}
00175
// Rewind the enumerator to the first sentence.
void NbestHopeFearDecoder::reset()
{
  train_->reset();
}
00180
// Select the hope, fear and model-best hypotheses for the current sentence
// from its n-best list:
//   hope  = argmax (hope_scale * model_score + bleu)
//   fear  = argmax (model_score - bleu)
//   model = argmax  model_score
// where bleu is the sentence-level score smoothed with backgroundBleu.
//
// With safe_hope_ enabled, a second pass may rescale the model component
// (hope_scale) so it cannot dominate the BLEU term: if |model| exceeds
// BLEU_RATIO * |bleu| for the selected hope, the loop reruns once with
// hope_scale = |BLEU_RATIO * bleu| / |model|; otherwise it breaks after
// the first pass.  At most two passes run.
void NbestHopeFearDecoder::HopeFear(
  const std::vector<ValType>& backgroundBleu,
  const MiraWeightVector& wv,
  HopeFearData* hopeFear
)
{

  ValType hope_scale = 1.0;
  size_t hope_index=0, fear_index=0, model_index=0;
  ValType hope_score=0, fear_score=0, model_score=0;
  for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
    ValType hope_bleu=0, hope_model=0;
    for(size_t i=0; i< train_->cur_size(); i++) {
      const MiraFeatureVector& vec=train_->featuresAt(i);
      ValType score = wv.score(vec);
      ValType bleu = scorer_->calculateSentenceLevelBackgroundScore(train_->scoresAt(i),backgroundBleu);

      // Hope: best model+bleu combination (i==0 seeds the running max).
      if(i==0 || (hope_scale*score + bleu) > hope_score) {
        hope_score = hope_scale*score + bleu;
        hope_index = i;
        hope_bleu = bleu;
        hope_model = score;
      }

      // Fear: high model score but low bleu.
      if(i==0 || (score - bleu) > fear_score) {
        fear_score = score - bleu;
        fear_index = i;
      }

      // Model-best: plain highest model score.
      if(i==0 || score > model_score) {
        model_score = score;
        model_index = i;
      }
    }

    // Safe-hope check: rerun once with a damped model component if the
    // model score overwhelms BLEU_RATIO times the bleu score.
    hope_bleu *= BLEU_RATIO;
    if(safe_hope_ && safe_loop==0 && abs(hope_model)>1e-8 && abs(hope_bleu)/abs(hope_model)<hope_scale)
      hope_scale = abs(hope_bleu) / abs(hope_model);
    else break;
  }
  hopeFear->modelFeatures = train_->featuresAt(model_index);
  hopeFear->hopeFeatures = train_->featuresAt(hope_index);
  hopeFear->fearFeatures = train_->featuresAt(fear_index);

  // Recompute the (unscaled) bleu scores for the selected hypotheses.
  hopeFear->hopeStats = train_->scoresAt(hope_index);
  hopeFear->hopeBleu = scorer_->calculateSentenceLevelBackgroundScore(hopeFear->hopeStats, backgroundBleu);
  const vector<float>& fear_stats = train_->scoresAt(fear_index);
  hopeFear->fearBleu = scorer_->calculateSentenceLevelBackgroundScore(fear_stats, backgroundBleu);

  hopeFear->modelStats = train_->scoresAt(model_index);
  hopeFear->hopeFearEqual = (hope_index == fear_index);
}
00236
00237 void NbestHopeFearDecoder::MaxModel(const AvgWeightVector& wv, std::vector<ValType>* stats)
00238 {
00239
00240 size_t max_index=0;
00241 ValType max_score=0;
00242 for(size_t i=0; i<train_->cur_size(); i++) {
00243 MiraFeatureVector vec(train_->featuresAt(i));
00244 ValType score = wv.score(vec);
00245 if(i==0 || score > max_score) {
00246 max_index = i;
00247 max_score = score;
00248 }
00249 }
00250 *stats = train_->scoresAt(max_index);
00251 }
00252
00253
00254
// Construct a hypergraph-based decoder: load references, then read and
// prune every hypergraph file found in hypergraphDir (one file per
// sentence, named by sentence id; the file named "weights" is skipped).
// Pruning keeps roughly hg_pruning edges per reference word, scored under
// the initial weights.  Streaming mode is not supported.
HypergraphHopeFearDecoder::HypergraphHopeFearDecoder
(
  const string& hypergraphDir,
  const vector<string>& referenceFiles,
  size_t num_dense,
  bool streaming,
  bool no_shuffle,
  bool safe_hope,
  size_t hg_pruning,
  const MiraWeightVector& wv,
  Scorer* scorer
) :
  num_dense_(num_dense)
{

  UTIL_THROW_IF(streaming, util::Exception, "Streaming not currently supported for hypergraphs");
  UTIL_THROW_IF(!fs::exists(hypergraphDir), HypergraphException, "Directory '" << hypergraphDir << "' does not exist");
  UTIL_THROW_IF(!referenceFiles.size(), util::Exception, "No reference files supplied");
  references_.Load(referenceFiles, vocab_);

  SparseVector weights;
  wv.ToSparse(&weights,num_dense_);
  scorer_ = scorer;

  static const string kWeights = "weights";
  fs::directory_iterator dend;  // default-constructed == end iterator
  size_t fileCount = 0;

  cerr << "Reading hypergraphs" << endl;
  for (fs::directory_iterator di(hypergraphDir); di != dend; ++di) {
    const fs::path& hgpath = di->path();
    if (hgpath.filename() == kWeights) continue;

    Graph graph(vocab_);
    // Sentence id is the file's stem, e.g. "12.gz" -> 12.
    size_t id = boost::lexical_cast<size_t>(hgpath.stem().string());
    util::scoped_fd fd(util::OpenReadOrThrow(hgpath.string().c_str()));

    // FilePiece takes ownership of the descriptor, hence release().
    util::FilePiece file(fd.release());
    ReadGraph(file,graph);

    // Prune to a size proportional to the reference length.
    size_t edgeCount = hg_pruning * references_.Length(id);
    boost::shared_ptr<Graph> prunedGraph;
    prunedGraph.reset(new Graph(vocab_));
    graph.Prune(prunedGraph.get(), weights, edgeCount);
    graphs_[id] = prunedGraph;

    // Progress indicator on stderr.
    ++fileCount;
    if (fileCount % 10 == 0) cerr << ".";
    if (fileCount % 400 == 0) cerr << " [count=" << fileCount << "]\n";
  }
  cerr << endl << "Done" << endl;

  // Visit sentences in shuffled order unless no_shuffle was requested.
  sentenceIds_.resize(graphs_.size());
  for (size_t i = 0; i < graphs_.size(); ++i) sentenceIds_[i] = i;
  if (!no_shuffle) {
    random_shuffle(sentenceIds_.begin(), sentenceIds_.end());
  }

}
00315
// Rewind iteration to the first (possibly shuffled) sentence id.
void HypergraphHopeFearDecoder::reset()
{
  sentenceIdIter_ = sentenceIds_.begin();
}
00320
// Advance to the next sentence id.
void HypergraphHopeFearDecoder::next()
{
  sentenceIdIter_++;
}
00325
// True once every sentence id has been visited.
bool HypergraphHopeFearDecoder::finished()
{
  return sentenceIdIter_ == sentenceIds_.end();
}
00330
00331 void HypergraphHopeFearDecoder::HopeFear(
00332 const vector<ValType>& backgroundBleu,
00333 const MiraWeightVector& wv,
00334 HopeFearData* hopeFear
00335 )
00336 {
00337 size_t sentenceId = *sentenceIdIter_;
00338 SparseVector weights;
00339 wv.ToSparse(&weights, num_dense_);
00340 const Graph& graph = *(graphs_[sentenceId]);
00341
00342
00343 HgHypothesis hopeHypo, fearHypo, modelHypo;
00344 for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
00345
00346
00347 Viterbi(graph, weights, 1, references_, sentenceId, backgroundBleu, &hopeHypo);
00348
00349
00350 Viterbi(graph, weights, -1, references_, sentenceId, backgroundBleu, &fearHypo);
00351
00352
00353 Viterbi(graph, weights, 0, references_, sentenceId, backgroundBleu, &modelHypo);
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363 break;
00364 }
00365
00366 hopeFear->modelFeatures = MiraFeatureVector(modelHypo.featureVector, num_dense_);
00367 hopeFear->hopeFeatures = MiraFeatureVector(hopeHypo.featureVector, num_dense_);
00368 hopeFear->fearFeatures = MiraFeatureVector(fearHypo.featureVector, num_dense_);
00369
00370
00371
00372
00373
00374 vector<ValType> fearStats(scorer_->NumberOfScores());
00375 hopeFear->hopeStats.reserve(scorer_->NumberOfScores());
00376 hopeFear->modelStats.reserve(scorer_->NumberOfScores());
00377 for (size_t i = 0; i < fearStats.size(); ++i) {
00378 hopeFear->modelStats.push_back(modelHypo.bleuStats[i]);
00379 hopeFear->hopeStats.push_back(hopeHypo.bleuStats[i]);
00380
00381 fearStats[i] = fearHypo.bleuStats[i];
00382 }
00383
00384
00385
00386
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402
00403
00404
00405
00406
00407
00408
00409
00410
00411
00412 hopeFear->hopeBleu = sentenceLevelBackgroundBleu(hopeFear->hopeStats, backgroundBleu);
00413 hopeFear->fearBleu = sentenceLevelBackgroundBleu(fearStats, backgroundBleu);
00414
00415
00416 hopeFear->hopeFearEqual = true;
00417 if (hopeFear->hopeFearEqual) {
00418 for (size_t i = 0; i < fearStats.size(); ++i) {
00419 if (fearStats[i] != hopeFear->hopeStats[i]) {
00420 hopeFear->hopeFearEqual = false;
00421 break;
00422 }
00423 }
00424 }
00425 hopeFear->hopeFearEqual = hopeFear->hopeFearEqual && (hopeFear->fearFeatures == hopeFear->hopeFeatures);
00426 }
00427
00428 void HypergraphHopeFearDecoder::MaxModel(const AvgWeightVector& wv, vector<ValType>* stats)
00429 {
00430 assert(!finished());
00431 HgHypothesis bestHypo;
00432 size_t sentenceId = *sentenceIdIter_;
00433 SparseVector weights;
00434 wv.ToSparse(&weights, num_dense_);
00435 vector<ValType> bg(scorer_->NumberOfScores());
00436
00437 Viterbi(*(graphs_[sentenceId]), weights, 0, references_, sentenceId, bg, &bestHypo);
00438 stats->resize(bestHypo.bleuStats.size());
00439
00440
00441
00442
00443
00444
00445 for (size_t i = 0; i < bestHypo.bleuStats.size(); ++i) {
00446 (*stats)[i] = bestHypo.bleuStats[i];
00447 }
00448 }
00449
00450
00451
00452 };