00001 #include "LexicalReorderingTable.h"
00002 #include "InputFileStream.h"
00003
00004
00005 #include "StaticData.h"
00006 #include "moses/TranslationModel/PhraseDictionary.h"
00007 #include "GenerationDictionary.h"
00008 #include "TargetPhrase.h"
00009 #include "TargetPhraseCollection.h"
00010
00011 #ifndef WIN32
00012 #include "TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
00013 #endif
00014
00015 namespace Moses
00016 {
00017
00018
00019
00020
00021 std::string auxClearString(const std::string& str)
00022 {
00023 int i = 0, j = str.size()-1;
00024 while(i <= j) {
00025 if(' ' != str[i]) {
00026 break;
00027 } else {
00028 ++i;
00029 }
00030 }
00031 while(j >= i) {
00032 if(' ' != str[j]) {
00033 break;
00034 } else {
00035 --j;
00036 }
00037 }
00038 return str.substr(i,j-i+1);
00039 }
00040
00041 void auxAppend(IPhrase& head, const IPhrase& tail)
00042 {
00043 head.reserve(head.size()+tail.size());
00044 for(size_t i = 0; i < tail.size(); ++i) {
00045 head.push_back(tail[i]);
00046 }
00047 }
00048
00049
00050
00051
00052 LexicalReorderingTable* LexicalReorderingTable::LoadAvailable(const std::string& filePath, const FactorList& f_factors, const FactorList& e_factors, const FactorList& c_factors)
00053 {
00054
00055 LexicalReorderingTable *compactLexr =
00056 LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
00057 if(compactLexr)
00058 return compactLexr;
00059 if(FileExists(filePath+".binlexr.idx")) {
00060
00061 return new LexicalReorderingTableTree(filePath, f_factors, e_factors, c_factors);
00062 } else {
00063
00064 return new LexicalReorderingTableMemory(filePath, f_factors, e_factors, c_factors);
00065 }
00066 }
00067
00068
00069
00070
00071 LexicalReorderingTableMemory::LexicalReorderingTableMemory(
00072 const std::string& filePath,
00073 const std::vector<FactorType>& f_factors,
00074 const std::vector<FactorType>& e_factors,
00075 const std::vector<FactorType>& c_factors)
00076 : LexicalReorderingTable(f_factors, e_factors, c_factors)
00077 {
00078 LoadFromFile(filePath);
00079 }
00080
00081 LexicalReorderingTableMemory::~LexicalReorderingTableMemory()
00082 {
00083 }
00084
00085 std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
00086 const Phrase& e,
00087 const Phrase& c)
00088 {
00089
00090
00091 TableType::const_iterator r;
00092 std::string key;
00093 if(0 == c.GetSize()) {
00094 key = MakeKey(f,e,c);
00095 r = m_Table.find(key);
00096 if(m_Table.end() != r) {
00097 return r->second;
00098 }
00099 } else {
00100
00101 for(size_t i = 0; i <= c.GetSize(); ++i) {
00102 Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
00103 key = MakeKey(f,e,sub_c);
00104 r = m_Table.find(key);
00105 if(m_Table.end() != r) {
00106 return r->second;
00107 }
00108 }
00109 }
00110 return Scores();
00111 }
00112
00113 void LexicalReorderingTableMemory::DbgDump(std::ostream* out) const
00114 {
00115 TableType::const_iterator i;
00116 for(i = m_Table.begin(); i != m_Table.end(); ++i) {
00117 *out << " key: '" << i->first << "' score: ";
00118 *out << "(num scores: " << (i->second).size() << ")";
00119 for(size_t j = 0; j < (i->second).size(); ++j) {
00120 *out << (i->second)[j] << " ";
00121 }
00122 *out << "\n";
00123 }
00124 };
00125
00126 std::string LexicalReorderingTableMemory::MakeKey(const Phrase& f,
00127 const Phrase& e,
00128 const Phrase& c) const
00129 {
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142 return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
00143 auxClearString(e.GetStringRep(m_FactorsE)),
00144 auxClearString(c.GetStringRep(m_FactorsC)));
00145 }
00146
00147 std::string LexicalReorderingTableMemory::MakeKey(const std::string& f,
00148 const std::string& e,
00149 const std::string& c) const
00150 {
00151 std::string key;
00152 if(!f.empty()) {
00153 key += f;
00154 }
00155 if(!m_FactorsE.empty()) {
00156 if(!key.empty()) {
00157 key += "|||";
00158 }
00159 key += e;
00160 }
00161 if(!m_FactorsC.empty()) {
00162 if(!key.empty()) {
00163 key += "|||";
00164 }
00165 key += c;
00166 }
00167 return key;
00168 }
00169
00170 void LexicalReorderingTableMemory::LoadFromFile(const std::string& filePath)
00171 {
00172 std::string fileName = filePath;
00173 if(!FileExists(fileName) && FileExists(fileName+".gz")) {
00174 fileName += ".gz";
00175 }
00176 InputFileStream file(fileName);
00177 std::string line(""), key("");
00178 int numScores = -1;
00179 std::cerr << "Loading table into memory...";
00180 while(!getline(file, line).eof()) {
00181 std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
00182 int t = 0 ;
00183 std::string f(""),e(""),c("");
00184
00185 if(!m_FactorsF.empty()) {
00186
00187 f = auxClearString(tokens.at(t));
00188 ++t;
00189 }
00190 if(!m_FactorsE.empty()) {
00191
00192 e = auxClearString(tokens.at(t));
00193 ++t;
00194 }
00195 if(!m_FactorsC.empty()) {
00196
00197 c = auxClearString(tokens.at(t));
00198 ++t;
00199 }
00200
00201 std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
00202
00203 if(-1 == numScores) {
00204 numScores = (int)p.size();
00205 }
00206 if((int)p.size() != numScores) {
00207 TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl);
00208 exit(0);
00209 }
00210 std::transform(p.begin(),p.end(),p.begin(),TransformScore);
00211 std::transform(p.begin(),p.end(),p.begin(),FloorScore);
00212
00213 m_Table[MakeKey(f,e,c)] = p;
00214 }
00215 std::cerr << "done.\n";
00216 }
00217
00218
00219
00220
00221 LexicalReorderingTableTree::LexicalReorderingTableTree(
00222 const std::string& filePath,
00223 const std::vector<FactorType>& f_factors,
00224 const std::vector<FactorType>& e_factors,
00225 const std::vector<FactorType>& c_factors)
00226 : LexicalReorderingTable(f_factors, e_factors, c_factors), m_UseCache(false), m_FilePath(filePath)
00227 {
00228 m_Table.reset(new PrefixTreeMap());
00229 m_Table->Read(m_FilePath+".binlexr");
00230 }
00231
00232 LexicalReorderingTableTree::~LexicalReorderingTableTree()
00233 {
00234 }
00235
00236 Scores LexicalReorderingTableTree::GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
00237 {
00238 if( (!m_FactorsF.empty() && 0 == f.GetSize())
00239 || (!m_FactorsE.empty() && 0 == e.GetSize())) {
00240
00241
00242
00243
00244 return Scores();
00245 }
00246 CacheType::iterator i;;
00247 if(m_UseCache) {
00248 std::pair<CacheType::iterator, bool> r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
00249 if(!r.second) {
00250 return auxFindScoreForContext((r.first)->second, c);
00251 }
00252 i = r.first;
00253 } else if(!m_Cache.empty()) {
00254
00255 i = m_Cache.find(MakeCacheKey(f,e));
00256 if(i != m_Cache.end()) {
00257 return auxFindScoreForContext(i->second, c);
00258 }
00259 }
00260
00261 Scores score;
00262 Candidates cands;
00263 m_Table->GetCandidates(MakeTableKey(f,e), &cands);
00264 if(cands.empty()) {
00265 return Scores();
00266 }
00267
00268 if(m_FactorsC.empty()) {
00269 CHECK(1 == cands.size());
00270 return cands[0].GetScore(0);
00271 } else {
00272 score = auxFindScoreForContext(cands, c);
00273 }
00274
00275 if(m_UseCache) {
00276 i->second = cands;
00277 }
00278 return score;
00279 };
00280
00281 Scores LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context)
00282 {
00283 if(m_FactorsC.empty()) {
00284 CHECK(cands.size() <= 1);
00285 return (1 == cands.size())?(cands[0].GetScore(0)):(Scores());
00286 } else {
00287 std::vector<std::string> cvec;
00288 for(size_t i = 0; i < context.GetSize(); ++i) {
00289
00290
00291
00292
00293 cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
00294 }
00295 IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
00296 IPhrase sub_c;
00297 IPhrase::iterator start = c.begin();
00298 for(size_t j = 0; j <= context.GetSize(); ++j, ++start) {
00299 sub_c.assign(start, c.end());
00300 for(size_t cand = 0; cand < cands.size(); ++cand) {
00301 IPhrase p = cands[cand].GetPhrase(0);
00302 if(cands[cand].GetPhrase(0) == sub_c) {
00303 return cands[cand].GetScore(0);
00304 }
00305 }
00306 }
00307 return Scores();
00308 }
00309 }
00310
00311 void LexicalReorderingTableTree::InitializeForInput(const InputType& input)
00312 {
00313 ClearCache();
00314 if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)) {
00315 Cache(*cn);
00316 } else if(Sentence const* s = dynamic_cast<Sentence const*>(&input)) {
00317
00318 DisableCache();
00319 }
00320 if (!m_Table.get()) {
00321
00322 m_Table.reset(new PrefixTreeMap());
00323 m_Table->Read(m_FilePath+".binlexr");
00324 }
00325 };
00326
00327 bool LexicalReorderingTableTree::Create(std::istream& inFile,
00328 const std::string& outFileName)
00329 {
00330 std::string line;
00331
00332 std::string
00333 ofn(outFileName+".binlexr.srctree"),
00334 oft(outFileName+".binlexr.tgtdata"),
00335 ofi(outFileName+".binlexr.idx"),
00336 ofsv(outFileName+".binlexr.voc0"),
00337 oftv(outFileName+".binlexr.voc1");
00338
00339
00340 FILE *os = fOpen(ofn.c_str(),"wb");
00341 FILE *ot = fOpen(oft.c_str(),"wb");
00342
00343
00344
00345 typedef PrefixTreeSA<LabelId,OFF_T> PSA;
00346 PSA *psa = new PSA;
00347 PSA::setDefault(InvalidOffT);
00348 WordVoc* voc[3];
00349
00350 LabelId currFirstWord = InvalidLabelId;
00351 IPhrase currKey;
00352
00353 Candidates cands;
00354 std::vector<OFF_T> vo;
00355 size_t lnc = 0;
00356 size_t numTokens = 0;
00357 size_t numKeyTokens = 0;
00358 while(getline(inFile, line)) {
00359 ++lnc;
00360 if(0 == lnc % 10000) {
00361 TRACE_ERR(".");
00362 }
00363 IPhrase key;
00364 Scores score;
00365
00366 std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
00367 std::string w;
00368 if(1 == lnc) {
00369
00370 numTokens = tokens.size();
00371 if(tokens.size() == 2) {
00372 numKeyTokens = 1;
00373 voc[0] = new WordVoc();
00374 voc[1] = 0;
00375 } else if(3 == tokens.size() || 4 == tokens.size()) {
00376 numKeyTokens = 2;
00377 voc[0] = new WordVoc();
00378 voc[1] = new WordVoc();
00379 voc[2] = voc[1];
00380 }
00381 } else {
00382
00383 CHECK(numTokens == tokens.size());
00384 }
00385 size_t phrase = 0;
00386 for(; phrase < numKeyTokens; ++phrase) {
00387
00388 if(phrase >=1) {
00389 key.push_back(PrefixTreeMap::MagicWord);
00390 }
00391 std::istringstream is(tokens[phrase]);
00392 while(is >> w) {
00393 key.push_back(voc[phrase]->add(w));
00394 }
00395 }
00396
00397 std::vector<IPhrase> tgt_phrases;
00398 tgt_phrases.resize(numTokens - numKeyTokens - 1);
00399 for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
00400 std::istringstream is(tokens[numKeyTokens + j]);
00401 while(is >> w) {
00402 tgt_phrases[j].push_back(voc[phrase]->add(w));
00403 }
00404 }
00405
00406 std::istringstream is(tokens[numTokens-1]);
00407 while(is >> w) {
00408 score.push_back(atof(w.c_str()));
00409 }
00410
00411 std::transform(score.begin(),score.end(),score.begin(),TransformScore);
00412 std::transform(score.begin(),score.end(),score.begin(),FloorScore);
00413 std::vector<Scores> scores;
00414 scores.push_back(score);
00415
00416 if(key.empty()) {
00417 TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
00418 continue;
00419 }
00420
00421 if(currFirstWord == InvalidLabelId) {
00422 currFirstWord = key[0];
00423 }
00424 if(currKey.empty()) {
00425 currKey = key;
00426
00427 CHECK(psa);
00428 PSA::Data& d = psa->insert(key);
00429 if(d == InvalidOffT) {
00430 d = fTell(ot);
00431 } else {
00432 TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
00433 return false;
00434 }
00435 }
00436 if(currKey != key) {
00437
00438 currKey = key;
00439
00440 cands.writeBin(ot);
00441 cands.clear();
00442
00443 if(key[0] != currFirstWord) {
00444
00445 PTF pf;
00446 if(currFirstWord >= vo.size()) {
00447 vo.resize(currFirstWord+1,InvalidOffT);
00448 }
00449 vo[currFirstWord] = fTell(os);
00450 pf.create(*psa, os);
00451
00452 delete psa;
00453 psa = new PSA;
00454 currFirstWord = key[0];
00455 }
00456
00457 CHECK(psa);
00458 PSA::Data& d = psa->insert(key);
00459 if(d == InvalidOffT) {
00460 d = fTell(ot);
00461 } else {
00462 TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
00463 return false;
00464 }
00465 }
00466 cands.push_back(GenericCandidate(tgt_phrases, scores));
00467 }
00468 if (lnc == 0) {
00469 TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
00470 return false;
00471 }
00472
00473 cands.writeBin(ot);
00474 cands.clear();
00475
00476 PTF pf;
00477 if(currFirstWord >= vo.size()) {
00478 vo.resize(currFirstWord+1,InvalidOffT);
00479 }
00480 vo[currFirstWord] = fTell(os);
00481 pf.create(*psa,os);
00482 delete psa;
00483 psa=0;
00484
00485 fClose(os);
00486 fClose(ot);
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501 FILE *oi = fOpen(ofi.c_str(),"wb");
00502 fWriteVector(oi,vo);
00503 fClose(oi);
00504
00505 if(voc[0]) {
00506 voc[0]->Write(ofsv);
00507 delete voc[0];
00508 }
00509 if(voc[1]) {
00510 voc[1]->Write(oftv);
00511 delete voc[1];
00512 }
00513 return true;
00514 }
00515
00516 std::string LexicalReorderingTableTree::MakeCacheKey(const Phrase& f,
00517 const Phrase& e) const
00518 {
00519 std::string key;
00520 if(!m_FactorsF.empty()) {
00521 key += auxClearString(f.GetStringRep(m_FactorsF));
00522 }
00523 if(!m_FactorsE.empty()) {
00524 if(!key.empty()) {
00525 key += "|||";
00526 }
00527 key += auxClearString(e.GetStringRep(m_FactorsE));
00528 }
00529 return key;
00530 };
00531
00532 IPhrase LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
00533 const Phrase& e) const
00534 {
00535 IPhrase key;
00536 std::vector<std::string> keyPart;
00537 if(!m_FactorsF.empty()) {
00538 for(size_t i = 0; i < f.GetSize(); ++i) {
00539
00540
00541
00542
00543 keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
00544 }
00545 auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
00546 keyPart.clear();
00547 }
00548 if(!m_FactorsE.empty()) {
00549 if(!key.empty()) {
00550 key.push_back(PrefixTreeMap::MagicWord);
00551 }
00552 for(size_t i = 0; i < e.GetSize(); ++i) {
00553
00554
00555
00556
00557 keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
00558 }
00559 auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
00560
00561 }
00562 return key;
00563 };
00564
00565
00566 struct State {
00567 State(PPimp* t, const std::string& p) : pos(t), path(p) {
00568 }
00569 PPimp* pos;
00570 std::string path;
00571 };
00572
00573 void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f)
00574 {
00575 if(m_FactorsE.empty()) {
00576
00577 Candidates cands;
00578 m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
00579 m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
00580 } else {
00581 ObjectPool<PPimp> pool;
00582 PPimp* pPos = m_Table->GetRoot();
00583
00584 for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i) {
00585
00586
00587
00588 pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
00589 }
00590 if(0 != pPos && pPos->isValid()) {
00591 pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
00592 }
00593 if(0 == pPos || !pPos->isValid()) {
00594 return;
00595 }
00596
00597 std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
00598
00599 std::vector<State> stack;
00600 stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
00601 Candidates cands;
00602 while(!stack.empty()) {
00603 if(stack.back().pos->isValid()) {
00604 LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
00605 std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
00606
00607 m_Table->GetCandidates(*stack.back().pos,&cands);
00608 if(!cands.empty()) {
00609 m_Cache[cache_key + auxClearString(next_path)] = cands;
00610 }
00611 cands.clear();
00612 PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
00613 ++stack.back().pos->idx;
00614 stack.push_back(State(next_pos,next_path));
00615 } else {
00616 stack.pop_back();
00617 }
00618 }
00619 }
00620 }
00621
00622 void LexicalReorderingTableTree::Cache(const ConfusionNet& )
00623 {
00624 return;
00625 }
00626
00627 void LexicalReorderingTableTree::Cache(const Sentence& input)
00628 {
00629
00630 size_t prev_cache_size = m_Cache.size();
00631 size_t max_phrase_length = input.GetSize();
00632 for(size_t len = 0; len <= max_phrase_length; ++len) {
00633 for(size_t start = 0; start+len <= input.GetSize(); ++start) {
00634 Phrase f = input.GetSubString(WordsRange(start, start+len));
00635 auxCacheForSrcPhrase(f);
00636 }
00637 }
00638 std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
00639 }
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651
00652
00653
00654
00655
00656
00657
00658
00659
00660
00661
00662
00663
00664
00665
00666
00667
00668
00669
00670
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707
00708
00709 }
00710