00001
00002
00003 #include "LexicalReorderingTable.h"
00004 #include "moses/InputFileStream.h"
00005 #include "moses/StaticData.h"
00006 #include "moses/TranslationModel/PhraseDictionary.h"
00007 #include "moses/GenerationDictionary.h"
00008 #include "moses/TargetPhrase.h"
00009 #include "moses/TargetPhraseCollection.h"
00010 #include "moses/TranslationTask.h"
00011
00012 #if !defined WIN32 || defined __MINGW32__ || defined HAVE_CMPH
00013 #include "moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
00014 #endif
00015
00016 namespace Moses
00017 {
00018
00019
00020 std::string auxClearString(const std::string& str)
00021 {
00022 int i = 0, j = str.size()-1;
00023 while(i <= j) {
00024 if(' ' != str[i]) {
00025 break;
00026 } else {
00027 ++i;
00028 }
00029 }
00030 while(j >= i) {
00031 if(' ' != str[j]) {
00032 break;
00033 } else {
00034 --j;
00035 }
00036 }
00037 return str.substr(i,j-i+1);
00038 }
00039
00040 void auxAppend(IPhrase& head, const IPhrase& tail)
00041 {
00042 head.reserve(head.size()+tail.size());
00043 for(size_t i = 0; i < tail.size(); ++i) {
00044 head.push_back(tail[i]);
00045 }
00046 }
00047
00048 LexicalReorderingTable*
00049 LexicalReorderingTable::
00050 LoadAvailable(const std::string& filePath,
00051 const FactorList& f_factors,
00052 const FactorList& e_factors,
00053 const FactorList& c_factors)
00054 {
00055
00056 #ifdef HAVE_CMPH
00057 LexicalReorderingTable *compactLexr = NULL;
00058 compactLexr = LexicalReorderingTableCompact::CheckAndLoad(filePath + ".minlexr", f_factors, e_factors, c_factors);
00059 if(compactLexr)
00060 return compactLexr;
00061 #endif
00062 LexicalReorderingTable* ret;
00063 if (FileExists(filePath+".binlexr.idx") )
00064 ret = new LexicalReorderingTableTree(filePath, f_factors,
00065 e_factors, c_factors);
00066 else
00067 ret = new LexicalReorderingTableMemory(filePath, f_factors,
00068 e_factors, c_factors);
00069 return ret;
00070 }
00071
00072 LexicalReorderingTableMemory::
00073 LexicalReorderingTableMemory(const std::string& filePath,
00074 const std::vector<FactorType>& f_factors,
00075 const std::vector<FactorType>& e_factors,
00076 const std::vector<FactorType>& c_factors)
00077 : LexicalReorderingTable(f_factors, e_factors, c_factors)
00078 {
00079 LoadFromFile(filePath);
00080 }
00081
00082 LexicalReorderingTableMemory::
00083 ~LexicalReorderingTableMemory() { }
00084
00085 std::vector<float>
00086 LexicalReorderingTableMemory::GetScore(const Phrase& f,
00087 const Phrase& e,
00088 const Phrase& c)
00089 {
00090
00091
00092 TableType::const_iterator r;
00093 std::string key;
00094 if(0 == c.GetSize()) {
00095 key = MakeKey(f,e,c);
00096 r = m_Table.find(key);
00097 if(m_Table.end() != r) {
00098 return r->second;
00099 }
00100 } else {
00101
00102 for(size_t i = 0; i <= c.GetSize(); ++i) {
00103 Phrase sub_c(c.GetSubString(Range(i,c.GetSize()-1)));
00104 key = MakeKey(f,e,sub_c);
00105 r = m_Table.find(key);
00106 if(m_Table.end() != r) {
00107 return r->second;
00108 }
00109 }
00110 }
00111 return Scores();
00112 }
00113
00114 void
00115 LexicalReorderingTableMemory::
00116 DbgDump(std::ostream* out) const
00117 {
00118 TableType::const_iterator i;
00119 for(i = m_Table.begin(); i != m_Table.end(); ++i) {
00120 *out << " key: '" << i->first << "' score: ";
00121 *out << "(num scores: " << (i->second).size() << ")";
00122 for(size_t j = 0; j < (i->second).size(); ++j)
00123 *out << (i->second)[j] << " ";
00124
00125 *out << "\n";
00126 }
00127 };
00128
00129 std::string
00130 LexicalReorderingTableMemory::MakeKey(const Phrase& f,
00131 const Phrase& e,
00132 const Phrase& c) const
00133 {
00134 return MakeKey(auxClearString(f.GetStringRep(m_FactorsF)),
00135 auxClearString(e.GetStringRep(m_FactorsE)),
00136 auxClearString(c.GetStringRep(m_FactorsC)));
00137 }
00138
00139 std::string
00140 LexicalReorderingTableMemory::MakeKey(const std::string& f,
00141 const std::string& e,
00142 const std::string& c) const
00143 {
00144 std::string key;
00145 if(!f.empty()) key += f;
00146 if(!m_FactorsE.empty()) {
00147 if(!key.empty()) {
00148 key += "|||";
00149 }
00150 key += e;
00151 }
00152 if(!m_FactorsC.empty()) {
00153 if(!key.empty()) {
00154 key += "|||";
00155 }
00156 key += c;
00157 }
00158 return key;
00159 }
00160
00161 void
00162 LexicalReorderingTableMemory::
00163 LoadFromFile(const std::string& filePath)
00164 {
00165 std::string fileName = filePath;
00166 if(!FileExists(fileName) && FileExists(fileName+".gz"))
00167 fileName += ".gz";
00168
00169 InputFileStream file(fileName);
00170 std::string line(""), key("");
00171 int numScores = -1;
00172 std::cerr << "Loading table into memory...";
00173 while(!getline(file, line).eof()) {
00174 std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
00175 int t = 0 ;
00176 std::string f(""),e(""),c("");
00177
00178 if(!m_FactorsF.empty()) {
00179
00180 f = auxClearString(tokens.at(t));
00181 ++t;
00182 }
00183 if(!m_FactorsE.empty()) {
00184
00185 e = auxClearString(tokens.at(t));
00186 ++t;
00187 }
00188 if(!m_FactorsC.empty()) {
00189
00190 c = auxClearString(tokens.at(t));
00191 ++t;
00192 }
00193
00194 std::vector<float> p = Scan<float>(Tokenize(tokens.at(t)));
00195
00196 if(-1 == numScores) {
00197 numScores = (int)p.size();
00198 }
00199 if((int)p.size() != numScores) {
00200 TRACE_ERR( "found inconsistent number of probabilities... found "
00201 << p.size() << " expected " << numScores << std::endl);
00202 exit(0);
00203 }
00204 std::transform(p.begin(),p.end(),p.begin(),TransformScore);
00205 std::transform(p.begin(),p.end(),p.begin(),FloorScore);
00206
00207 m_Table[MakeKey(f,e,c)] = p;
00208 }
00209 std::cerr << "done.\n";
00210 }
00211
00212 LexicalReorderingTableTree::
00213 LexicalReorderingTableTree(const std::string& filePath,
00214 const std::vector<FactorType>& f_factors,
00215 const std::vector<FactorType>& e_factors,
00216 const std::vector<FactorType>& c_factors)
00217 : LexicalReorderingTable(f_factors, e_factors, c_factors)
00218 , m_UseCache(false)
00219 , m_FilePath(filePath)
00220 {
00221 m_Table.reset(new PrefixTreeMap());
00222 m_Table->Read(m_FilePath+".binlexr");
00223 }
00224
00225 LexicalReorderingTableTree::
00226 ~LexicalReorderingTableTree()
00227 { }
00228
00229 Scores
00230 LexicalReorderingTableTree::
00231 GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
00232 {
00233 if((!m_FactorsF.empty() && 0 == f.GetSize())
00234 || (!m_FactorsE.empty() && 0 == e.GetSize())) {
00235
00236
00237
00238
00239 return Scores();
00240 }
00241
00242 CacheType::iterator i;
00243
00244 if(m_UseCache) {
00245 std::pair<CacheType::iterator, bool> r;
00246 r = m_Cache.insert(std::make_pair(MakeCacheKey(f,e),Candidates()));
00247 if(!r.second) return auxFindScoreForContext((r.first)->second, c);
00248 i = r.first;
00249 } else if((i = m_Cache.find(MakeCacheKey(f,e))) != m_Cache.end())
00250
00251 return auxFindScoreForContext(i->second, c);
00252
00253
00254 Candidates cands;
00255 m_Table->GetCandidates(MakeTableKey(f,e), &cands);
00256 if(cands.empty()) return Scores();
00257 if(m_UseCache) i->second = cands;
00258
00259 if(m_FactorsC.empty()) {
00260 UTIL_THROW_IF2(1 != cands.size(), "Error");
00261 return cands[0].GetScore(0);
00262 } else return auxFindScoreForContext(cands, c);
00263 };
00264
00265 Scores
00266 LexicalReorderingTableTree::
00267 auxFindScoreForContext(const Candidates& cands, const Phrase& context)
00268 {
00269 if(m_FactorsC.empty()) {
00270 UTIL_THROW_IF2(cands.size() > 1, "Error");
00271 return (cands.size() == 1) ? cands[0].GetScore(0) : Scores();
00272 } else {
00273 std::vector<std::string> cvec;
00274 for(size_t i = 0; i < context.GetSize(); ++i)
00275 cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
00276
00277 IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
00278 IPhrase sub_c;
00279 IPhrase::iterator start = c.begin();
00280 for(size_t j = 0; j <= context.GetSize(); ++j, ++start) {
00281 sub_c.assign(start, c.end());
00282 for(size_t cand = 0; cand < cands.size(); ++cand) {
00283 IPhrase p = cands[cand].GetPhrase(0);
00284 if(cands[cand].GetPhrase(0) == sub_c)
00285 return cands[cand].GetScore(0);
00286 }
00287 }
00288 return Scores();
00289 }
00290 }
00291
00292 void
00293 LexicalReorderingTableTree::
00294 InitializeForInput(ttasksptr const& ttask)
00295 {
00296 const InputType& input = *ttask->GetSource();
00297 ClearCache();
00298 if(ConfusionNet const* cn = dynamic_cast<ConfusionNet const*>(&input)) {
00299 Cache(*cn);
00300 } else if (dynamic_cast<Sentence const*>(&input)) {
00301
00302 DisableCache();
00303 }
00304 if (!m_Table.get()) {
00305
00306 m_Table.reset(new PrefixTreeMap());
00307 m_Table->Read(m_FilePath+".binlexr");
00308 }
00309 };
00310
00311 bool
00312 LexicalReorderingTableTree::
00313 Create(std::istream& inFile, const std::string& outFileName)
00314 {
00315 typedef PrefixTreeSA<LabelId,OFF_T> PSA;
00316
00317 std::string
00318 line,
00319 ofn(outFileName+".binlexr.srctree"),
00320 oft(outFileName+".binlexr.tgtdata"),
00321 ofi(outFileName+".binlexr.idx"),
00322 ofsv(outFileName+".binlexr.voc0"),
00323 oftv(outFileName+".binlexr.voc1");
00324
00325 FILE *os = fOpen(ofn.c_str(),"wb");
00326 FILE *ot = fOpen(oft.c_str(),"wb");
00327
00328 PSA *psa = new PSA;
00329 PSA::setDefault(InvalidOffT);
00330 WordVoc* voc[3];
00331
00332 LabelId currFirstWord = InvalidLabelId;
00333 IPhrase currKey;
00334
00335 Candidates cands;
00336 std::vector<OFF_T> vo;
00337 size_t lnc = 0;
00338 size_t numTokens = 0;
00339 size_t numKeyTokens = 0;
00340 while(getline(inFile, line)) {
00341 ++lnc;
00342 if(0 == lnc % 10000) TRACE_ERR(".");
00343 IPhrase key;
00344 Scores score;
00345
00346 std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
00347 std::string w;
00348 if(1 == lnc) {
00349
00350 numTokens = tokens.size();
00351 if(tokens.size() == 2) {
00352
00353 numKeyTokens = 1;
00354 voc[0] = new WordVoc();
00355 voc[1] = 0;
00356 } else if(3 == tokens.size() || 4 == tokens.size()) {
00357
00358 numKeyTokens = 2;
00359 voc[0] = new WordVoc();
00360 voc[1] = new WordVoc();
00361 voc[2] = voc[1];
00362 }
00363 } else {
00364
00365 UTIL_THROW_IF2(numTokens != tokens.size(),
00366 "Lines do not have the same number of tokens");
00367 }
00368 size_t phrase = 0;
00369 for(; phrase < numKeyTokens; ++phrase) {
00370
00371 if(phrase >=1) key.push_back(PrefixTreeMap::MagicWord);
00372 std::istringstream is(tokens[phrase]);
00373 while(is >> w) key.push_back(voc[phrase]->add(w));
00374 }
00375
00376
00377 std::vector<IPhrase> tgt_phrases;
00378 tgt_phrases.resize(numTokens - numKeyTokens - 1);
00379 for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
00380 std::istringstream is(tokens[numKeyTokens + j]);
00381 while(is >> w) tgt_phrases[j].push_back(voc[phrase]->add(w));
00382 }
00383
00384
00385 std::istringstream is(tokens[numTokens-1]);
00386 while(is >> w) score.push_back(atof(w.c_str()));
00387
00388
00389 std::transform(score.begin(),score.end(),score.begin(),TransformScore);
00390 std::transform(score.begin(),score.end(),score.begin(),FloorScore);
00391 std::vector<Scores> scores;
00392 scores.push_back(score);
00393
00394 if(key.empty()) {
00395 TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
00396 continue;
00397 }
00398
00399
00400 if(currFirstWord == InvalidLabelId) currFirstWord = key[0];
00401 if(currKey.empty()) {
00402 currKey = key;
00403
00404 UTIL_THROW_IF2(psa == NULL, "Object not yet created");
00405 PSA::Data& d = psa->insert(key);
00406 if(d == InvalidOffT) d = fTell(ot);
00407 else {
00408 TRACE_ERR("ERROR: source phrase already inserted (A)!\nline("
00409 << lnc << "): '" << line << "\n");
00410 return false;
00411 }
00412 }
00413
00414 if(currKey != key) {
00415
00416 currKey = key;
00417
00418 cands.writeBin(ot);
00419 cands.clear();
00420
00421 if(key[0] != currFirstWord) {
00422
00423 PTF pf;
00424 if(currFirstWord >= vo.size())
00425 vo.resize(currFirstWord+1,InvalidOffT);
00426 vo[currFirstWord] = fTell(os);
00427 pf.create(*psa, os);
00428 delete psa;
00429 psa = new PSA;
00430 currFirstWord = key[0];
00431 }
00432
00433
00434 UTIL_THROW_IF2(psa == NULL, "Object not yet created");
00435 PSA::Data& d = psa->insert(key);
00436 if(d == InvalidOffT) d = fTell(ot);
00437 else {
00438 TRACE_ERR("ERROR: source phrase already inserted (A)!\nline("
00439 << lnc << "): '" << line << "\n");
00440 return false;
00441 }
00442 }
00443 cands.push_back(GenericCandidate(tgt_phrases, scores));
00444 }
00445 if (lnc == 0) {
00446 TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
00447 return false;
00448 }
00449 cands.writeBin(ot);
00450 cands.clear();
00451
00452 PTF pf;
00453 if(currFirstWord >= vo.size())
00454 vo.resize(currFirstWord+1,InvalidOffT);
00455 vo[currFirstWord] = fTell(os);
00456 pf.create(*psa,os);
00457 delete psa;
00458 psa=0;
00459
00460 fClose(os);
00461 fClose(ot);
00462 FILE *oi = fOpen(ofi.c_str(),"wb");
00463 fWriteVector(oi,vo);
00464 fClose(oi);
00465
00466 if(voc[0]) {
00467 voc[0]->Write(ofsv);
00468 delete voc[0];
00469 }
00470 if(voc[1]) {
00471 voc[1]->Write(oftv);
00472 delete voc[1];
00473 }
00474 return true;
00475 }
00476
00477 std::string
00478 LexicalReorderingTableTree::
00479 MakeCacheKey(const Phrase& f, const Phrase& e) const
00480 {
00481 std::string key;
00482 if(!m_FactorsF.empty())
00483 key += auxClearString(f.GetStringRep(m_FactorsF));
00484
00485 if(!m_FactorsE.empty()) {
00486 if(!key.empty()) {
00487 key += "|||";
00488 }
00489 key += auxClearString(e.GetStringRep(m_FactorsE));
00490 }
00491 return key;
00492 };
00493
00494 IPhrase
00495 LexicalReorderingTableTree::
00496 MakeTableKey(const Phrase& f, const Phrase& e) const
00497 {
00498 IPhrase key;
00499 std::vector<std::string> keyPart;
00500 if(!m_FactorsF.empty()) {
00501 for(size_t i = 0; i < f.GetSize(); ++i)
00502 keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
00503 auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
00504 keyPart.clear();
00505 }
00506 if(!m_FactorsE.empty()) {
00507 if(!key.empty()) key.push_back(PrefixTreeMap::MagicWord);
00508 for(size_t i = 0; i < e.GetSize(); ++i)
00509 keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
00510 auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
00511 }
00512 return key;
00513 };
00514
00515
00516 struct State {
00517 State(PPimp* t, const std::string& p)
00518 : pos(t), path(p) { }
00519
00520 PPimp* pos;
00521 std::string path;
00522 };
00523
00524 void
00525 LexicalReorderingTableTree::
00526 auxCacheForSrcPhrase(const Phrase& f)
00527 {
00528 if(m_FactorsE.empty()) {
00529
00530 Candidates cands;
00531 m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
00532 m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
00533 } else {
00534 ObjectPool<PPimp> pool;
00535 PPimp* pPos = m_Table->GetRoot();
00536
00537
00538 for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i)
00539 pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
00540
00541 if(pPos && pPos->isValid())
00542 pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
00543
00544 if(!pPos || !pPos->isValid())
00545 return;
00546
00547
00548 std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";
00549
00550 std::vector<State> stack;
00551 stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
00552 Candidates cands;
00553 while(!stack.empty()) {
00554 if(stack.back().pos->isValid()) {
00555 LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
00556 std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
00557
00558 m_Table->GetCandidates(*stack.back().pos,&cands);
00559 if(!cands.empty()) m_Cache[cache_key + auxClearString(next_path)] = cands;
00560 cands.clear();
00561 PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
00562 ++stack.back().pos->idx;
00563 stack.push_back(State(next_pos,next_path));
00564 } else stack.pop_back();
00565 }
00566 }
00567 }
00568
00569 void
00570 LexicalReorderingTableTree::
00571 Cache(const ConfusionNet& )
00572 {
00573 return;
00574 }
00575
00576 void
00577 LexicalReorderingTableTree::
00578 Cache(const Sentence& input)
00579 {
00580
00581 size_t prev_cache_size = m_Cache.size();
00582 size_t max_phrase_length = input.GetSize();
00583 for(size_t len = 0; len <= max_phrase_length; ++len) {
00584 for(size_t start = 0; start+len <= input.GetSize(); ++start) {
00585 Phrase f = input.GetSubString(Range(start, start+len));
00586 auxCacheForSrcPhrase(f);
00587 }
00588 }
00589 std::cerr << "Cached " << m_Cache.size() - prev_cache_size
00590 << " new primary reordering table keys\n";
00591 }
00592 }
00593