00001
00002
00003 #include "moses/FeatureVector.h"
00004 #include "moses/TranslationModel/PhraseDictionaryTree.h"
00005 #include <map>
00006 #include "util/check.hh"
00007 #include <sstream>
00008 #include <iostream>
00009 #include <fstream>
00010 #include <string>
00011 #include <vector>
00012
00013
00014 namespace Moses
00015 {
00016
/// Streams a vector as its element count followed by every element.
/// The count and each element are followed by one space, so {1,2,3}
/// is written as "3 1 2 3 ".
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
{
  out << x.size() << " ";

  typename std::vector<T>::const_iterator cur = x.begin();
  const typename std::vector<T>::const_iterator last = x.end();
  while (cur != last) {
    out << *cur << ' ';
    ++cur;
  }
  return out;
}
00026
00027
00028 class TgtCand
00029 {
00030 IPhrase e;
00031 Scores sc;
00032 std::string m_alignment;
00033 IPhrase fnames;
00034 std::vector<FValue> fvalues;
00035
00036 static const float SPARSE_FLAG;
00037
00038 public:
00039 TgtCand() {}
00040
00041 TgtCand(const IPhrase& a, const Scores& b , const std::string& alignment)
00042 : e(a)
00043 , sc(b)
00044 , m_alignment(alignment)
00045 {}
00046
00047 TgtCand(const IPhrase& a,const Scores& b) : e(a),sc(b) {}
00048
00049 TgtCand(FILE* f) {
00050 readBin(f);
00051 }
00052
00053
00054 void writeBin(FILE* f) const {
00055 fWriteVector(f,e);
00056
00057
00058
00059 if (fnames.size()) {
00060 Scores sc_copy(sc);
00061 sc_copy.push_back(SPARSE_FLAG);
00062 fWriteVector(f,sc_copy);
00063 fWriteVector(f,fnames);
00064 fWriteVector(f,fvalues);
00065 } else {
00066 fWriteVector(f,sc);
00067 }
00068 }
00069
00070 void readBin(FILE* f) {
00071 fReadVector(f,e);
00072 fReadVector(f,sc);
00073 if (sc.back() == 100) {
00074 sc.pop_back();
00075 fReadVector(f,fnames);
00076 fReadVector(f,fvalues);
00077 }
00078 }
00079
00080 void writeBinWithAlignment(FILE* f) const {
00081 writeBin(f);
00082 fWriteString(f, m_alignment.c_str(), m_alignment.size());
00083 }
00084
00085 void readBinWithAlignment(FILE* f) {
00086 readBin(f);
00087 fReadString(f, m_alignment);
00088 }
00089
00090 const IPhrase& GetPhrase() const {
00091 return e;
00092 }
00093 const Scores& GetScores() const {
00094 return sc;
00095 }
00096 const std::string& GetAlignment() const {
00097 return m_alignment;
00098 }
00099
00100 const IPhrase& GetFeatureNames() const {
00101 return fnames;
00102 }
00103
00104 const std::vector<FValue> GetFeatureValues() const {
00105 return fvalues;
00106 }
00107
00108 void SetFeatures(const IPhrase& names, const std::vector<FValue>& values) {
00109 CHECK(names.size() == values.size());
00110 fnames = names;
00111 fvalues = values;
00112 }
00113 };
00114
00115 const float TgtCand::SPARSE_FLAG = 100;
00116
00117
00118 class TgtCands : public std::vector<TgtCand>
00119 {
00120 typedef std::vector<TgtCand> MyBase;
00121 public:
00122 TgtCands() : MyBase() {}
00123
00124 void writeBin(FILE* f) const {
00125 unsigned s=size();
00126 fWrite(f,s);
00127 for(size_t i=0; i<s; ++i) MyBase::operator[](i).writeBin(f);
00128 }
00129
00130 void writeBinWithAlignment(FILE* f) const {
00131 unsigned s=size();
00132 fWrite(f,s);
00133 for(size_t i=0; i<s; ++i) MyBase::operator[](i).writeBinWithAlignment(f);
00134 }
00135
00136 void readBin(FILE* f) {
00137 unsigned s;
00138 fRead(f,s);
00139 resize(s);
00140 for(size_t i=0; i<s; ++i) MyBase::operator[](i).readBin(f);
00141 }
00142
00143 void readBinWithAlignment(FILE* f) {
00144 unsigned s;
00145 fRead(f,s);
00146 resize(s);
00147 for(size_t i=0; i<s; ++i) MyBase::operator[](i).readBinWithAlignment(f);
00148 }
00149 };
00150
00151
00152 PhraseDictionaryTree::PrefixPtr::operator bool() const
00153 {
00154 return imp && imp->isValid();
00155 }
00156
00157 typedef LVoc<std::string> WordVoc;
00158
00159
00160 class PDTimp {
00161 public:
00162 typedef PrefixTreeF<LabelId,OFF_T> PTF;
00163 typedef FilePtr<PTF> CPT;
00164 typedef std::vector<CPT> Data;
00165
00166
00167 Data data;
00168 std::vector<OFF_T> srcOffsets;
00169
00170 FILE *os,*ot;
00171 WordVoc sv;
00172 WordVoc tv;
00173
00174 ObjectPool<PPimp> pPool;
00175
00176
00177 bool needwordalign, haswordAlign;
00178 bool printwordalign;
00179
00180 PDTimp() : os(0),ot(0), printwordalign(false) {
00181 PTF::setDefault(InvalidOffT);
00182 }
00183 ~PDTimp() {
00184 if(os) fClose(os);
00185 if(ot) fClose(ot);
00186 FreeMemory();
00187 }
00188
00189 inline void NeedAlignmentInfo(bool a) {
00190 needwordalign=a;
00191 }
00192 inline bool NeedAlignmentInfo() {
00193 return needwordalign;
00194 };
00195 inline void HasAlignmentInfo(bool a) {
00196 haswordAlign=a;
00197 }
00198 inline bool HasAlignmentInfo() {
00199 return haswordAlign;
00200 };
00201
00202 inline void PrintWordAlignment(bool a) {
00203 printwordalign=a;
00204 };
00205 inline bool PrintWordAlignment() {
00206 return printwordalign;
00207 };
00208
00209 void FreeMemory() {
00210 for(Data::iterator i=data.begin(); i!=data.end(); ++i) (*i).free();
00211 pPool.reset();
00212 }
00213
00214 int Read(const std::string& fn);
00215
00216 void GetTargetCandidates(const IPhrase& f,TgtCands& tgtCands) {
00217 if(f.empty()) return;
00218 if(f[0]>=data.size()) return;
00219 if(!data[f[0]]) return;
00220 CHECK(data[f[0]]->findKey(f[0])<data[f[0]]->size());
00221 OFF_T tCandOffset=data[f[0]]->find(f);
00222 if(tCandOffset==InvalidOffT) return;
00223 fSeek(ot,tCandOffset);
00224
00225 if (HasAlignmentInfo())
00226 tgtCands.readBinWithAlignment(ot);
00227 else
00228 tgtCands.readBin(ot);
00229 }
00230
00231 typedef PhraseDictionaryTree::PrefixPtr PPtr;
00232
00233 void GetTargetCandidates(PPtr p,TgtCands& tgtCands) {
00234 CHECK(p);
00235 if(p.imp->isRoot()) return;
00236 OFF_T tCandOffset=p.imp->ptr()->getData(p.imp->idx);
00237 if(tCandOffset==InvalidOffT) return;
00238 fSeek(ot,tCandOffset);
00239 if (HasAlignmentInfo())
00240 tgtCands.readBinWithAlignment(ot);
00241 else
00242 tgtCands.readBin(ot);
00243 }
00244
00245 void PrintTgtCand(const TgtCands& tcands,std::ostream& out) const;
00246
00247
00248 void ConvertTgtCand(const TgtCands& tcands,std::vector<StringTgtCand>& rv,
00249 std::vector<std::string>* wa) const {
00250 for(TgtCands::const_iterator i=tcands.begin(); i!=tcands.end(); ++i) {
00251 rv.push_back(StringTgtCand());
00252 const IPhrase& iphrase=i->GetPhrase();
00253
00254 rv.back().tokens.reserve(iphrase.size());
00255 for(size_t j=0; j<iphrase.size(); ++j) {
00256 rv.back().tokens.push_back(&tv.symbol(iphrase[j]));
00257 }
00258 rv.back().scores = i->GetScores();
00259 const IPhrase& fnames = i->GetFeatureNames();
00260 for (size_t j = 0; j < fnames.size(); ++j) {
00261 rv.back().fnames.push_back(&tv.symbol(fnames[j]));
00262 }
00263 rv.back().fvalues = i->GetFeatureValues();
00264 if (wa) wa->push_back(i->GetAlignment());
00265 }
00266 }
00267
00268 PPtr GetRoot() {
00269 return PPtr(pPool.get(PPimp(0,0,1)));
00270 }
00271
00272 PPtr Extend(PPtr p,const std::string& w) {
00273 CHECK(p);
00274 if(w.empty() || w==EPSILON) return p;
00275
00276 LabelId wi=sv.index(w);
00277
00278 if(wi==InvalidLabelId) return PPtr();
00279 else if(p.imp->isRoot()) {
00280 if(wi<data.size() && data[wi]) {
00281 const void* ptr = data[wi]->findKeyPtr(wi);
00282 CHECK(ptr);
00283 return PPtr(pPool.get(PPimp(data[wi],data[wi]->findKey(wi),0)));
00284 }
00285 } else if(PTF const* nextP=p.imp->ptr()->getPtr(p.imp->idx)) {
00286 return PPtr(pPool.get(PPimp(nextP,nextP->findKey(wi),0)));
00287 }
00288
00289 return PPtr();
00290 }
00291
00292 WordVoc* ReadVoc(const std::string& filename);
00293 };
00294
00295
00297
00298
00299
00301
00302 int PDTimp::Read(const std::string& fn)
00303 {
00304 std::string ifs, ift, ifi, ifsv, iftv;
00305
00306 HasAlignmentInfo(FileExists(fn+".binphr.srctree.wa"));
00307
00308 if (NeedAlignmentInfo() && !HasAlignmentInfo()) {
00309
00310 std::stringstream strme;
00311 strme << "You are asking for word alignment but the binary phrase table does not contain any alignment info. Please check if you had generated the correct phrase table with word alignment (.wa)\n";
00312 UserMessage::Add(strme.str());
00313 return false;
00314 }
00315
00316 if (HasAlignmentInfo()) {
00317 ifs=fn+".binphr.srctree.wa";
00318 ift=fn+".binphr.tgtdata.wa";
00319 } else {
00320 ifs=fn+".binphr.srctree";
00321 ift=fn+".binphr.tgtdata";
00322 }
00323
00324 ifi=fn+".binphr.idx";
00325 ifsv=fn+".binphr.srcvoc";
00326 iftv=fn+".binphr.tgtvoc";
00327
00328 FILE *ii=fOpen(ifi.c_str(),"rb");
00329 fReadVector(ii,srcOffsets);
00330 fClose(ii);
00331
00332 os=fOpen(ifs.c_str(),"rb");
00333 ot=fOpen(ift.c_str(),"rb");
00334
00335 data.resize(srcOffsets.size());
00336 for(size_t i=0; i<data.size(); ++i)
00337 data[i]=CPT(os,srcOffsets[i]);
00338
00339 sv.Read(ifsv);
00340 tv.Read(iftv);
00341
00342 TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault()
00343 <<"\n");
00344 return 1;
00345 }
00346
00347 void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
00348 {
00349 for(size_t i=0; i<tcand.size(); ++i) {
00350
00351 Scores sc=tcand[i].GetScores();
00352 std::string trgAlign = tcand[i].GetAlignment();
00353
00354 const IPhrase& iphr=tcand[i].GetPhrase();
00355
00356 out << i << " -- " << sc << " -- ";
00357 for(size_t j=0; j<iphr.size(); ++j) out << tv.symbol(iphr[j])<<" ";
00358 out<< " -- " << trgAlign;
00359 out << std::endl;
00360 }
00361 }
00362
00364
00365
00366
00368
00369 PhraseDictionaryTree::PhraseDictionaryTree()
00370 : imp(new PDTimp)
00371 {
00372 if(sizeof(OFF_T)!=8) {
00373 TRACE_ERR("ERROR: size of type 'OFF_T' has to be 64 bit!\n"
00374 "In gcc, use compiler settings '-D_FILE_OFFSET_BITS=64 -D_LARGE_FILES'\n"
00375 " -> abort \n\n");
00376 abort();
00377 }
00378 }
00379
00380 PhraseDictionaryTree::~PhraseDictionaryTree()
00381 {
00382 delete imp;
00383 }
00384
00385 void PhraseDictionaryTree::NeedAlignmentInfo(bool a)
00386 {
00387 imp->NeedAlignmentInfo(a);
00388 };
00389 void PhraseDictionaryTree::PrintWordAlignment(bool a)
00390 {
00391 imp->PrintWordAlignment(a);
00392 };
00393 bool PhraseDictionaryTree::PrintWordAlignment()
00394 {
00395 return imp->PrintWordAlignment();
00396 };
00397
00398 void PhraseDictionaryTree::FreeMemory() const
00399 {
00400 imp->FreeMemory();
00401 }
00402
00403
00404 void PhraseDictionaryTree::
00405 GetTargetCandidates(const std::vector<std::string>& src,
00406 std::vector<StringTgtCand>& rv) const
00407 {
00408 IPhrase f(src.size());
00409 for(size_t i=0; i<src.size(); ++i) {
00410 f[i]=imp->sv.index(src[i]);
00411 if(f[i]==InvalidLabelId) return;
00412 }
00413
00414 TgtCands tgtCands;
00415 imp->GetTargetCandidates(f,tgtCands);
00416 imp->ConvertTgtCand(tgtCands,rv,NULL);
00417 }
00418
00419 void PhraseDictionaryTree::
00420 GetTargetCandidates(const std::vector<std::string>& src,
00421 std::vector<StringTgtCand>& rv,
00422 std::vector<std::string>& wa) const
00423 {
00424 IPhrase f(src.size());
00425 for(size_t i=0; i<src.size(); ++i) {
00426 f[i]=imp->sv.index(src[i]);
00427 if(f[i]==InvalidLabelId) return;
00428 }
00429
00430 TgtCands tgtCands;
00431 imp->GetTargetCandidates(f,tgtCands);
00432 imp->ConvertTgtCand(tgtCands,rv,&wa);
00433 }
00434
00435
00436 void PhraseDictionaryTree::
00437 PrintTargetCandidates(const std::vector<std::string>& src,
00438 std::ostream& out) const
00439 {
00440 IPhrase f(src.size());
00441 for(size_t i=0; i<src.size(); ++i) {
00442 f[i]=imp->sv.index(src[i]);
00443 if(f[i]==InvalidLabelId) {
00444 TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '"
00445 <<src[i]<<"'\n");
00446 return;
00447 }
00448 }
00449
00450 TgtCands tcand;
00451 imp->GetTargetCandidates(f,tcand);
00452 imp->PrintTgtCand(tcand,out);
00453 }
00454
00455 int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
00456 {
00457 std::string line;
00458 size_t count = 0;
00459
00460 std::string ofn(out+".binphr.srctree"),
00461 oft(out+".binphr.tgtdata"),
00462 ofi(out+".binphr.idx"),
00463 ofsv(out+".binphr.srcvoc"),
00464 oftv(out+".binphr.tgtvoc");
00465
00466 if (PrintWordAlignment()) {
00467 ofn+=".wa";
00468 oft+=".wa";
00469 }
00470
00471 FILE *os=fOpen(ofn.c_str(),"wb"),
00472 *ot=fOpen(oft.c_str(),"wb");
00473
00474 typedef PrefixTreeSA<LabelId,OFF_T> PSA;
00475 PSA *psa=new PSA;
00476 PSA::setDefault(InvalidOffT);
00477
00478 LabelId currFirstWord=InvalidLabelId;
00479 IPhrase currF;
00480 TgtCands tgtCands;
00481 std::vector<OFF_T> vo;
00482 size_t lnc=0;
00483 size_t numElement = NOT_FOUND;
00484 size_t missingAlignmentCount = 0;
00485
00486 while(getline(inFile, line)) {
00487 ++lnc;
00488
00489 std::vector<std::string> tokens = TokenizeMultiCharSeparator( line , "|||" );
00490
00491 if (numElement == NOT_FOUND) {
00492
00493 numElement = tokens.size();
00494 CHECK(numElement >= 3);
00495 }
00496
00497 if (tokens.size() != numElement) {
00498 std::stringstream strme;
00499 strme << "Syntax error at line " << lnc << " : " << line;
00500 UserMessage::Add(strme.str());
00501 abort();
00502 }
00503
00504 const std::string &sourcePhraseString =tokens[0]
00505 ,&targetPhraseString=tokens[1]
00506 ,&scoreString = tokens[2];
00507 const std::string empty;
00508 const std::string &alignmentString = PrintWordAlignment() ? tokens[3] : empty;
00509 const std::string sparseFeatureString = tokens.size() > 5 ? tokens[5] : empty;
00510 IPhrase f,e;
00511 Scores sc;
00512
00513 if (PrintWordAlignment() && alignmentString == " ") ++missingAlignmentCount;
00514
00515 std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
00516 for (size_t i = 0 ; i < wordVec.size() ; ++i)
00517 f.push_back(imp->sv.add(wordVec[i]));
00518
00519 wordVec = Tokenize(targetPhraseString);
00520 for (size_t i = 0 ; i < wordVec.size() ; ++i)
00521 e.push_back(imp->tv.add(wordVec[i]));
00522
00523
00524
00525 std::vector<float> scoreVector = Tokenize<float>(scoreString);
00526 for (size_t i = 0 ; i < scoreVector.size() ; ++i) {
00527 float tmp = scoreVector[i];
00528 sc.push_back(((tmp>0.0)?tmp:(float)1.0e-38));
00529 }
00530
00531 if(f.empty()) {
00532 TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
00533 continue;
00534 }
00535
00536 if(currFirstWord==InvalidLabelId) currFirstWord=f[0];
00537 if(currF.empty()) {
00538 ++count;
00539 currF=f;
00540
00541 CHECK(psa);
00542 PSA::Data& d=psa->insert(f);
00543 if(d==InvalidOffT) d=fTell(ot);
00544 else {
00545 TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '"
00546 <<line<<"'\nf: "<<f<<"\n");
00547 abort();
00548 }
00549 }
00550
00551 IPhrase fnames;
00552 std::vector<FValue> fvalues;
00553 if (!sparseFeatureString.empty()) {
00554 std::vector<std::string> sparseTokens = Tokenize(sparseFeatureString);
00555 if (sparseTokens.size() % 2 != 0) {
00556 TRACE_ERR("ERROR: incorrectly formatted sparse feature string: " <<
00557 sparseFeatureString << std::endl);
00558 abort();
00559 }
00560 for (size_t i = 0; i < sparseTokens.size(); i+=2) {
00561 fnames.push_back(imp->tv.add(sparseTokens[i]));
00562 fvalues.push_back(Scan<FValue>(sparseTokens[i+1]));
00563 }
00564 }
00565
00566 if(currF!=f) {
00567
00568 currF=f;
00569 if (PrintWordAlignment())
00570 tgtCands.writeBinWithAlignment(ot);
00571 else
00572 tgtCands.writeBin(ot);
00573 tgtCands.clear();
00574
00575 if(++count%10000==0) {
00576 TRACE_ERR(".");
00577 if(count%500000==0) TRACE_ERR("[phrase:"<<count<<"]\n");
00578 }
00579
00580 if(f[0]!=currFirstWord) {
00581
00582 PTF pf;
00583 if(currFirstWord>=vo.size())
00584 vo.resize(currFirstWord+1,InvalidOffT);
00585 vo[currFirstWord]=fTell(os);
00586 pf.create(*psa,os);
00587
00588 delete psa;
00589 psa=new PSA;
00590 currFirstWord=f[0];
00591 }
00592
00593
00594 CHECK(psa);
00595 PSA::Data& d=psa->insert(f);
00596 if(d==InvalidOffT) d=fTell(ot);
00597 else {
00598 TRACE_ERR("ERROR: xsource phrase already inserted (B)!\nline(" << lnc << "): '"
00599 <<line<<"'\nf: "<<f<<"\n");
00600 abort();
00601 }
00602 }
00603 tgtCands.push_back(TgtCand(e,sc, alignmentString));
00604 CHECK(currFirstWord!=InvalidLabelId);
00605 tgtCands.back().SetFeatures(fnames, fvalues);
00606 }
00607 if (PrintWordAlignment())
00608 tgtCands.writeBinWithAlignment(ot);
00609 else
00610 tgtCands.writeBin(ot);
00611 tgtCands.clear();
00612
00613 PTF pf;
00614 if(currFirstWord>=vo.size()) vo.resize(currFirstWord+1,InvalidOffT);
00615 vo[currFirstWord]=fTell(os);
00616 pf.create(*psa,os);
00617 delete psa;
00618 psa=0;
00619
00620 TRACE_ERR("distinct source phrases: "<<count
00621 <<" distinct first words of source phrases: "<<vo.size()
00622 <<" number of phrase pairs (line count): "<<lnc
00623 <<"\n");
00624
00625 if ( PrintWordAlignment()) {
00626 TRACE_ERR("Count of lines with missing alignments: " <<
00627 missingAlignmentCount << "/" << lnc << "\n");
00628 }
00629
00630 fClose(os);
00631 fClose(ot);
00632
00633 std::vector<size_t> inv;
00634 for(size_t i=0; i<vo.size(); ++i)
00635 if(vo[i]==InvalidOffT) inv.push_back(i);
00636
00637 if(inv.size()) {
00638 TRACE_ERR("WARNING: there are src voc entries with no phrase "
00639 "translation: count "<<inv.size()<<"\n"
00640 "There exists phrase translations for "<<vo.size()-inv.size()
00641 <<" entries\n");
00642 }
00643
00644 FILE *oi=fOpen(ofi.c_str(),"wb");
00645 fWriteVector(oi,vo);
00646 fClose(oi);
00647
00648 imp->sv.Write(ofsv);
00649 imp->tv.Write(oftv);
00650
00651 return 1;
00652 }
00653
00654
00655 int PhraseDictionaryTree::Read(const std::string& fn)
00656 {
00657 TRACE_ERR("size of OFF_T "<<sizeof(OFF_T)<<"\n");
00658 return imp->Read(fn);
00659 }
00660
00661
00662 PhraseDictionaryTree::PrefixPtr PhraseDictionaryTree::GetRoot() const
00663 {
00664 return imp->GetRoot();
00665 }
00666
00667 PhraseDictionaryTree::PrefixPtr
00668 PhraseDictionaryTree::Extend(PrefixPtr p, const std::string& w) const
00669 {
00670 return imp->Extend(p,w);
00671 }
00672
00673 void PhraseDictionaryTree::PrintTargetCandidates(PrefixPtr p,std::ostream& out) const
00674 {
00675
00676 TgtCands tcand;
00677 imp->GetTargetCandidates(p,tcand);
00678 out<<"there are "<<tcand.size()<<" target candidates\n";
00679 imp->PrintTgtCand(tcand,out);
00680 }
00681
00682 void PhraseDictionaryTree::
00683 GetTargetCandidates(PrefixPtr p,
00684 std::vector<StringTgtCand>& rv) const
00685 {
00686 TgtCands tcands;
00687 imp->GetTargetCandidates(p,tcands);
00688 imp->ConvertTgtCand(tcands,rv,NULL);
00689 }
00690
00691 void PhraseDictionaryTree::
00692 GetTargetCandidates(PrefixPtr p,
00693 std::vector<StringTgtCand>& rv,
00694 std::vector<std::string>& wa) const
00695 {
00696 TgtCands tcands;
00697 imp->GetTargetCandidates(p,tcands);
00698 imp->ConvertTgtCand(tcands,rv,&wa);
00699 }
00700
00701 }
00702