00001 #include "PDTAimp.h"
00002
00003 namespace Moses
00004 {
00005
00006 PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
00007 : m_dict(0),
00008 m_obj(p),
00009 useCache(1),
00010 totalE(0),
00011 distinctE(0)
00012 {
00013 m_numInputScores = 0;
00014 m_inputFeature = InputFeature::InstancePtr();
00015
00016 if (m_inputFeature) {
00017 const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0];
00018 if (firstPt == m_obj) {
00019 m_numInputScores = m_inputFeature->GetNumScoreComponents();
00020 }
00021 }
00022 }
00023
00024 PDTAimp::~PDTAimp()
00025 {
00026 CleanUp();
00027 delete m_dict;
00028
00029 if (StaticData::Instance().GetVerboseLevel() >= 2) {
00030
00031 TRACE_ERR("tgt candidates stats: total="<<totalE<<"; distinct="
00032 <<distinctE<<" ("<<distinctE/(0.01*totalE)<<"); duplicates="
00033 <<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
00034 <<")\n");
00035
00036 TRACE_ERR("\npath statistics\n");
00037
00038 if(path1Best.size()) {
00039 TRACE_ERR("1-best: ");
00040 std::copy(path1Best.begin()+1,path1Best.end(),
00041 std::ostream_iterator<size_t>(std::cerr," \t"));
00042 TRACE_ERR("\n");
00043 }
00044 if(pathCN.size()) {
00045 TRACE_ERR("CN (full): ");
00046 std::transform(pathCN.begin()+1
00047 ,pathCN.end()
00048 ,std::ostream_iterator<double>(std::cerr," \t")
00049 ,Exp);
00050 TRACE_ERR("\n");
00051 }
00052 if(pathExplored.size()) {
00053 TRACE_ERR("CN (explored): ");
00054 std::copy(pathExplored.begin()+1,pathExplored.end(),
00055 std::ostream_iterator<size_t>(std::cerr," \t"));
00056 TRACE_ERR("\n");
00057 }
00058 }
00059
00060 }
00061
00062 void PDTAimp::CleanUp()
00063 {
00064 assert(m_dict);
00065 m_dict->FreeMemory();
00066
00067 m_tgtColls.clear();
00068 m_cache.clear();
00069 m_rangeCache.clear();
00070 uniqSrcPhr.clear();
00071 }
00072
00073 TargetPhraseCollectionWithSourcePhrase::shared_ptr
00074 PDTAimp::GetTargetPhraseCollection(Phrase const &src) const
00075 {
00076
00077 assert(m_dict);
00078
00079 TargetPhraseCollectionWithSourcePhrase::shared_ptr ret;
00080 if(src.GetSize()==0) return ret;
00081
00082 std::pair<MapSrc2Tgt::iterator,bool> piter;
00083 if(useCache) {
00084 piter=m_cache.insert(std::make_pair(src, ret));
00085 if(!piter.second) return piter.first->second;
00086 } else if (m_cache.size()) {
00087 MapSrc2Tgt::const_iterator i=m_cache.find(src);
00088 return (i!=m_cache.end() ? i->second : ret);
00089 }
00090
00091 std::vector<std::string> srcString(src.GetSize());
00092
00093 for(size_t i=0; i<srcString.size(); ++i) {
00094 Factors2String(src.GetWord(i),srcString[i]);
00095 }
00096
00097
00098 std::vector<StringTgtCand> cands;
00099 std::vector<std::string> wacands;
00100 m_dict->GetTargetCandidates(srcString,cands,wacands);
00101 if(cands.empty()) {
00102 return ret;
00103 }
00104
00105
00106 std::vector<float> weights = StaticData::Instance().GetWeights(m_obj);
00107
00108 std::vector<TargetPhrase> tCands;
00109 tCands.reserve(cands.size());
00110
00111 std::vector<std::pair<float,size_t> > costs;
00112 costs.reserve(cands.size());
00113
00114 std::vector<Phrase> sourcePhrases;
00115 sourcePhrases.reserve(cands.size());
00116
00117
00118
00119 std::string fd = m_obj->options()->output.factor_delimiter;
00120 for(size_t i=0; i<cands.size(); ++i) {
00121 TargetPhrase targetPhrase(m_obj);
00122
00123 StringTgtCand::Tokens const& factorStrings=cands[i].tokens;
00124 Scores const& probVector=cands[i].scores;
00125
00126 std::vector<float> scoreVector(probVector.size());
00127 std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
00128 TransformScore);
00129 std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
00130 FloorScore);
00131
00132
00133
00134 for (size_t j = 0; j < cands[i].fnames.size(); ++j) {
00135 targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]);
00136 }
00137
00138 CreateTargetPhrase(targetPhrase,factorStrings, fd, scoreVector, Scores(0),
00139 &wacands[i], &src);
00140
00141 costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
00142 tCands.push_back(targetPhrase);
00143
00144 sourcePhrases.push_back(src);
00145 }
00146
00147 ret = PruneTargetCandidates(tCands,costs, sourcePhrases);
00148 if(ret->IsEmpty()) {
00149 ret.reset();
00150 } else {
00151 if(useCache) piter.first->second = ret;
00152 m_tgtColls.push_back(ret);
00153 }
00154 return ret;
00155
00156 }
00157
00158 void PDTAimp::Create(const std::vector<FactorType> &input
00159 , const std::vector<FactorType> &output
00160 , const std::string &filePath
00161 , const std::vector<float> &weight
00162 )
00163 {
00164
00165
00166 m_dict=new PhraseDictionaryTree();
00167 m_input=input;
00168 m_output=output;
00169
00170 const StaticData &staticData = StaticData::Instance();
00171 m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo());
00172
00173 std::string binFname=filePath+".binphr.idx";
00174 if(!FileExists(binFname.c_str())) {
00175 UTIL_THROW2( "bin ttable does not exist");
00176
00177
00178
00179 }
00180 VERBOSE(1,"reading bin ttable\n");
00181
00182 bool res=m_dict->Read(filePath);
00183 if (!res) {
00184 std::cerr << "bin ttable was read in a wrong way\n";
00185 exit(1);
00186 }
00187 }
00188
00189
// Pre-compute, for every coverable span of the confusion net, the pruned
// collection of target phrase candidates, storing results in m_rangeCache.
// Also accumulates path statistics (path1Best, pathCN, pathExplored) used by
// the destructor's verbose report.
void PDTAimp::CacheSource(ConfusionNet const& src)
{
  assert(m_dict);
  const size_t srcSize=src.GetSize();

  // exploredPaths[len]: number of spans of length len actually expanded below.
  // exPathsD[len]: log of the total number of CN paths of length len
  //               (-1.0 marks "not yet set").
  std::vector<size_t> exploredPaths(srcSize+1,0);
  std::vector<double> exPathsD(srcSize+1,-1.0);

  // Depth (number of word alternatives) of each CN column.
  std::vector<size_t> cnDepths(srcSize,0);
  for(size_t i=0; i<srcSize; ++i) cnDepths[i]=src[i].size();

  // Sum, in log space, the number of paths over every span of each length.
  for(size_t len=1; len<=srcSize; ++len)
    for(size_t i=0; i<=srcSize-len; ++i) {
      double pd=0.0;
      for(size_t k=i; k<i+len; ++k) pd+=log(1.0*cnDepths[k]);
      exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
    }

  // Fold this sentence's statistics into the session-wide accumulators.
  if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
  for(size_t len=1; len<=srcSize; ++len)
    pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];

  if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
  for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1;

  // Per-sentence trace of the (log-space) path counts, converted via Exp.
  if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
    TRACE_ERR("path stats for current CN: \nCN (full): ");
    std::transform(exPathsD.begin()+1
                   ,exPathsD.end()
                   ,std::ostream_iterator<double>(std::cerr," ")
                   ,Exp);
    TRACE_ERR("\n");
  }

  // For each covered range: map from a distinct target token sequence to the
  // best scores found for it.
  typedef std::map<StringTgtCand::Tokens,TScores> E2Costs;

  std::map<Range,E2Costs> cov2cand;
  std::vector<State> stack;
  // Seed the depth-first search with one empty-span state per start position.
  for(Position i=0 ; i < srcSize ; ++i)
    stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));

  std::vector<float> weightTrans = StaticData::Instance().GetWeights(m_obj);
  std::vector<float> weightInput = StaticData::Instance().GetWeights(m_inputFeature);
  float weightWP = StaticData::Instance().GetWeightWordPenalty();

  // DFS: each state is a phrase-tree prefix covering a CN span; extend it by
  // every alternative in the next column.
  while(!stack.empty()) {
    State curr(stack.back());
    stack.pop_back();

    UTIL_THROW_IF2(curr.end() >= srcSize, "Error");
    const ConfusionNet::Column &currCol=src[curr.end()];

    for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
      const Word& w=currCol[colidx].first;
      std::string s;
      Factors2String(w,s);
      bool isEpsilon=(s=="" || s==EPSILON);

      UTIL_THROW_IF2(currCol[colidx].second.denseScores.size() < m_numInputScores,
                     "Incorrect number of input scores");

      // A leading epsilon on an empty span is only allowed at position 0.
      if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

      // Epsilon consumes CN input but does not advance in the phrase tree.
      PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));

      if(nextP) {
        Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));

        // Accumulate the dense CN input scores along this path.
        float inputScoreSum = 0;
        std::vector<float> newInputScores(m_numInputScores,0.0);
        if (m_numInputScores) {
          std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(),
                         curr.GetScores().begin(),
                         newInputScores.begin(),
                         std::plus<float>());

          inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
        }

        Phrase newSrc(curr.src);
        if(!isEpsilon) newSrc.AddWord(w);
        // Push the extended state unless we have reached the net's end or the
        // path has become impossibly unlikely.
        if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE) {
          stack.push_back(State(newRange,nextP,newInputScores));
          stack.back().src=newSrc;
        }

        std::vector<StringTgtCand> tcands;

        m_dict->GetTargetCandidates(nextP,tcands);

        if(newRange.second>=exploredPaths.size()+newRange.first)
          exploredPaths.resize(newRange.second-newRange.first+1,0);
        ++exploredPaths[newRange.second-newRange.first];

        totalE+=tcands.size();

        if(tcands.size()) {
          E2Costs& e2costs=cov2cand[newRange];
          // Intern the source phrase so all candidates share one Phrase object.
          Phrase const* srcPtr=uniqSrcPhr(newSrc);
          for(size_t i=0; i<tcands.size(); ++i) {

            std::vector<float> transcores(m_obj->GetNumScoreComponents());
            UTIL_THROW_IF2(transcores.size() != weightTrans.size(),
                           "Incorrect number of translation scores");

            // Probabilities -> log scores.
            std::transform(tcands[i].scores.begin()
                           ,tcands[i].scores.end()
                           ,transcores.begin()
                           ,TransformScore);

            // Weighted translation score ...
            float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f);

            // ... plus weighted input score ...
            score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f);

            // ... minus word penalty per target token.
            score-=tcands[i].tokens.size() * weightWP;

            std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));

            if(p.second) ++distinctE;

            // Keep only the best-scoring derivation per target string.
            TScores & scores=p.first->second;
            if(p.second || scores.total<score) {
              scores.total=score;
              scores.transScore=transcores;
              scores.inputScores=newInputScores;
              scores.src=srcPtr;
            }
          }
        }
      }
    }
  }

  if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
    TRACE_ERR("CN (explored): ");
    std::copy(exploredPaths.begin()+1,exploredPaths.end(),
              std::ostream_iterator<size_t>(std::cerr," "));
    TRACE_ERR("\n");
  }

  if(pathExplored.size()<exploredPaths.size())
    pathExplored.resize(exploredPaths.size(),0);
  for(size_t len=1; len<=srcSize; ++len)
    pathExplored[len]+=exploredPaths[len];

  // Build the per-range cache of pruned target phrase collections.
  m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize()));

  for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
    assert(i->first.first<m_rangeCache.size());
    assert(i->first.second>0);
    assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
    assert(m_rangeCache[i->first.first][i->first.second-1]==0);

    std::vector<TargetPhrase> tCands;
    tCands.reserve(i->second.size());

    // (negated future score, index into tCands) pairs used for pruning.
    std::vector<std::pair<float,size_t> > costs;
    costs.reserve(i->second.size());

    std::vector<Phrase> sourcePhrases;
    sourcePhrases.reserve(i->second.size());

    for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
      TScores const & scores=j->second;
      TargetPhrase targetPhrase(m_obj);
      CreateTargetPhrase(targetPhrase
                         , j ->first
                         , m_obj->options()->output.factor_delimiter
                         , scores.transScore
                         , scores.inputScores
                         , NULL
                         , scores.src);
      costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
      tCands.push_back(targetPhrase);

      sourcePhrases.push_back(*scores.src);

    }

    TargetPhraseCollectionWithSourcePhrase::shared_ptr
    rv = PruneTargetCandidates(tCands, costs, sourcePhrases);

    if(rv->IsEmpty())
      rv.reset();
    else {
      // Ranges are cached at [start][end-1].
      m_rangeCache[i->first.first][i->first.second-1]=rv;
      m_tgtColls.push_back(rv);
    }
  }

  // Lookup structures are no longer needed once the range cache is built.
  m_dict->FreeMemory();
}
00406
00407 void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
00408 StringTgtCand::Tokens const& factorStrings,
00409 std::string const& factorDelimiter,
00410 Scores const& transVector,
00411 Scores const& inputVector,
00412 const std::string *alignmentString,
00413 Phrase const* srcPtr) const
00414 {
00415 FactorCollection &factorCollection = FactorCollection::Instance();
00416
00417 for(size_t k=0; k<factorStrings.size(); ++k) {
00418 util::TokenIter<util::MultiCharacter, false>
00419 word(*factorStrings[k], factorDelimiter);
00420 Word& w=targetPhrase.AddWord();
00421 for(size_t l=0; l<m_output.size(); ++l, ++word) {
00422 w[m_output[l]]= factorCollection.AddFactor(*word);
00423 }
00424 }
00425
00426 if (alignmentString) {
00427 targetPhrase.SetAlignmentInfo(*alignmentString);
00428 }
00429
00430 if (m_numInputScores) {
00431 targetPhrase.GetScoreBreakdown().Assign(m_inputFeature, inputVector);
00432 }
00433
00434 targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
00435 targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
00436 }
00437
00438 TargetPhraseCollectionWithSourcePhrase::shared_ptr
00439 PDTAimp::PruneTargetCandidates
00440 (const std::vector<TargetPhrase> & tCands,
00441 std::vector<std::pair<float,size_t> >& costs,
00442 const std::vector<Phrase> &sourcePhrases) const
00443 {
00444
00445 UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(),
00446 "Number of target phrases must equal number of source phrases");
00447
00448 TargetPhraseCollectionWithSourcePhrase::shared_ptr rv;
00449 rv.reset(new TargetPhraseCollectionWithSourcePhrase);
00450
00451
00452
00453 std::vector<std::pair<float,size_t> >::iterator nth =
00454 costs.begin() + ((m_obj->m_tableLimit>0 &&
00455 m_obj->m_tableLimit < costs.size()) ?
00456 m_obj->m_tableLimit : costs.size());
00457
00458
00459 NTH_ELEMENT3(costs.begin(),nth ,costs.end());
00460
00461
00462 for(std::vector<std::pair<float,size_t> >::iterator
00463 it = costs.begin(); it != nth; ++it) {
00464 size_t ind = it->second;
00465 TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]);
00466 const Phrase &sourcePhrase = sourcePhrases[ind];
00467 rv->Add(targetPhrase, sourcePhrase);
00468
00469 }
00470
00471 return rv;
00472 }
00473
00474 }
00475
00476