00001 #include "PDTAimp.h"
00002
00003 namespace Moses
00004 {
00005
00006 PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
00007 : m_dict(0),
00008 m_obj(p),
00009 useCache(1),
00010 totalE(0),
00011 distinctE(0)
00012 {
00013 m_numInputScores = 0;
00014 m_inputFeature = InputFeature::InstancePtr();
00015
00016 if (m_inputFeature) {
00017 const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0];
00018 if (firstPt == m_obj) {
00019 m_numInputScores = m_inputFeature->GetNumScoreComponents();
00020 }
00021 }
00022 }
00023
00024 PDTAimp::~PDTAimp()
00025 {
00026 CleanUp();
00027 delete m_dict;
00028
00029 if (StaticData::Instance().GetVerboseLevel() >= 2) {
00030
00031 TRACE_ERR("tgt candidates stats: total="<<totalE<<"; distinct="
00032 <<distinctE<<" ("<<distinctE/(0.01*totalE)<<"); duplicates="
00033 <<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
00034 <<")\n");
00035
00036 TRACE_ERR("\npath statistics\n");
00037
00038 if(path1Best.size()) {
00039 TRACE_ERR("1-best: ");
00040 std::copy(path1Best.begin()+1,path1Best.end(),
00041 std::ostream_iterator<size_t>(std::cerr," \t"));
00042 TRACE_ERR("\n");
00043 }
00044 if(pathCN.size()) {
00045 TRACE_ERR("CN (full): ");
00046 std::transform(pathCN.begin()+1
00047 ,pathCN.end()
00048 ,std::ostream_iterator<double>(std::cerr," \t")
00049 ,Exp);
00050 TRACE_ERR("\n");
00051 }
00052 if(pathExplored.size()) {
00053 TRACE_ERR("CN (explored): ");
00054 std::copy(pathExplored.begin()+1,pathExplored.end(),
00055 std::ostream_iterator<size_t>(std::cerr," \t"));
00056 TRACE_ERR("\n");
00057 }
00058 }
00059
00060 }
00061
00062 void PDTAimp::CleanUp()
00063 {
00064 assert(m_dict);
00065 m_dict->FreeMemory();
00066
00067 m_tgtColls.clear();
00068 m_cache.clear();
00069 m_rangeCache.clear();
00070 uniqSrcPhr.clear();
00071 }
00072
00073 TargetPhraseCollectionWithSourcePhrase::shared_ptr
00074 PDTAimp::GetTargetPhraseCollection(Phrase const &src) const
00075 {
00076
00077 assert(m_dict);
00078
00079 TargetPhraseCollectionWithSourcePhrase::shared_ptr ret;
00080 if(src.GetSize()==0) return ret;
00081
00082 std::pair<MapSrc2Tgt::iterator,bool> piter;
00083 if(useCache) {
00084 piter=m_cache.insert(std::make_pair(src, ret));
00085 if(!piter.second) return piter.first->second;
00086 } else if (m_cache.size()) {
00087 MapSrc2Tgt::const_iterator i=m_cache.find(src);
00088 return (i!=m_cache.end() ? i->second : ret);
00089 }
00090
00091 std::vector<std::string> srcString(src.GetSize());
00092
00093 for(size_t i=0; i<srcString.size(); ++i) {
00094 Factors2String(src.GetWord(i),srcString[i]);
00095 }
00096
00097
00098 std::vector<StringTgtCand> cands;
00099 std::vector<std::string> wacands;
00100 m_dict->GetTargetCandidates(srcString,cands,wacands);
00101 if(cands.empty()) {
00102 return ret;
00103 }
00104
00105
00106 std::vector<float> weights = StaticData::Instance().GetWeights(m_obj);
00107
00108 std::vector<TargetPhrase> tCands;
00109 tCands.reserve(cands.size());
00110
00111 std::vector<std::pair<float,size_t> > costs;
00112 costs.reserve(cands.size());
00113
00114 std::vector<Phrase> sourcePhrases;
00115 sourcePhrases.reserve(cands.size());
00116
00117
00118
00119 std::string fd = m_obj->options()->output.factor_delimiter;
00120 for(size_t i=0; i<cands.size(); ++i) {
00121 TargetPhrase targetPhrase(m_obj);
00122
00123 StringTgtCand::Tokens const& factorStrings=cands[i].tokens;
00124 Scores const& probVector=cands[i].scores;
00125
00126 std::vector<float> scoreVector(probVector.size());
00127 std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
00128 TransformScore);
00129 std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
00130 FloorScore);
00131
00132
00133
00134 for (size_t j = 0; j < cands[i].fnames.size(); ++j) {
00135 targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]);
00136 }
00137
00138 CreateTargetPhrase(targetPhrase,factorStrings, fd, scoreVector, Scores(0),
00139 &wacands[i], &src);
00140
00141 costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
00142 tCands.push_back(targetPhrase);
00143
00144 sourcePhrases.push_back(src);
00145 }
00146
00147 ret = PruneTargetCandidates(tCands,costs, sourcePhrases);
00148 if(ret->IsEmpty()) {
00149 ret.reset();
00150 } else {
00151 if(useCache) piter.first->second = ret;
00152 m_tgtColls.push_back(ret);
00153 }
00154 return ret;
00155
00156 }
00157
00158 void PDTAimp::Create(const std::vector<FactorType> &input
00159 , const std::vector<FactorType> &output
00160 , const std::string &filePath
00161 , const std::vector<float> &weight
00162 )
00163 {
00164
00165
00166 m_dict=new PhraseDictionaryTree();
00167 m_input=input;
00168 m_output=output;
00169
00170 const StaticData &staticData = StaticData::Instance();
00171 m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo());
00172
00173 std::string binFname=filePath+".binphr.idx";
00174 if(!FileExists(binFname.c_str())) {
00175 UTIL_THROW2( "bin ttable does not exist");
00176
00177
00178
00179 }
00180 VERBOSE(1,"reading bin ttable\n");
00181
00182 bool res=m_dict->Read(filePath);
00183 if (!res) {
00184 std::cerr << "bin ttable was read in a wrong way\n";
00185 exit(1);
00186 }
00187 }
00188
00189
// Pre-compute, for every coverable span of the confusion net, the pruned
// collection of target phrase candidates, storing results in m_rangeCache.
// Also accumulates path statistics (path1Best, pathCN, pathExplored) used by
// the destructor's verbose report.
void PDTAimp::CacheSource(ConfusionNet const& src)
{
  assert(m_dict);
  const size_t srcSize=src.GetSize();

  // exploredPaths[len]: number of spans of length len actually expanded below.
  // exPathsD[len]: log of the total number of CN paths of length len
  //               (-1.0 marks "not yet set").
  std::vector<size_t> exploredPaths(srcSize+1,0);
  std::vector<double> exPathsD(srcSize+1,-1.0);

  // Depth (number of word alternatives) of each CN column.
  std::vector<size_t> cnDepths(srcSize,0);
  for(size_t i=0; i<srcSize; ++i) cnDepths[i]=src[i].size();

  // Sum, in log space, the number of paths over every span of each length.
  for(size_t len=1; len<=srcSize; ++len)
    for(size_t i=0; i<=srcSize-len; ++i) {
      double pd=0.0;
      for(size_t k=i; k<i+len; ++k) pd+=log(1.0*cnDepths[k]);
      exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
    }

  // Fold this sentence's statistics into the session-wide accumulators.
  if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
  for(size_t len=1; len<=srcSize; ++len)
    pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];

  if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
  for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1;

  // Per-sentence trace of the (log-space) path counts, converted via Exp.
  if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
    TRACE_ERR("path stats for current CN: \nCN (full): ");
    std::transform(exPathsD.begin()+1
                   ,exPathsD.end()
                   ,std::ostream_iterator<double>(std::cerr," ")
                   ,Exp);
    TRACE_ERR("\n");
  }

  // For each covered range: map from a distinct target token sequence to the
  // best scores found for it.
  typedef std::map<StringTgtCand::Tokens,TScores> E2Costs;

  std::map<Range,E2Costs> cov2cand;
  std::vector<State> stack;
  // Seed the depth-first search with one empty-span state per start position.
  for(Position i=0 ; i < srcSize ; ++i)
    stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));

  std::vector<float> weightTrans = StaticData::Instance().GetWeights(m_obj);
  std::vector<float> weightInput = StaticData::Instance().GetWeights(m_inputFeature);
  float weightWP = StaticData::Instance().GetWeightWordPenalty();

  // DFS: each state is a phrase-tree prefix covering a CN span; extend it by
  // every alternative in the next column.
  while(!stack.empty()) {
    State curr(stack.back());
    stack.pop_back();

    UTIL_THROW_IF2(curr.end() >= srcSize, "Error");
    const ConfusionNet::Column &currCol=src[curr.end()];

    for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
      const Word& w=currCol[colidx].first;
      std::string s;
      Factors2String(w,s);
      bool isEpsilon=(s=="" || s==EPSILON);

      UTIL_THROW_IF2(currCol[colidx].second.denseScores.size() < m_numInputScores,
                     "Incorrect number of input scores");

      // A leading epsilon on an empty span is only allowed at position 0.
      if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

      // Epsilon consumes CN input but does not advance in the phrase tree.
      PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));

      if(nextP) {
        Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));

        // Accumulate the dense CN input scores along this path.
        float inputScoreSum = 0;
        std::vector<float> newInputScores(m_numInputScores,0.0);
        if (m_numInputScores) {
          std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(),
                         curr.GetScores().begin(),
                         newInputScores.begin(),
                         std::plus<float>());

          inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
        }

        Phrase newSrc(curr.src);
        if(!isEpsilon) newSrc.AddWord(w);
        // Push the extended state unless we have reached the net's end or the
        // path has become impossibly unlikely.
        if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE) {
          stack.push_back(State(newRange,nextP,newInputScores));
          stack.back().src=newSrc;
        }

        std::vector<StringTgtCand> tcands;

        m_dict->GetTargetCandidates(nextP,tcands);

        if(newRange.second>=exploredPaths.size()+newRange.first)
          exploredPaths.resize(newRange.second-newRange.first+1,0);
        ++exploredPaths[newRange.second-newRange.first];

        totalE+=tcands.size();

        if(tcands.size()) {
          E2Costs& e2costs=cov2cand[newRange];
          // Intern the source phrase so all candidates share one Phrase object.
          Phrase const* srcPtr=uniqSrcPhr(newSrc);
          for(size_t i=0; i<tcands.size(); ++i) {

            std::vector<float> transcores(m_obj->GetNumScoreComponents());
            UTIL_THROW_IF2(transcores.size() != weightTrans.size(),
                           "Incorrect number of translation scores");

            // Probabilities -> log scores.
            std::transform(tcands[i].scores.begin()
                           ,tcands[i].scores.end()
                           ,transcores.begin()
                           ,TransformScore);

            // Weighted translation score ...
            float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f);

            // ... plus weighted input score ...
            score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f);

            // ... minus word penalty per target token.
            score-=tcands[i].tokens.size() * weightWP;

            std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));

            if(p.second) ++distinctE;

            // Keep only the best-scoring derivation per target string.
            TScores & scores=p.first->second;
            if(p.second || scores.total<score) {
              scores.total=score;
              scores.transScore=transcores;
              scores.inputScores=newInputScores;
              scores.src=srcPtr;
            }
          }
        }
      }
    }
  }

  if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
    TRACE_ERR("CN (explored): ");
    std::copy(exploredPaths.begin()+1,exploredPaths.end(),
              std::ostream_iterator<size_t>(std::cerr," "));
    TRACE_ERR("\n");
  }

  if(pathExplored.size()<exploredPaths.size())
    pathExplored.resize(exploredPaths.size(),0);
  for(size_t len=1; len<=srcSize; ++len)
    pathExplored[len]+=exploredPaths[len];

  // Build the per-range cache of pruned target phrase collections.
  m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize()));

  for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
    assert(i->first.first<m_rangeCache.size());
    assert(i->first.second>0);
    assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
    assert(m_rangeCache[i->first.first][i->first.second-1]==0);

    std::vector<TargetPhrase> tCands;
    tCands.reserve(i->second.size());

    // (negated future score, index into tCands) pairs used for pruning.
    std::vector<std::pair<float,size_t> > costs;
    costs.reserve(i->second.size());

    std::vector<Phrase> sourcePhrases;
    sourcePhrases.reserve(i->second.size());

    for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
      TScores const & scores=j->second;
      TargetPhrase targetPhrase(m_obj);
      CreateTargetPhrase(targetPhrase
                         , j ->first
                         , m_obj->options()->output.factor_delimiter
                         , scores.transScore
                         , scores.inputScores
                         , NULL
                         , scores.src);
      costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
      tCands.push_back(targetPhrase);

      sourcePhrases.push_back(*scores.src);

    }

    TargetPhraseCollectionWithSourcePhrase::shared_ptr
    rv = PruneTargetCandidates(tCands, costs, sourcePhrases);

    if(rv->IsEmpty())
      rv.reset();
    else {
      // Ranges are cached at [start][end-1].
      m_rangeCache[i->first.first][i->first.second-1]=rv;
      m_tgtColls.push_back(rv);
    }
  }

  // Lookup structures are no longer needed once the range cache is built.
  m_dict->FreeMemory();
}
00406
00407 void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
00408 StringTgtCand::Tokens const& factorStrings,
00409 std::string const& factorDelimiter,
00410 Scores const& transVector,
00411 Scores const& inputVector,
00412 const std::string *alignmentString,
00413 Phrase const* srcPtr) const
00414 {
00415 FactorCollection &factorCollection = FactorCollection::Instance();
00416
00417 for(size_t k=0; k<factorStrings.size(); ++k) {
00418 util::TokenIter<util::MultiCharacter, false>
00419 word(*factorStrings[k], factorDelimiter);
00420 Word& w=targetPhrase.AddWord();
00421 for(size_t l=0; l<m_output.size(); ++l, ++word) {
00422 w[m_output[l]]= factorCollection.AddFactor(*word);
00423 }
00424 }
00425
00426 if (alignmentString) {
00427 targetPhrase.SetAlignmentInfo(*alignmentString);
00428 }
00429
00430 if (m_numInputScores) {
00431 targetPhrase.GetScoreBreakdown().Assign(m_inputFeature, inputVector);
00432 }
00433
00434 targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
00435 targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
00436 }
00437
00438 TargetPhraseCollectionWithSourcePhrase::shared_ptr
00439 PDTAimp::PruneTargetCandidates
00440 (const std::vector<TargetPhrase> & tCands,
00441 std::vector<std::pair<float,size_t> >& costs,
00442 const std::vector<Phrase> &sourcePhrases) const
00443 {
00444
00445 UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(),
00446 "Number of target phrases must equal number of source phrases");
00447
00448 TargetPhraseCollectionWithSourcePhrase::shared_ptr rv;
00449 rv.reset(new TargetPhraseCollectionWithSourcePhrase);
00450
00451
00452
00453 std::vector<std::pair<float,size_t> >::iterator nth =
00454 costs.begin() + ((m_obj->m_tableLimit>0 &&
00455 m_obj->m_tableLimit < costs.size()) ?
00456 m_obj->m_tableLimit : costs.size());
00457
00458
00459 NTH_ELEMENT3(costs.begin(),nth ,costs.end());
00460
00461
00462 for(std::vector<std::pair<float,size_t> >::iterator
00463 it = costs.begin(); it != nth; ++it) {
00464 size_t ind = it->second;
00465 TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]);
00466 const Phrase &sourcePhrase = sourcePhrases[ind];
00467 rv->Add(targetPhrase, sourcePhrase);
00468
00469 }
00470
00471 return rv;
00472 }
00473
00474 }
00475
00476