00001 #include "BilingualDynSuffixArray.h"
00002 #include "moses/TranslationModel/DynSAInclude/utils.h"
00003 #include "moses/FactorCollection.h"
00004 #include "moses/StaticData.h"
00005 #include "moses/TargetPhrase.h"
00006
00007 #include "moses/TranslationModel/UG/generic/sorting/NBestList.h"
00008 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00009
00010 #include <boost/foreach.hpp>
00011 #include <iomanip>
00012
00013 using namespace std;
00014
00015 namespace Moses
00016 {
00017
00018 BilingualDynSuffixArray::
00019 BilingualDynSuffixArray():
00020 m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
00021 m_maxSampleSize(20), m_maxPTEntries(20)
00022 {
00023 m_srcSA = 0;
00024 m_trgSA = 0;
00025 m_srcCorpus = new vector<wordID_t>();
00026 m_trgCorpus = new vector<wordID_t>();
00027 m_srcVocab = new Vocab(false);
00028 m_trgVocab = new Vocab(false);
00029 m_scoreCmp = 0;
00030 }
00031
00032 BilingualDynSuffixArray::
00033 ~BilingualDynSuffixArray()
00034 {
00035 if(m_srcSA) delete m_srcSA;
00036 if(m_trgSA) delete m_trgSA;
00037 if(m_srcVocab) delete m_srcVocab;
00038 if(m_trgVocab) delete m_trgVocab;
00039 if(m_srcCorpus) delete m_srcCorpus;
00040 if(m_trgCorpus) delete m_trgCorpus;
00041 if(m_scoreCmp) delete m_scoreCmp;
00042 }
00043
00044 bool
00045 BilingualDynSuffixArray::
00046 Load(
00047 const vector<FactorType>& inputFactors,
00048 const vector<FactorType>& outputFactors,
00049 string source, string target, string alignments,
00050 const vector<float> &weight)
00051 {
00052 m_inputFactors = inputFactors;
00053 m_outputFactors = outputFactors;
00054
00055
00056 InputFileStream sourceStrme(source);
00057 InputFileStream targetStrme(target);
00058 cerr << "Loading source corpus...\n";
00059
00060 LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
00061 cerr << "Loading target corpus...\n";
00062 LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
00063
00064 UTIL_THROW_IF2(m_srcSntBreaks.size() != m_trgSntBreaks.size(),
00065 "Source and target arrays aren't the same size");
00066
00067
00068 cerr << "Building Source Suffix Array...\n";
00069 m_srcSA = new DynSuffixArray(m_srcCorpus);
00070 if(!m_srcSA) return false;
00071 cerr << "Building Target Suffix Array...\n";
00072 m_trgSA = new DynSuffixArray(m_trgCorpus);
00073 if(!m_trgSA) return false;
00074
00075 InputFileStream alignStrme(alignments);
00076 cerr << "Loading Alignment File...\n";
00077 LoadRawAlignments(alignStrme);
00078 cerr << m_srcSntBreaks.size() << " "
00079 << m_trgSntBreaks.size() << " "
00080 << m_rawAlignments.size() << endl;
00081
00082 cerr << "Building frequent word cache...\n";
00083 CacheFreqWords();
00084
00085 wordID_t const* s = &(*m_srcCorpus)[0];
00086 wordID_t const* t = &(*m_trgCorpus)[0];
00087 for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) {
00088 wordID_t const* se = s + GetSourceSentenceSize(sid);
00089 wordID_t const* te = t + GetTargetSentenceSize(sid);
00090 vector<short> const& a = m_rawAlignments[sid];
00091 m_wrd_cooc.Count(vector<wordID_t>(s,se),
00092 vector<wordID_t>(t,te), a,
00093 m_srcVocab->GetkOOVWordID(),
00094 m_trgVocab->GetkOOVWordID());
00095 s = se;
00096 t = te;
00097 }
00098 if (m_srcSntBreaks.size() != m_trgSntBreaks.size() ||
00099 m_rawAlignments.size() != m_trgSntBreaks.size()) {
00100 cerr << "FATAL ERROR: Line counts don't match!\n"
00101 << "Source side text corpus: " << m_srcSntBreaks.size() << "\n"
00102 << "Target side text corpus: " << m_trgSntBreaks.size() << "\n"
00103 << "Word alignments: " << m_rawAlignments.size() << endl;
00104 exit(1);
00105 }
00106 return true;
00107 }
00108
00109 int
00110 BilingualDynSuffixArray::
00111 LoadRawAlignments(InputFileStream& align)
00112 {
00113
00114 string line;
00115
00116
00117 while(getline(align, line)) {
00118
00119 LoadRawAlignments(line);
00120 }
00121 return m_rawAlignments.size();
00122 }
00123
00124
00125 int
00126 BilingualDynSuffixArray::
00127 LoadRawAlignments(string& align)
00128 {
00129
00130 vector<int> vtmp;
00131 Utils::splitToInt(align, vtmp, "- ");
00132 UTIL_THROW_IF2(vtmp.size() % 2 != 0,
00133 "Alignment format is incorrect: " << align);
00134 vector<short> vAlgn;
00135 for (vector<int>::const_iterator itr = vtmp.begin();
00136 itr != vtmp.end(); ++itr) {
00137 vAlgn.push_back(short(*itr));
00138 }
00139 m_rawAlignments.push_back(vAlgn);
00140 return m_rawAlignments.size();
00141 }
00142
// Builds a SentenceAlignment view of sentence 'sntIndex' from the stored raw
// alignment pairs. With trg2Src=true the sides swap roles: alignedList is
// indexed by target position and numberAligned by source position.
SentenceAlignment
BilingualDynSuffixArray::
GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
int t = GetTargetSentenceSize(sntIndex);
int s = GetSourceSentenceSize(sntIndex);
// The "given" side sizes alignedList, the "extract" side sizes numberAligned
// (see SentenceAlignment's constructor).
int sntGiven = trg2Src ? t : s;
int sntExtract = trg2Src ? s : t;
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract);
// Raw alignments are stored flat as [src0,trg0,src1,trg1,...].
vector<short> const& a = m_rawAlignments.at(sntIndex);
for(size_t i=0; i < a.size(); i+=2) {
int sourcePos = a[i];
int targetPos = a[i+1];
if(trg2Src) {
curSnt.alignedList[targetPos].push_back(sourcePos);
curSnt.numberAligned[sourcePos]++;
} else {
curSnt.alignedList[sourcePos].push_back(targetPos);
curSnt.numberAligned[targetPos]++;
}
}
// NOTE(review): this is pointer arithmetic on a vector<wordID_t>* — for
// sntIndex > 0 it points past the single corpus vector rather than at a
// per-sentence object. Looks suspicious, but the declared type/semantics of
// srcSnt/trgSnt are not visible here — confirm against the header.
curSnt.srcSnt = m_srcCorpus + sntIndex;
curSnt.trgSnt = m_trgCorpus + sntIndex;

return curSnt;
}
00170
00171 bool
00172 BilingualDynSuffixArray::
00173 ExtractPhrases(const int& sntIndex,
00174 const int& wordIndex,
00175 const int& sourceSize,
00176 vector<PhrasePair*>& phrasePairs,
00177 bool trg2Src) const
00178 {
00179
00180
00181 SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
00182
00183 int beginSentence = m_srcSntBreaks[sntIndex];
00184 int rightIdx = wordIndex - beginSentence;
00185 int leftIdx = rightIdx - sourceSize + 1;
00186 return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx);
00187 }
00188
// Intentionally a no-op: this model keeps no per-sentence state that needs
// releasing after decoding an input.
void
BilingualDynSuffixArray::
CleanUp(const InputType& source)
{
}
00195
00196 int
00197 BilingualDynSuffixArray::
00198 LoadCorpus(FactorDirection direction,
00199 InputFileStream & corpus,
00200 const FactorList & factors,
00201 vector<wordID_t> & cArray,
00202 vector<wordID_t> & sntArray,
00203 Vocab* vocab)
00204 {
00205 string line, word;
00206 int sntIdx(0);
00207
00208
00209 while(getline(corpus, line)) {
00210 sntArray.push_back(sntIdx);
00211 Phrase phrase(ARRAY_SIZE_INCR);
00212 phrase.CreateFromString( direction, factors, line, NULL);
00213
00214 for( size_t i = 0; i < phrase.GetSize(); ++i) {
00215 cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
00216 }
00217 sntIdx += phrase.GetSize();
00218 }
00219
00220 vocab->MakeClosed();
00221 return cArray.size();
00222 }
00223
00224 bool
00225 BilingualDynSuffixArray::
00226 GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
00227 {
00228
00229 size_t phraseSize = src.GetSize();
00230 for (size_t pos = 0; pos < phraseSize; ++pos) {
00231 const Word &word = src.GetWord(pos);
00232 wordID_t arrayId = m_srcVocab->GetWordID(word);
00233 if (arrayId == m_srcVocab->GetkOOVWordID()) {
00234
00235 return false;
00236 } else {
00237 output.SetId(pos, arrayId);
00238 }
00239 }
00240 return true;
00241 }
00242
// Computes the (forward, backward) lexical weights of a phrase pair from the
// word alignment of its sentence and the global co-occurrence model.
// Returns (p_lex(trg|src), p_lex(src|trg)) as a pair of floats.
pair<float, float>
BilingualDynSuffixArray::
GetLexicalWeight(const PhrasePair& pp) const
{
int src_size = pp.GetSourceSize();
int trg_size = pp.GetTargetSize();
// sp/tp accumulate per-position link probabilities; sc/tc count the links.
vector<float> sp(src_size, 0), tp(trg_size, 0);
vector<int> sc(src_size,0), tc(trg_size,0);
// Word ids of the full sentence containing the phrase pair.
wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex);
for (size_t i = 0; i < a.size(); i += 2) {
// Alignment entries come in (source, target) pairs.
int s = a[i], t = a.at(i+1), sx, tx;
// Skip links outside the phrase pair's span. Note the deliberate side
// effect: sx/tx become the phrase-local offsets inside these checks.
if (s < pp.m_startSource || t < pp.m_startTarget) continue;
if ((sx = s - pp.m_startSource) >= src_size) continue;
if ((tx = t - pp.m_startTarget) >= trg_size) continue;

sp[sx] += m_wrd_cooc.pfwd(sw[s],tw[t]);
tp[tx] += m_wrd_cooc.pbwd(sw[s],tw[t]);
++sc[sx];
++tc[tx];
#if 0
cout << m_srcVocab->GetWord(sw[s]) << " -> "
<< m_trgVocab->GetWord(tw[t]) << " "
<< m_wrd_cooc.pfwd(sw[s],tw[t]) << " "
<< m_wrd_cooc.pbwd(sw[s],tw[t]) << " "
<< sp[sx] << " (" << sc[sx] << ") "
<< tp[tx] << " (" << tc[tx] << ") "
<< endl;
#endif
}
// Multiply per-position averages; a word with no link inside the phrase
// falls back to the probability of pairing with the OOV/null token.
pair<float,float> ret(1,1);
wordID_t null_trg = m_trgVocab->GetkOOVWordID();
wordID_t null_src = m_srcVocab->GetkOOVWordID();
size_t soff = pp.m_startSource;
for (size_t i = 0; i < sp.size(); ++i) {
if (sc[i]) ret.first *= sp[i]/sc[i];
else ret.first *= m_wrd_cooc.pfwd(sw[soff+i], null_trg);
}
size_t toff = pp.m_startTarget;
for (size_t i = 0; i < tp.size(); ++i) {
if (tc[i]) ret.second *= tp[i]/tc[i];
else ret.second *= m_wrd_cooc.pbwd(null_src,tw[toff+i]);
}
return ret;
}
00293
00294 void
00295 BilingualDynSuffixArray::
00296 CacheFreqWords() const
00297 {
00298 multimap<int, wordID_t> wordCnts;
00299
00300 Vocab::Word2Id::const_iterator it;
00301 for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
00302
00303 wordID_t srcWord = it->second;
00304 vector<wordID_t> sword(1, srcWord), wrdIndices;
00305 m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
00306 if(wrdIndices.size() >= 1000) {
00307 wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
00308 }
00309 }
00310 int numSoFar(0);
00311 multimap<int, wordID_t>::reverse_iterator ritr;
00312 for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
00313 m_freqWordsCached.insert(ritr->second);
00314 CacheWordProbs(ritr->second);
00315 if(++numSoFar == 50) break;
00316 }
00317 cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
00318 }
00319
// Computes and caches, for every target word aligned to any corpus
// occurrence of 'srcWord', a forward probability p(trg|src) and a reverse
// estimate, storing both in m_wordPairCache.
void
BilingualDynSuffixArray::
CacheWordProbs(wordID_t srcWord) const
{
map<wordID_t, int> counts;
vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
UTIL_THROW_IF2(!ret, "Error");

vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0);
// Count target words aligned to each occurrence; an unaligned occurrence
// counts toward the OOV/null target word instead.
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
int sntIdx = sntIndexes.at(snt);
UTIL_THROW_IF2(sntIdx == -1, "Error");

// Position of this occurrence within its sentence.
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx);
const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx);
if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()];
++denom;
} else {
for(size_t i=0; i < srcAlg.size(); ++i) {
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
++counts[trgWord];
++denom;
}
}
}

for(map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
// Forward: fraction of srcWord's alignment links landing on this target.
float srcTrgPrb = float(itrCnt->second) / float(denom);
// NOTE(review): dividing by counts.size() — the number of DISTINCT target
// words — is an unusual denominator for a reverse probability. Confirm
// this is intended rather than a target-side occurrence count.
float trgSrcPrb = float(itrCnt->second) / float(counts.size());
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
}
}
00359
00360 SAPhrase
00361 BilingualDynSuffixArray::
00362 TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
00363 {
00364
00365 SAPhrase phraseIds(phrasepair.GetTargetSize());
00366 int sntIndex = phrasepair.m_sntIndex;
00367 int id(-1), pos(0);
00368 for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) {
00369 id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
00370 phraseIds.SetId(pos++, id);
00371 }
00372 return phraseIds;
00373 }
00374
00375 TargetPhrase*
00376 BilingualDynSuffixArray::
00377 GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const
00378 {
00379 TargetPhrase* targetPhrase = new TargetPhrase(pt);
00380 for(size_t i=0; i < phrase.words.size(); ++i) {
00381 Word& word = m_trgVocab->GetWord( phrase.words[i]);
00382 UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(),
00383 "Unknown word at position " << i);
00384 targetPhrase->AddWord(word);
00385 }
00386
00387 return targetPhrase;
00388 }
00389
00391
00392
00393
// Collects translation candidates for source phrase 'src': samples its
// corpus occurrences, extracts phrase pairs from each containing sentence,
// and fills 'pstats' with a 5-feature score vector per target phrase.
// Returns (sample rate, total number of extracted phrase pairs).
pair<float,float>
BilingualDynSuffixArray::
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const
{
typedef map<SAPhrase, vector<float> >::iterator pstat_iter;
typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
pair<float,float> ret(0,0);
float& sampleRate = ret.first;
float& totalPhrases = ret.second;
size_t srcSize = src.GetSize();
SAPhrase localIDs(srcSize);
vector<unsigned> wrdIndices;
// Bail out if any word is OOV or the phrase never occurs in the corpus.
if(!GetLocalVocabIDs(src, localIDs) ||
!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
return ret;

// Down-sample the occurrence list and record what fraction we kept.
size_t m1 = wrdIndices.size();
SampleSelection(wrdIndices);
sampleRate = float(wrdIndices.size())/m1;

// Extract phrase pairs from each sampled occurrence's sentence; -1 marks
// occurrences whose phrase straddles a sentence boundary.
vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks);
for(size_t s = 0; s < sntIndices.size(); ++s) {
int sntStart = sntIndices.at(s);
if(sntStart == -1) continue;
vector<PhrasePair*> phrasePairs;
ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs);
totalPhrases += phrasePairs.size();
vector<PhrasePair*>::iterator p;
for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) {
assert(*p);
pair<float, float> lex = GetLexicalWeight(**p);
// feats: [0]=joint count (later smoothed fwd prob), [1]=fwd lexical,
// [2]=log relative frequency, [3]=bwd lexical, [4]=phrase penalty.
pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5));
pair<pstat_iter, bool> foo = pstats.insert(entry);
Scores& feats = foo.first->second;
if (foo.second) {
feats[0] = 1;
feats[1] = lex.first;
feats[3] = lex.second;
} else {
feats[0] += 1;
feats[1] = max(feats[1],lex.first);
feats[3] = max(feats[3],lex.second);
}
delete *p;
}
}
// Finalize feature values in log space.
BOOST_FOREACH(pstat_entry & e, pstats) {
Scores& feats = e.second;
// NOTE(review): '*' binds tighter than '-', so this computes
// GetCount - (feats[0] * sampleRate). If the intent was to scale the
// remaining (unsampled) count, parentheses may be missing — confirm
// against the intended smoothing formula before changing.
float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate;
feats[4] = 1;
feats[3] = log(feats[3]);
feats[2] = log(feats[0]) - log(totalPhrases);
feats[1] = log(feats[1]);
feats[0] = log(feats[0]) - log(feats[0] + x);
}
return ret;
}
00458
00459 vector<int>
00460 BilingualDynSuffixArray::
00461 GetSntIndexes(vector<unsigned>& wrdIndices,
00462 const int sourceSize,
00463 const vector<unsigned>& sntBreaks) const
00464 {
00465 vector<unsigned>::const_iterator vit;
00466 vector<int> sntIndices;
00467 for(size_t i=0; i < wrdIndices.size(); ++i) {
00468 vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
00469 int index = int(vit - sntBreaks.begin()) - 1;
00470
00471 if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
00472 sntIndices.push_back(-1);
00473 else
00474 sntIndices.push_back(index);
00475 }
00476 return sntIndices;
00477 }
00478
00479 int
00480 BilingualDynSuffixArray::
00481 SampleSelection(vector<unsigned>& sample, int sampleSize) const
00482 {
00483
00484 vector<unsigned> s;
00485 randomSample<unsigned>(s,sampleSize,sample.size());
00486 for (size_t i = 0; i < s.size(); ++i)
00487 s[i] = sample[s[i]];
00488 sample.swap(s);
00489 return sample.size();
00490 }
00491
// Appends one parallel sentence pair (plus its word alignment) to the
// corpora, both suffix arrays, the vocabularies and the co-occurrence
// counts — the incremental-update path of the dynamic suffix array.
void
BilingualDynSuffixArray::
addSntPair(string& source, string& target, string& alignment)
{
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", "
<< target << ", " << alignment << endl;
// The new sentences begin where the current corpora end.
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
sphrase.CreateFromString(Input, m_inputFactors, source, NULL);
// Reopen the vocabulary temporarily so unseen words get fresh ids.
m_srcVocab->MakeOpen();
vector<wordID_t> sIDs(sphrase.GetSize());
// NOTE(review): ids are assigned back-to-front here, then appended
// front-to-back below — presumably deliberate (affects id assignment
// order for new words), but worth confirming.
for(int i = sphrase.GetSize()-1; i >= 0; --i) {
sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i));
}
for(size_t i = 0; i < sphrase.GetSize(); ++i) {
srcFactor.push_back(sIDs[i]);
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back());
}
m_srcSntBreaks.push_back(oldSrcCrpSize);
m_srcVocab->MakeClosed();
// Same procedure for the target side.
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(Output, m_outputFactors, target, NULL);
m_trgVocab->MakeOpen();
vector<wordID_t> tIDs(tphrase.GetSize());
for(int i = tphrase.GetSize()-1; i >= 0; --i) {
tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i));
}
for(size_t i = 0; i < tphrase.GetSize(); ++i) {
trgFactor.push_back(tIDs[i]);
cerr << "trgFactor[" << (trgFactor.size() - 1) << "] = " << trgFactor.back() << endl;
m_trgCorpus->push_back(trgFactor.back());
}
cerr << "gets to 1\n";
m_trgSntBreaks.push_back(oldTrgCrpSize);
cerr << "gets to 2\n";
// Incrementally extend both suffix arrays with the new words.
m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
cerr << "gets to 3\n";
m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
// Parse and store the new alignment, then update co-occurrence counts.
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();

m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(),
m_srcVocab->GetkOOVWordID(),
m_trgVocab->GetkOOVWordID());
}
00545
00546 void
00547 BilingualDynSuffixArray::
00548 ClearWordInCache(wordID_t srcWord)
00549 {
00550 if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
00551 return;
00552 map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it,
00553 first, last;
00554 for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
00555 if(it->first.first == srcWord) {
00556 first = it;
00557 last = it++;
00558 while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
00559 last = it++;
00560 }
00561 }
00562 m_wordPairCache.erase(first, last);
00563 }
00564 }
00565
// Sizes the alignment containers for one sentence: alignedList holds one
// (initially empty) list of linked positions per word on the "source" side,
// numberAligned one zeroed link counter per word on the "target" side.
// (Callers may swap which corpus side plays which role — see
// GetSentenceAlignment's trg2Src handling.)
SentenceAlignment::
SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
: m_sntIndex(sntIndex)
, numberAligned(targetSize, 0)
, alignedList(sourceSize)
{
}
00578
// Extracts all consistent phrase pairs whose source span is exactly
// [startSource, endSource], appending newly-allocated PhrasePairs (owned by
// the caller) to 'ret'. This follows the standard phrase-extraction
// algorithm: find the minimal target span covered by the source words,
// check alignment consistency, then enumerate extensions over unaligned
// boundary words. Returns true if 'ret' is non-empty afterwards (note:
// including pairs appended by earlier calls).
bool
SentenceAlignment::
Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const
{
int countTarget = numberAligned.size();

// Find the tightest target span [minTarget, maxTarget] linked to the
// source span. usedTarget starts as each target word's total link count
// and is decremented per link consumed by this span, so a positive value
// left inside the span means a link to a source word OUTSIDE the span.
int minTarget = 9999;
int maxTarget = -1;
vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
if (targetPos<minTarget) {
minTarget = targetPos;
}
if (targetPos>maxTarget) {
maxTarget = targetPos;
}
usedTarget[ targetPos ]--;
}
}

// Proceed only if at least one link exists and the core span fits the
// length limit.
if (maxTarget >= 0 &&
maxTarget-minTarget < maxPhraseLength) {
// Consistency check: reject spans containing a target word that is also
// aligned outside [startSource, endSource].
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
if (usedTarget[targetPos]>0) {
out_of_bounds = true;
}
}

if (!out_of_bounds) {
// Grow the target span over UNALIGNED words on both sides, emitting
// one phrase pair per valid (startTarget, endTarget) extension that
// stays within bounds and within maxPhraseLength.
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength &&
(startTarget==minTarget || numberAligned[startTarget]==0));
startTarget--) {

for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength &&
(endTarget==maxTarget || numberAligned[endTarget]==0));
endTarget++) {
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
}
}
}
}
return (ret.size() > 0);
}
00642
00643 int
00644 BilingualDynSuffixArray::
00645 GetSourceSentenceSize(size_t sentenceId) const
00646 {
00647 return (sentenceId==m_srcSntBreaks.size()-1) ?
00648 m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
00649 m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
00650 }
00651
00652 int
00653 BilingualDynSuffixArray::
00654 GetTargetSentenceSize(size_t sentenceId) const
00655 {
00656 return (sentenceId==m_trgSntBreaks.size()-1) ?
00657 m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
00658 m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
00659 }
00660
// Binds the score comparator used to rank candidate phrases.
BetterPhrase::
BetterPhrase(ScoresComp const& sc)
: cmp(sc) {}
00664
00665
00666
00667
00668
00669
00670
00671
00672
// Comparison for (Scores, phrase) candidates: 'a' precedes 'b' when b's
// scores compare below a's under cmp — note the swapped arguments, which
// yields a descending (best-first) ordering.
bool
BetterPhrase::
operator()(pair<Scores, SAPhrase const*> const& a,
pair<Scores, SAPhrase const*> const& b) const
{
return cmp(b.first,a.first);
}
00680
00681 }