00001 #include "BilingualDynSuffixArray.h"
00002 #include "moses/TranslationModel/DynSAInclude/utils.h"
00003 #include "moses/FactorCollection.h"
00004 #include "moses/StaticData.h"
00005 #include "moses/TargetPhrase.h"
00006
00007 #include "moses/TranslationModel/UG/generic/sorting/NBestList.h"
00008 #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
00009
00010 #include <boost/foreach.hpp>
00011 #include <iomanip>
00012
00013 using namespace std;
00014
00015 namespace Moses
00016 {
00017
00018 BilingualDynSuffixArray::
00019 BilingualDynSuffixArray():
00020 m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
00021 m_maxSampleSize(20), m_maxPTEntries(20)
00022 {
00023 m_srcSA = 0;
00024 m_trgSA = 0;
00025 m_srcCorpus = new vector<wordID_t>();
00026 m_trgCorpus = new vector<wordID_t>();
00027 m_srcVocab = new Vocab(false);
00028 m_trgVocab = new Vocab(false);
00029 m_scoreCmp = 0;
00030 }
00031
00032 BilingualDynSuffixArray::
00033 ~BilingualDynSuffixArray()
00034 {
00035 if(m_srcSA) delete m_srcSA;
00036 if(m_trgSA) delete m_trgSA;
00037 if(m_srcVocab) delete m_srcVocab;
00038 if(m_trgVocab) delete m_trgVocab;
00039 if(m_srcCorpus) delete m_srcCorpus;
00040 if(m_trgCorpus) delete m_trgCorpus;
00041 if(m_scoreCmp) delete m_scoreCmp;
00042 }
00043
00044 bool
00045 BilingualDynSuffixArray::
00046 Load(
00047 const vector<FactorType>& inputFactors,
00048 const vector<FactorType>& outputFactors,
00049 string source, string target, string alignments,
00050 const vector<float> &weight)
00051 {
00052 m_inputFactors = inputFactors;
00053 m_outputFactors = outputFactors;
00054
00055
00056 InputFileStream sourceStrme(source);
00057 InputFileStream targetStrme(target);
00058 cerr << "Loading source corpus...\n";
00059
00060 LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
00061 cerr << "Loading target corpus...\n";
00062 LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
00063
00064 UTIL_THROW_IF2(m_srcSntBreaks.size() != m_trgSntBreaks.size(),
00065 "Source and target arrays aren't the same size");
00066
00067
00068 cerr << "Building Source Suffix Array...\n";
00069 m_srcSA = new DynSuffixArray(m_srcCorpus);
00070 if(!m_srcSA) return false;
00071 cerr << "Building Target Suffix Array...\n";
00072 m_trgSA = new DynSuffixArray(m_trgCorpus);
00073 if(!m_trgSA) return false;
00074
00075 InputFileStream alignStrme(alignments);
00076 cerr << "Loading Alignment File...\n";
00077 LoadRawAlignments(alignStrme);
00078 cerr << m_srcSntBreaks.size() << " "
00079 << m_trgSntBreaks.size() << " "
00080 << m_rawAlignments.size() << endl;
00081
00082 cerr << "Building frequent word cache...\n";
00083 CacheFreqWords();
00084
00085 wordID_t const* s = &(*m_srcCorpus)[0];
00086 wordID_t const* t = &(*m_trgCorpus)[0];
00087 for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) {
00088 wordID_t const* se = s + GetSourceSentenceSize(sid);
00089 wordID_t const* te = t + GetTargetSentenceSize(sid);
00090 vector<short> const& a = m_rawAlignments[sid];
00091 m_wrd_cooc.Count(vector<wordID_t>(s,se),
00092 vector<wordID_t>(t,te), a,
00093 m_srcVocab->GetkOOVWordID(),
00094 m_trgVocab->GetkOOVWordID());
00095 s = se;
00096 t = te;
00097 }
00098 if (m_srcSntBreaks.size() != m_trgSntBreaks.size() ||
00099 m_rawAlignments.size() != m_trgSntBreaks.size()) {
00100 cerr << "FATAL ERROR: Line counts don't match!\n"
00101 << "Source side text corpus: " << m_srcSntBreaks.size() << "\n"
00102 << "Target side text corpus: " << m_trgSntBreaks.size() << "\n"
00103 << "Word alignments: " << m_rawAlignments.size() << endl;
00104 exit(1);
00105 }
00106 return true;
00107 }
00108
00109 int
00110 BilingualDynSuffixArray::
00111 LoadRawAlignments(InputFileStream& align)
00112 {
00113
00114 string line;
00115
00116
00117 while(getline(align, line)) {
00118
00119 LoadRawAlignments(line);
00120 }
00121 return m_rawAlignments.size();
00122 }
00123
00124
00125 int
00126 BilingualDynSuffixArray::
00127 LoadRawAlignments(string& align)
00128 {
00129
00130 vector<int> vtmp;
00131 Utils::splitToInt(align, vtmp, "- ");
00132 UTIL_THROW_IF2(vtmp.size() % 2 != 0,
00133 "Alignment format is incorrect: " << align);
00134 vector<short> vAlgn;
00135 for (vector<int>::const_iterator itr = vtmp.begin();
00136 itr != vtmp.end(); ++itr) {
00137 vAlgn.push_back(short(*itr));
00138 }
00139 m_rawAlignments.push_back(vAlgn);
00140 return m_rawAlignments.size();
00141 }
00142
// Builds a SentenceAlignment view of sentence 'sntIndex' from the stored raw
// alignment pairs. With trg2Src=true the sides swap roles: alignedList is
// indexed by target position and numberAligned by source position.
SentenceAlignment
BilingualDynSuffixArray::
GetSentenceAlignment(const int sntIndex, bool trg2Src) const
{
int t = GetTargetSentenceSize(sntIndex);
int s = GetSourceSentenceSize(sntIndex);
// The "given" side sizes alignedList, the "extract" side sizes numberAligned
// (see SentenceAlignment's constructor).
int sntGiven = trg2Src ? t : s;
int sntExtract = trg2Src ? s : t;
SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract);
// Raw alignments are stored flat as [src0,trg0,src1,trg1,...].
vector<short> const& a = m_rawAlignments.at(sntIndex);
for(size_t i=0; i < a.size(); i+=2) {
int sourcePos = a[i];
int targetPos = a[i+1];
if(trg2Src) {
curSnt.alignedList[targetPos].push_back(sourcePos);
curSnt.numberAligned[sourcePos]++;
} else {
curSnt.alignedList[sourcePos].push_back(targetPos);
curSnt.numberAligned[targetPos]++;
}
}
// NOTE(review): this is pointer arithmetic on a vector<wordID_t>* — for
// sntIndex > 0 it points past the single corpus vector rather than at a
// per-sentence object. Looks suspicious, but the declared type/semantics of
// srcSnt/trgSnt are not visible here — confirm against the header.
curSnt.srcSnt = m_srcCorpus + sntIndex;
curSnt.trgSnt = m_trgCorpus + sntIndex;

return curSnt;
}
00170
00171 bool
00172 BilingualDynSuffixArray::
00173 ExtractPhrases(const int& sntIndex,
00174 const int& wordIndex,
00175 const int& sourceSize,
00176 vector<PhrasePair*>& phrasePairs,
00177 bool trg2Src) const
00178 {
00179
00180
00181 SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
00182
00183 int beginSentence = m_srcSntBreaks[sntIndex];
00184 int rightIdx = wordIndex - beginSentence;
00185 int leftIdx = rightIdx - sourceSize + 1;
00186 return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx);
00187 }
00188
// Intentionally a no-op: this model keeps no per-sentence state that needs
// releasing after decoding an input.
void
BilingualDynSuffixArray::
CleanUp(const InputType& source)
{
}
00195
00196 int
00197 BilingualDynSuffixArray::
00198 LoadCorpus(FactorDirection direction,
00199 InputFileStream & corpus,
00200 const FactorList & factors,
00201 vector<wordID_t> & cArray,
00202 vector<wordID_t> & sntArray,
00203 Vocab* vocab)
00204 {
00205 string line, word;
00206 int sntIdx(0);
00207
00208
00209 while(getline(corpus, line)) {
00210 sntArray.push_back(sntIdx);
00211 Phrase phrase(ARRAY_SIZE_INCR);
00212 phrase.CreateFromString( direction, factors, line, NULL);
00213
00214 for( size_t i = 0; i < phrase.GetSize(); ++i) {
00215 cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
00216 }
00217 sntIdx += phrase.GetSize();
00218 }
00219
00220 vocab->MakeClosed();
00221 return cArray.size();
00222 }
00223
00224 bool
00225 BilingualDynSuffixArray::
00226 GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
00227 {
00228
00229 size_t phraseSize = src.GetSize();
00230 for (size_t pos = 0; pos < phraseSize; ++pos) {
00231 const Word &word = src.GetWord(pos);
00232 wordID_t arrayId = m_srcVocab->GetWordID(word);
00233 if (arrayId == m_srcVocab->GetkOOVWordID()) {
00234
00235 return false;
00236 } else {
00237 output.SetId(pos, arrayId);
00238 }
00239 }
00240 return true;
00241 }
00242
// Computes the (forward, backward) lexical weights of a phrase pair from the
// word alignment of its sentence and the global co-occurrence model.
// Returns (p_lex(trg|src), p_lex(src|trg)) as a pair of floats.
pair<float, float>
BilingualDynSuffixArray::
GetLexicalWeight(const PhrasePair& pp) const
{
int src_size = pp.GetSourceSize();
int trg_size = pp.GetTargetSize();
// sp/tp accumulate per-position link probabilities; sc/tc count the links.
vector<float> sp(src_size, 0), tp(trg_size, 0);
vector<int> sc(src_size,0), tc(trg_size,0);
// Word ids of the full sentence containing the phrase pair.
wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex);
for (size_t i = 0; i < a.size(); i += 2) {
// Alignment entries come in (source, target) pairs.
int s = a[i], t = a.at(i+1), sx, tx;
// Skip links outside the phrase pair's span. Note the deliberate side
// effect: sx/tx become the phrase-local offsets inside these checks.
if (s < pp.m_startSource || t < pp.m_startTarget) continue;
if ((sx = s - pp.m_startSource) >= src_size) continue;
if ((tx = t - pp.m_startTarget) >= trg_size) continue;

sp[sx] += m_wrd_cooc.pfwd(sw[s],tw[t]);
tp[tx] += m_wrd_cooc.pbwd(sw[s],tw[t]);
++sc[sx];
++tc[tx];
#if 0
cout << m_srcVocab->GetWord(sw[s]) << " -> "
<< m_trgVocab->GetWord(tw[t]) << " "
<< m_wrd_cooc.pfwd(sw[s],tw[t]) << " "
<< m_wrd_cooc.pbwd(sw[s],tw[t]) << " "
<< sp[sx] << " (" << sc[sx] << ") "
<< tp[tx] << " (" << tc[tx] << ") "
<< endl;
#endif
}
// Multiply per-position averages; a word with no link inside the phrase
// falls back to the probability of pairing with the OOV/null token.
pair<float,float> ret(1,1);
wordID_t null_trg = m_trgVocab->GetkOOVWordID();
wordID_t null_src = m_srcVocab->GetkOOVWordID();
size_t soff = pp.m_startSource;
for (size_t i = 0; i < sp.size(); ++i) {
if (sc[i]) ret.first *= sp[i]/sc[i];
else ret.first *= m_wrd_cooc.pfwd(sw[soff+i], null_trg);
}
size_t toff = pp.m_startTarget;
for (size_t i = 0; i < tp.size(); ++i) {
if (tc[i]) ret.second *= tp[i]/tc[i];
else ret.second *= m_wrd_cooc.pbwd(null_src,tw[toff+i]);
}
return ret;
}
00293
00294 void
00295 BilingualDynSuffixArray::
00296 CacheFreqWords() const
00297 {
00298 multimap<int, wordID_t> wordCnts;
00299
00300 Vocab::Word2Id::const_iterator it;
00301 for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
00302
00303 wordID_t srcWord = it->second;
00304 vector<wordID_t> sword(1, srcWord), wrdIndices;
00305 m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
00306 if(wrdIndices.size() >= 1000) {
00307 wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
00308 }
00309 }
00310 int numSoFar(0);
00311 multimap<int, wordID_t>::reverse_iterator ritr;
00312 for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
00313 m_freqWordsCached.insert(ritr->second);
00314 CacheWordProbs(ritr->second);
00315 if(++numSoFar == 50) break;
00316 }
00317 cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
00318 }
00319
// Computes and caches, for every target word aligned to any corpus
// occurrence of 'srcWord', a forward probability p(trg|src) and a reverse
// estimate, storing both in m_wordPairCache.
void
BilingualDynSuffixArray::
CacheWordProbs(wordID_t srcWord) const
{
map<wordID_t, int> counts;
vector<wordID_t> sword(1, srcWord), wrdIndices;
bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
UTIL_THROW_IF2(!ret, "Error");

vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
float denom(0);
// Count target words aligned to each occurrence; an unaligned occurrence
// counts toward the OOV/null target word instead.
for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
int sntIdx = sntIndexes.at(snt);
UTIL_THROW_IF2(sntIdx == -1, "Error");

// Position of this occurrence within its sentence.
int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx);
const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx);
if(srcAlg.size() == 0) {
++counts[m_srcVocab->GetkOOVWordID()];
++denom;
} else {
for(size_t i=0; i < srcAlg.size(); ++i) {
wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
++counts[trgWord];
++denom;
}
}
}

for(map<wordID_t, int>::const_iterator itrCnt = counts.begin();
itrCnt != counts.end(); ++itrCnt) {
pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
// Forward: fraction of srcWord's alignment links landing on this target.
float srcTrgPrb = float(itrCnt->second) / float(denom);
// NOTE(review): dividing by counts.size() — the number of DISTINCT target
// words — is an unusual denominator for a reverse probability. Confirm
// this is intended rather than a target-side occurrence count.
float trgSrcPrb = float(itrCnt->second) / float(counts.size());
m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
}
}
00359
00360 SAPhrase
00361 BilingualDynSuffixArray::
00362 TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
00363 {
00364
00365 SAPhrase phraseIds(phrasepair.GetTargetSize());
00366 int sntIndex = phrasepair.m_sntIndex;
00367 int id(-1), pos(0);
00368 for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) {
00369 id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
00370 phraseIds.SetId(pos++, id);
00371 }
00372 return phraseIds;
00373 }
00374
00375 TargetPhrase*
00376 BilingualDynSuffixArray::
00377 GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase, const PhraseDictionary *pt) const
00378 {
00379 TargetPhrase* targetPhrase = new TargetPhrase(pt);
00380 for(size_t i=0; i < phrase.words.size(); ++i) {
00381 Word& word = m_trgVocab->GetWord( phrase.words[i]);
00382 UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(),
00383 "Unknown word at position " << i);
00384 targetPhrase->AddWord(word);
00385 }
00386
00387 return targetPhrase;
00388 }
00389
00391
00392
00393
// Collects translation candidates for source phrase 'src': samples its
// corpus occurrences, extracts phrase pairs from each containing sentence,
// and fills 'pstats' with a 5-feature score vector per target phrase.
// Returns (sample rate, total number of extracted phrase pairs).
pair<float,float>
BilingualDynSuffixArray::
GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const
{
typedef map<SAPhrase, vector<float> >::iterator pstat_iter;
typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
pair<float,float> ret(0,0);
float& sampleRate = ret.first;
float& totalPhrases = ret.second;
size_t srcSize = src.GetSize();
SAPhrase localIDs(srcSize);
vector<unsigned> wrdIndices;
// Bail out if any word is OOV or the phrase never occurs in the corpus.
if(!GetLocalVocabIDs(src, localIDs) ||
!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices))
return ret;

// Down-sample the occurrence list and record what fraction we kept.
size_t m1 = wrdIndices.size();
SampleSelection(wrdIndices);
sampleRate = float(wrdIndices.size())/m1;

// Extract phrase pairs from each sampled occurrence's sentence; -1 marks
// occurrences whose phrase straddles a sentence boundary.
vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks);
for(size_t s = 0; s < sntIndices.size(); ++s) {
int sntStart = sntIndices.at(s);
if(sntStart == -1) continue;
vector<PhrasePair*> phrasePairs;
ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs);
totalPhrases += phrasePairs.size();
vector<PhrasePair*>::iterator p;
for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) {
assert(*p);
pair<float, float> lex = GetLexicalWeight(**p);
// feats: [0]=joint count (later smoothed fwd prob), [1]=fwd lexical,
// [2]=log relative frequency, [3]=bwd lexical, [4]=phrase penalty.
pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5));
pair<pstat_iter, bool> foo = pstats.insert(entry);
Scores& feats = foo.first->second;
if (foo.second) {
feats[0] = 1;
feats[1] = lex.first;
feats[3] = lex.second;
} else {
feats[0] += 1;
feats[1] = max(feats[1],lex.first);
feats[3] = max(feats[3],lex.second);
}
delete *p;
}
}
// Finalize feature values in log space.
BOOST_FOREACH(pstat_entry & e, pstats) {
Scores& feats = e.second;
// NOTE(review): '*' binds tighter than '-', so this computes
// GetCount - (feats[0] * sampleRate). If the intent was to scale the
// remaining (unsampled) count, parentheses may be missing — confirm
// against the intended smoothing formula before changing.
float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate;
feats[4] = 1;
feats[3] = log(feats[3]);
feats[2] = log(feats[0]) - log(totalPhrases);
feats[1] = log(feats[1]);
feats[0] = log(feats[0]) - log(feats[0] + x);
}
return ret;
}
00458
00459 vector<int>
00460 BilingualDynSuffixArray::
00461 GetSntIndexes(vector<unsigned>& wrdIndices,
00462 const int sourceSize,
00463 const vector<unsigned>& sntBreaks) const
00464 {
00465 vector<unsigned>::const_iterator vit;
00466 vector<int> sntIndices;
00467 for(size_t i=0; i < wrdIndices.size(); ++i) {
00468 vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
00469 int index = int(vit - sntBreaks.begin()) - 1;
00470
00471 if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
00472 sntIndices.push_back(-1);
00473 else
00474 sntIndices.push_back(index);
00475 }
00476 return sntIndices;
00477 }
00478
00479 int
00480 BilingualDynSuffixArray::
00481 SampleSelection(vector<unsigned>& sample, int sampleSize) const
00482 {
00483
00484 vector<unsigned> s;
00485 randomSample<unsigned>(s,sampleSize,sample.size());
00486 for (size_t i = 0; i < s.size(); ++i)
00487 s[i] = sample[s[i]];
00488 sample.swap(s);
00489 return sample.size();
00490 }
00491
// Appends one parallel sentence pair (plus its word alignment) to the
// corpora, both suffix arrays, the vocabularies and the co-occurrence
// counts — the incremental-update path of the dynamic suffix array.
void
BilingualDynSuffixArray::
addSntPair(string& source, string& target, string& alignment)
{
vuint_t srcFactor, trgFactor;
cerr << "source, target, alignment = " << source << ", "
<< target << ", " << alignment << endl;
// The new sentences begin where the current corpora end.
const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
Phrase sphrase(ARRAY_SIZE_INCR);
sphrase.CreateFromString(Input, m_inputFactors, source, NULL);
// Reopen the vocabulary temporarily so unseen words get fresh ids.
m_srcVocab->MakeOpen();
vector<wordID_t> sIDs(sphrase.GetSize());
// NOTE(review): ids are assigned back-to-front here, then appended
// front-to-back below — presumably deliberate (affects id assignment
// order for new words), but worth confirming.
for(int i = sphrase.GetSize()-1; i >= 0; --i) {
sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i));
}
for(size_t i = 0; i < sphrase.GetSize(); ++i) {
srcFactor.push_back(sIDs[i]);
cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
m_srcCorpus->push_back(srcFactor.back());
}
m_srcSntBreaks.push_back(oldSrcCrpSize);
m_srcVocab->MakeClosed();
// Same procedure for the target side.
Phrase tphrase(ARRAY_SIZE_INCR);
tphrase.CreateFromString(Output, m_outputFactors, target, NULL);
m_trgVocab->MakeOpen();
vector<wordID_t> tIDs(tphrase.GetSize());
for(int i = tphrase.GetSize()-1; i >= 0; --i) {
tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i));
}
for(size_t i = 0; i < tphrase.GetSize(); ++i) {
trgFactor.push_back(tIDs[i]);
cerr << "trgFactor[" << (trgFactor.size() - 1) << "] = " << trgFactor.back() << endl;
m_trgCorpus->push_back(trgFactor.back());
}
cerr << "gets to 1\n";
m_trgSntBreaks.push_back(oldTrgCrpSize);
cerr << "gets to 2\n";
// Incrementally extend both suffix arrays with the new words.
m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
cerr << "gets to 3\n";
m_trgSA->Insert(&trgFactor, oldTrgCrpSize);
// Parse and store the new alignment, then update co-occurrence counts.
LoadRawAlignments(alignment);
m_trgVocab->MakeClosed();

m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(),
m_srcVocab->GetkOOVWordID(),
m_trgVocab->GetkOOVWordID());
}
00545
00546 void
00547 BilingualDynSuffixArray::
00548 ClearWordInCache(wordID_t srcWord)
00549 {
00550 if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
00551 return;
00552 map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it,
00553 first, last;
00554 for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
00555 if(it->first.first == srcWord) {
00556 first = it;
00557 last = it++;
00558 while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
00559 last = it++;
00560 }
00561 }
00562 m_wordPairCache.erase(first, last);
00563 }
00564 }
00565
// Sizes the alignment containers for one sentence: alignedList holds one
// (initially empty) list of linked positions per word on the "source" side,
// numberAligned one zeroed link counter per word on the "target" side.
// (Callers may swap which corpus side plays which role — see
// GetSentenceAlignment's trg2Src handling.)
SentenceAlignment::
SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
: m_sntIndex(sntIndex)
, numberAligned(targetSize, 0)
, alignedList(sourceSize)
{
}
00578
// Extracts all consistent phrase pairs whose source span is exactly
// [startSource, endSource], appending newly-allocated PhrasePairs (owned by
// the caller) to 'ret'. This follows the standard phrase-extraction
// algorithm: find the minimal target span covered by the source words,
// check alignment consistency, then enumerate extensions over unaligned
// boundary words. Returns true if 'ret' is non-empty afterwards (note:
// including pairs appended by earlier calls).
bool
SentenceAlignment::
Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const
{
int countTarget = numberAligned.size();

// Find the tightest target span [minTarget, maxTarget] linked to the
// source span. usedTarget starts as each target word's total link count
// and is decremented per link consumed by this span, so a positive value
// left inside the span means a link to a source word OUTSIDE the span.
int minTarget = 9999;
int maxTarget = -1;
vector< int > usedTarget = numberAligned;
for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) {
for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) {
int targetPos = alignedList[sourcePos][ind];
if (targetPos<minTarget) {
minTarget = targetPos;
}
if (targetPos>maxTarget) {
maxTarget = targetPos;
}
usedTarget[ targetPos ]--;
}
}

// Proceed only if at least one link exists and the core span fits the
// length limit.
if (maxTarget >= 0 &&
maxTarget-minTarget < maxPhraseLength) {
// Consistency check: reject spans containing a target word that is also
// aligned outside [startSource, endSource].
bool out_of_bounds = false;
for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++) {
if (usedTarget[targetPos]>0) {
out_of_bounds = true;
}
}

if (!out_of_bounds) {
// Grow the target span over UNALIGNED words on both sides, emitting
// one phrase pair per valid (startTarget, endTarget) extension that
// stays within bounds and within maxPhraseLength.
for(int startTarget = minTarget;
(startTarget >= 0 &&
startTarget > maxTarget-maxPhraseLength &&
(startTarget==minTarget || numberAligned[startTarget]==0));
startTarget--) {

for (int endTarget=maxTarget;
(endTarget<countTarget &&
endTarget<startTarget+maxPhraseLength &&
(endTarget==maxTarget || numberAligned[endTarget]==0));
endTarget++) {
PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
ret.push_back(phrasePair);
}
}
}
}
return (ret.size() > 0);
}
00642
00643 int
00644 BilingualDynSuffixArray::
00645 GetSourceSentenceSize(size_t sentenceId) const
00646 {
00647 return (sentenceId==m_srcSntBreaks.size()-1) ?
00648 m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) :
00649 m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId);
00650 }
00651
00652 int
00653 BilingualDynSuffixArray::
00654 GetTargetSentenceSize(size_t sentenceId) const
00655 {
00656 return (sentenceId==m_trgSntBreaks.size()-1) ?
00657 m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) :
00658 m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId);
00659 }
00660
// Binds the score comparator used to rank candidate phrases.
BetterPhrase::
BetterPhrase(ScoresComp const& sc)
: cmp(sc) {}
00664
00665
00666
00667
00668
00669
00670
00671
00672
// Comparison for (Scores, phrase) candidates: 'a' precedes 'b' when b's
// scores compare below a's under cmp — note the swapped arguments, which
// yields a descending (best-first) ordering.
bool
BetterPhrase::
operator()(pair<Scores, SAPhrase const*> const& a,
pair<Scores, SAPhrase const*> const& b) const
{
return cmp(b.first,a.first);
}
00680
00681 }