00001 #include "BilingualDynSuffixArray.h"
00002 #include "DynSAInclude/utils.h"
00003 #include "FactorCollection.h"
00004 #include "StaticData.h"
00005 #include "TargetPhrase.h"
00006 #include <iomanip>
00007
00008 using namespace std;
00009
00010 namespace Moses {
00011
00012 BilingualDynSuffixArray::BilingualDynSuffixArray():
00013 m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()),
00014 m_maxSampleSize(20)
00015 {
00016 m_srcSA = 0;
00017 m_trgSA = 0;
00018 m_srcCorpus = new std::vector<wordID_t>();
00019 m_trgCorpus = new std::vector<wordID_t>();
00020 m_srcVocab = new Vocab(false);
00021 m_trgVocab = new Vocab(false);
00022 m_scoreCmp = 0;
00023 }
00024
00025 BilingualDynSuffixArray::~BilingualDynSuffixArray()
00026 {
00027 if(m_srcSA) delete m_srcSA;
00028 if(m_trgSA) delete m_trgSA;
00029 if(m_srcVocab) delete m_srcVocab;
00030 if(m_trgVocab) delete m_trgVocab;
00031 if(m_srcCorpus) delete m_srcCorpus;
00032 if(m_trgCorpus) delete m_trgCorpus;
00033 if(m_scoreCmp) delete m_scoreCmp;
00034 }
00035
00036 bool BilingualDynSuffixArray::Load(
00037 const std::vector<FactorType>& inputFactors,
00038 const std::vector<FactorType>& outputFactors,
00039 std::string source, std::string target, std::string alignments,
00040 const std::vector<float> &weight)
00041 {
00042 m_inputFactors = inputFactors;
00043 m_outputFactors = outputFactors;
00044
00045 m_scoreCmp = new ScoresComp(weight);
00046 InputFileStream sourceStrme(source);
00047 InputFileStream targetStrme(target);
00048 cerr << "Loading source corpus...\n";
00049 LoadCorpus(sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab);
00050 cerr << "Loading target corpus...\n";
00051 LoadCorpus(targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab);
00052 CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size());
00053
00054
00055 cerr << "Building Source Suffix Array...\n";
00056 m_srcSA = new DynSuffixArray(m_srcCorpus);
00057 if(!m_srcSA) return false;
00058 cerr << "Building Target Suffix Array...\n";
00059
00060
00061 cerr << "\t(Skipped. Not used)\n";
00062
00063 InputFileStream alignStrme(alignments);
00064 cerr << "Loading Alignment File...\n";
00065 LoadRawAlignments(alignStrme);
00066
00067 cerr << "Building frequent word cache...\n";
00068 CacheFreqWords();
00069 return true;
00070 }
00071
00072 int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align)
00073 {
00074
00075 std::string line;
00076 std::vector<int> vtmp;
00077 while(getline(align, line)) {
00078 Utils::splitToInt(line, vtmp, "- ");
00079 CHECK(vtmp.size() % 2 == 0);
00080 std::vector<short> vAlgn;
00081 for (std::vector<int>::const_iterator itr = vtmp.begin();
00082 itr != vtmp.end(); ++itr) {
00083 vAlgn.push_back(short(*itr));
00084 }
00085 m_rawAlignments.push_back(vAlgn);
00086 }
00087 return m_rawAlignments.size();
00088 }
00089 int BilingualDynSuffixArray::LoadRawAlignments(string& align) {
00090
00091 vector<int> vtmp;
00092 Utils::splitToInt(align, vtmp, "- ");
00093 CHECK(vtmp.size() % 2 == 0);
00094 vector<short> vAlgn;
00095 for (std::vector<int>::const_iterator itr = vtmp.begin();
00096 itr != vtmp.end(); ++itr) {
00097 vAlgn.push_back(short(*itr));
00098 }
00099 m_rawAlignments.push_back(vAlgn);
00100 return m_rawAlignments.size();
00101 }
00102
00103 int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align)
00104 {
00105 std::string line;
00106 std::vector<int> vtmp;
00107 int sntIndex(0);
00108
00109 while(getline(align, line)) {
00110 Utils::splitToInt(line, vtmp, "- ");
00111 CHECK(vtmp.size() % 2 == 0);
00112
00113 int sourceSize = GetSourceSentenceSize(sntIndex);
00114 int targetSize = GetTargetSentenceSize(sntIndex);
00115
00116 SentenceAlignment curSnt(sntIndex, sourceSize, targetSize);
00117 for(int i=0; i < (int)vtmp.size(); i+=2) {
00118 int sourcePos = vtmp[i];
00119 int targetPos = vtmp[i+1];
00120 CHECK(sourcePos < sourceSize);
00121 CHECK(targetPos < targetSize);
00122
00123 curSnt.alignedList[sourcePos].push_back(targetPos);
00124 curSnt.numberAligned[targetPos]++;
00125 }
00126 curSnt.srcSnt = m_srcCorpus + sntIndex;
00127 curSnt.trgSnt = m_trgCorpus + sntIndex;
00128 m_alignments.push_back(curSnt);
00129
00130 sntIndex++;
00131 }
00132 return m_alignments.size();
00133 }
00134
00135 SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const
00136 {
00137
00138 int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex);
00139 int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex);
00140 std::vector<short> alignment = m_rawAlignments.at(sntIndex);
00141 SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract);
00142 for(size_t i=0; i < alignment.size(); i+=2) {
00143 int sourcePos = alignment[i];
00144 int targetPos = alignment[i+1];
00145 if(trg2Src) {
00146 curSnt.alignedList[targetPos].push_back(sourcePos);
00147 curSnt.numberAligned[sourcePos]++;
00148 }
00149 else {
00150 curSnt.alignedList[sourcePos].push_back(targetPos);
00151 curSnt.numberAligned[targetPos]++;
00152 }
00153 }
00154 curSnt.srcSnt = m_srcCorpus + sntIndex;
00155 curSnt.trgSnt = m_trgCorpus + sntIndex;
00156
00157 return curSnt;
00158 }
00159
00160 bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,
00161 const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const
00162 {
00163
00164
00165 SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src);
00166
00167 int beginSentence = m_srcSntBreaks[sntIndex];
00168 int rightIdx = wordIndex - beginSentence
00169 ,leftIdx = rightIdx - sourceSize + 1;
00170 return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx);
00171 }
00172
00173 void BilingualDynSuffixArray::CleanUp()
00174 {
00175
00176 }
00177
00178 int BilingualDynSuffixArray::LoadCorpus(InputFileStream& corpus, const FactorList& factors,
00179 std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray,
00180 Vocab* vocab)
00181 {
00182 std::string line, word;
00183 int sntIdx(0);
00184
00185 const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00186 while(getline(corpus, line)) {
00187 sntArray.push_back(sntIdx);
00188 Phrase phrase(ARRAY_SIZE_INCR);
00189
00190 phrase.CreateFromString( factors, line, factorDelimiter);
00191
00192 for( size_t i = 0; i < phrase.GetSize(); ++i) {
00193 cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) );
00194 }
00195 sntIdx += phrase.GetSize();
00196 }
00197
00198 vocab->MakeClosed();
00199 return cArray.size();
00200 }
00201
00202 bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
00203 {
00204
00205 size_t phraseSize = src.GetSize();
00206 for (size_t pos = 0; pos < phraseSize; ++pos) {
00207 const Word &word = src.GetWord(pos);
00208 wordID_t arrayId = m_srcVocab->GetWordID(word);
00209 if (arrayId == m_srcVocab->GetkOOVWordID())
00210 {
00211 return false;
00212 }
00213 else
00214 {
00215 output.SetId(pos, arrayId);
00216
00217 }
00218 }
00219 return true;
00220 }
00221
00222 pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const
00223 {
00224
00225 float srcLexWeight(1.0), trgLexWeight(1.0);
00226 std::map<pair<wordID_t, wordID_t>, float> targetProbs;
00227
00228 const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex);
00229 std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache;
00230
00231 for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) {
00232 float srcSumPairProbs(0);
00233 wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]);
00234 const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx);
00235
00236 if(srcWordAlignments.size() == 0) {
00237 pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID());
00238 itrCache = m_wordPairCache.find(wordpair);
00239 if(itrCache == m_wordPairCache.end()) {
00240 CacheWordProbs(srcWord);
00241 itrCache = m_wordPairCache.find(wordpair);
00242 }
00243 CHECK(itrCache != m_wordPairCache.end());
00244 srcSumPairProbs += itrCache->second.first;
00245 targetProbs[wordpair] = itrCache->second.second;
00246 }
00247 else {
00248 for(size_t i = 0; i < srcWordAlignments.size(); ++i) {
00249 int trgIdx = srcWordAlignments[i];
00250 wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
00251
00252 pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord);
00253 itrCache = m_wordPairCache.find(wordpair);
00254 if(itrCache == m_wordPairCache.end()) {
00255 CacheWordProbs(srcWord);
00256 itrCache = m_wordPairCache.find(wordpair);
00257 }
00258 CHECK(itrCache != m_wordPairCache.end());
00259 srcSumPairProbs += itrCache->second.first;
00260 targetProbs[wordpair] = itrCache->second.second;
00261 }
00262 }
00263 float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size());
00264 srcLexWeight *= (srcNormalizer * srcSumPairProbs);
00265 }
00266 for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) {
00267 float trgSumPairProbs(0);
00268 wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]);
00269 for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr
00270 = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) {
00271 if(trgItr->first.second == trgWord)
00272 trgSumPairProbs += trgItr->second;
00273 }
00274 if(trgSumPairProbs == 0) continue;
00275 int noAligned = alignment.numberAligned.at(trgIdx);
00276 float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned);
00277 trgLexWeight *= (trgNormalizer * trgSumPairProbs);
00278 }
00279
00280 return pair<float, float>(srcLexWeight, trgLexWeight);
00281 }
00282 void BilingualDynSuffixArray::CacheFreqWords() const {
00283 std::multimap<int, wordID_t> wordCnts;
00284
00285 Vocab::Word2Id::const_iterator it;
00286 for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) {
00287
00288 wordID_t srcWord = it->second;
00289 std::vector<wordID_t> sword(1, srcWord), wrdIndices;
00290 m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
00291 if(wrdIndices.size() >= 1000) {
00292 wordCnts.insert(make_pair(wrdIndices.size(), srcWord));
00293 }
00294 }
00295 int numSoFar(0);
00296 std::multimap<int, wordID_t>::reverse_iterator ritr;
00297 for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) {
00298 m_freqWordsCached.insert(ritr->second);
00299 CacheWordProbs(ritr->second);
00300 if(++numSoFar == 50) break;
00301 }
00302 cerr << "\tCached " << m_freqWordsCached.size() << " source words\n";
00303 }
00304 void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const
00305 {
00306 std::map<wordID_t, int> counts;
00307 std::vector<wordID_t> sword(1, srcWord), wrdIndices;
00308 bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices);
00309 CHECK(ret);
00310 std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks);
00311 float denom(0);
00312
00313 for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
00314 int sntIdx = sntIndexes.at(snt);
00315 CHECK(sntIdx != -1);
00316 int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx);
00317 const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx);
00318 if(srcAlg.size() == 0) {
00319 ++counts[m_srcVocab->GetkOOVWordID()];
00320 ++denom;
00321 }
00322 else {
00323 for(size_t i=0; i < srcAlg.size(); ++i) {
00324 wordID_t trgWord = m_trgCorpus->at(srcAlg[i] + m_trgSntBreaks[sntIdx]);
00325 ++counts[trgWord];
00326 ++denom;
00327 }
00328 }
00329 }
00330
00331
00332 for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin();
00333 itrCnt != counts.end(); ++itrCnt) {
00334 pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first);
00335 float srcTrgPrb = float(itrCnt->second) / float(denom);
00336 float trgSrcPrb = float(itrCnt->second) / float(counts.size());
00337 m_wordPairCache[wordPair] = pair<float, float>(srcTrgPrb, trgSrcPrb);
00338 }
00339 }
00340
00341 SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
00342 {
00343
00344 SAPhrase phraseIds(phrasepair.GetTargetSize());
00345 int sntIndex = phrasepair.m_sntIndex;
00346 int id(-1), pos(0);
00347 for(int i=phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) {
00348 id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
00349 phraseIds.SetId(pos++, id);
00350 }
00351 return phraseIds;
00352 }
00353
00354 TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
00355 {
00356 TargetPhrase* targetPhrase = new TargetPhrase(Output);
00357 for(size_t i=0; i < phrase.words.size(); ++i) {
00358 Word& word = m_trgVocab->GetWord( phrase.words[i]);
00359 CHECK(word != m_trgVocab->GetkOOVWord());
00360 targetPhrase->AddWord(word);
00361 }
00362
00363 return targetPhrase;
00364 }
00365
00366 void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const
00367 {
00368
00369 size_t sourceSize = src.GetSize();
00370 SAPhrase localIDs(sourceSize);
00371 if(!GetLocalVocabIDs(src, localIDs)) return;
00372 float totalTrgPhrases(0);
00373 std::map<SAPhrase, int> phraseCounts;
00374
00375 std::map<SAPhrase, pair<float, float> > lexicalWeights;
00376 std::map<SAPhrase, pair<float, float> >::iterator itrLexW;
00377 std::vector<unsigned> wrdIndices;
00378
00379 if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return;
00380 SampleSelection(wrdIndices);
00381 std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks);
00382
00383 for(size_t snt = 0; snt < sntIndexes.size(); ++snt) {
00384 std::vector<PhrasePair*> phrasePairs;
00385 int sntIndex = sntIndexes.at(snt);
00386 if(sntIndex == -1) continue;
00387 ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs);
00388
00389 totalTrgPhrases += phrasePairs.size();
00390 std::vector<PhrasePair*>::iterator iterPhrasePair;
00391 for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) {
00392 SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair);
00393 phraseCounts[phrase]++;
00394
00395
00396 pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair);
00397 itrLexW = lexicalWeights.find(phrase);
00398 if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first))
00399 itrLexW->second = lexWeight;
00400 else lexicalWeights[phrase] = lexWeight;
00401 }
00402
00403 RemoveAllInColl(phrasePairs);
00404 }
00405
00406 std::map<SAPhrase, int>::const_iterator iterPhrases;
00407 std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp);
00408
00409 for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) {
00410 float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases;
00411 itrLexW = lexicalWeights.find(iterPhrases->first);
00412 CHECK(itrLexW != lexicalWeights.end());
00413 Scores scoreVector(3);
00414 scoreVector[0] = trg2SrcMLE;
00415 scoreVector[1] = itrLexW->second.first;
00416 scoreVector[2] = 2.718;
00417 phraseScores.insert(make_pair(scoreVector, &iterPhrases->first));
00418 }
00419
00420 std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr;
00421 for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) {
00422 Scores scoreVector = ritr->first;
00423 TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second);
00424 target.push_back(make_pair( scoreVector, targetPhrase));
00425 if(target.size() == m_maxSampleSize) break;
00426 }
00427 }
00428
00429 std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices,
00430 const int sourceSize, const std::vector<unsigned>& sntBreaks) const
00431 {
00432 std::vector<unsigned>::const_iterator vit;
00433 std::vector<int> sntIndexes;
00434 for(size_t i=0; i < wrdIndices.size(); ++i) {
00435 vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]);
00436 int index = int(vit - sntBreaks.begin()) - 1;
00437
00438 if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index))
00439 sntIndexes.push_back(-1);
00440 else
00441 sntIndexes.push_back(index);
00442 }
00443 return sntIndexes;
00444 }
00445
00446 int BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample,
00447 int sampleSize) const
00448 {
00449
00450 if(sample.size() > sampleSize)
00451 sample.erase(sample.begin()+sampleSize, sample.end());
00452 return sample.size();
00453 }
00454
00455 void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) {
00456 vuint_t srcFactor, trgFactor;
00457 cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl;
00458 const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00459 const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size();
00460 cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl;
00461 Phrase sphrase(ARRAY_SIZE_INCR);
00462 sphrase.CreateFromString(m_inputFactors, source, factorDelimiter);
00463 m_srcVocab->MakeOpen();
00464 wordID_t sIDs[sphrase.GetSize()];
00465
00466 for(int i = sphrase.GetSize()-1; i >= 0; --i) {
00467 sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i));
00468 }
00469 for(size_t i = 0; i < sphrase.GetSize(); ++i) {
00470 srcFactor.push_back(sIDs[i]);
00471 cerr << "srcFactor[" << (srcFactor.size() - 1) << "] = " << srcFactor.back() << endl;
00472 m_srcCorpus->push_back(srcFactor.back());
00473 }
00474 m_srcSntBreaks.push_back(oldSrcCrpSize);
00475 m_srcVocab->MakeClosed();
00476 Phrase tphrase(ARRAY_SIZE_INCR);
00477 tphrase.CreateFromString(m_outputFactors, target, factorDelimiter);
00478 m_trgVocab->MakeOpen();
00479 wordID_t tIDs[tphrase.GetSize()];
00480 for(int i = tphrase.GetSize()-1; i >= 0; --i) {
00481 tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i));
00482 }
00483 for(size_t i = 0; i < tphrase.GetSize(); ++i) {
00484 trgFactor.push_back(tIDs[i]);
00485 cerr << "trgFactor[" << (trgFactor.size() - 1) << "] = " << trgFactor.back() << endl;
00486 m_trgCorpus->push_back(trgFactor.back());
00487 }
00488 cerr << "gets to 1\n";
00489 m_trgSntBreaks.push_back(oldTrgCrpSize);
00490 cerr << "gets to 2\n";
00491 m_srcSA->Insert(&srcFactor, oldSrcCrpSize);
00492 cerr << "gets to 3\n";
00493
00494 LoadRawAlignments(alignment);
00495 m_trgVocab->MakeClosed();
00496
00497
00498
00499 }
00500 void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) {
00501 if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end())
00502 return;
00503 std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it,
00504 first, last;
00505 for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) {
00506 if(it->first.first == srcWord) {
00507 first = it;
00508 last = it++;
00509 while(it != m_wordPairCache.end() && (it->first.first == srcWord)) {
00510 last = it++;
00511 }
00512 }
00513 m_wordPairCache.erase(first, last);
00514 }
00515 }
00516 SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize)
00517 :m_sntIndex(sntIndex)
00518 ,numberAligned(targetSize, 0)
00519 ,alignedList(sourceSize)
00520 {
00521 for(int i=0; i < sourceSize; ++i) {
00522 std::vector<int> trgWrd;
00523 alignedList[i] = trgWrd;
00524 }
00525 }
00526
00527 bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const
00528 {
00529
00530
00531 int countTarget = numberAligned.size();
00532
00533 int minTarget = 9999;
00534 int maxTarget = -1;
00535 std::vector< int > usedTarget = numberAligned;
00536 for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++)
00537 {
00538 for(int ind=0; ind < (int)alignedList[sourcePos].size();ind++)
00539 {
00540 int targetPos = alignedList[sourcePos][ind];
00541
00542 if (targetPos<minTarget) { minTarget = targetPos; }
00543 if (targetPos>maxTarget) { maxTarget = targetPos; }
00544 usedTarget[ targetPos ]--;
00545 }
00546 }
00547
00548
00549
00550 if (maxTarget >= 0 &&
00551 maxTarget-minTarget < maxPhraseLength)
00552 {
00553
00554
00555 bool out_of_bounds = false;
00556 for(int targetPos=minTarget; targetPos <= maxTarget && !out_of_bounds; targetPos++)
00557 {
00558 if (usedTarget[targetPos]>0)
00559 {
00560
00561 out_of_bounds = true;
00562 }
00563 }
00564
00565
00566 if (!out_of_bounds)
00567 {
00568
00569 for(int startTarget = minTarget;
00570 (startTarget >= 0 &&
00571 startTarget > maxTarget-maxPhraseLength &&
00572 (startTarget==minTarget || numberAligned[startTarget]==0));
00573 startTarget--)
00574 {
00575
00576 for (int endTarget=maxTarget;
00577 (endTarget<countTarget &&
00578 endTarget<startTarget+maxPhraseLength &&
00579 (endTarget==maxTarget || numberAligned[endTarget]==0));
00580 endTarget++)
00581 {
00582 PhrasePair *phrasePair = new PhrasePair(startTarget,endTarget,startSource,endSource, m_sntIndex);
00583 ret.push_back(phrasePair);
00584 }
00585 }
00586 }
00587 }
00588 return (ret.size() > 0);
00589
00590 }
00591
00592 }