00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <algorithm>
00024 #include <sstream>
00025 #include <string>
00026 #include "memory.h"
00027 #include "FactorCollection.h"
00028 #include "Phrase.h"
00029 #include "StaticData.h"
00030
00031 #include "util/string_piece.hh"
00032 #include "util/string_stream.hh"
00033 #include "util/tokenize_piece.hh"
00034
00035 using namespace std;
00036
00037 namespace Moses
00038 {
00039
00040 Phrase::Phrase() {}
00041
00042 Phrase::Phrase(size_t reserveSize)
00043 {
00044 m_words.reserve(reserveSize);
00045 }
00046
00047 Phrase::Phrase(const vector< const Word* > &mergeWords)
00048 {
00049 m_words.reserve(mergeWords.size());
00050 for (size_t currPos = 0 ; currPos < mergeWords.size() ; currPos++) {
00051 AddWord(*mergeWords[currPos]);
00052 }
00053 }
00054
00055 Phrase::~Phrase()
00056 {
00057 }
00058
00059 void Phrase::MergeFactors(const Phrase ©)
00060 {
00061 UTIL_THROW_IF2(GetSize() != copy.GetSize(), "Both phrases need to be the same size to merge");
00062 size_t size = GetSize();
00063 const size_t maxNumFactors = MAX_NUM_FACTORS;
00064 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00065 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00066 FactorType factorType = static_cast<FactorType>(currFactor);
00067 const Factor *factor = copy.GetFactor(currPos, factorType);
00068 if (factor != NULL)
00069 SetFactor(currPos, factorType, factor);
00070 }
00071 }
00072 }
00073
00074 void Phrase::MergeFactors(const Phrase ©, FactorType factorType)
00075 {
00076 UTIL_THROW_IF2(GetSize() != copy.GetSize(), "Both phrases need to be the same size to merge");
00077 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00078 SetFactor(currPos, factorType, copy.GetFactor(currPos, factorType));
00079 }
00080
00081 void Phrase::MergeFactors(const Phrase ©, const std::vector<FactorType>& factorVec)
00082 {
00083 UTIL_THROW_IF2(GetSize() != copy.GetSize(), "Both phrases need to be the same size to merge");
00084 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00085 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00086 i != factorVec.end(); ++i) {
00087 SetFactor(currPos, *i, copy.GetFactor(currPos, *i));
00088 }
00089 }
00090
00091
00092 Phrase Phrase::GetSubString(const Range &range) const
00093 {
00094 Phrase retPhrase(range.GetNumWordsCovered());
00095
00096 for (size_t currPos = range.GetStartPos() ; currPos <= range.GetEndPos() ; currPos++) {
00097 Word &word = retPhrase.AddWord();
00098 word = GetWord(currPos);
00099 }
00100
00101 return retPhrase;
00102 }
00103
00104 Phrase Phrase::GetSubString(const Range &range, FactorType factorType) const
00105 {
00106 Phrase retPhrase(range.GetNumWordsCovered());
00107
00108 for (size_t currPos = range.GetStartPos() ; currPos <= range.GetEndPos() ; currPos++) {
00109 const Factor* f = GetFactor(currPos, factorType);
00110 Word &word = retPhrase.AddWord();
00111 word.SetFactor(factorType, f);
00112 }
00113
00114 return retPhrase;
00115 }
00116
00117 std::string
00118 Phrase::
00119 GetStringRep(vector<FactorType> const& factorsToPrint,
00120 AllOptions const* opts) const
00121 {
00122 if (!opts) opts = StaticData::Instance().options().get();
00123 bool markUnk = opts->unk.mark;
00124 util::StringStream strme;
00125 for (size_t pos = 0 ; pos < GetSize() ; pos++) {
00126 if (markUnk && GetWord(pos).IsOOV()) {
00127 strme << opts->unk.prefix;
00128 }
00129 strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
00130 if (markUnk && GetWord(pos).IsOOV()) {
00131 strme << opts->unk.suffix;
00132 }
00133 }
00134 return strme.str();
00135 }
00136
00137 Word &Phrase::AddWord()
00138 {
00139 m_words.push_back(Word());
00140 return m_words.back();
00141 }
00142
00143 void Phrase::Append(const Phrase &endPhrase)
00144 {
00145
00146 for (size_t i = 0; i < endPhrase.GetSize(); i++) {
00147 AddWord(endPhrase.GetWord(i));
00148 }
00149 }
00150
00151 void Phrase::PrependWord(const Word &newWord)
00152 {
00153 AddWord();
00154
00155
00156 for (size_t pos = GetSize() - 1; pos >= 1; --pos) {
00157 const Word &word = m_words[pos - 1];
00158 m_words[pos] = word;
00159 }
00160
00161 m_words[0] = newWord;
00162 }
00163
00164 void Phrase::CreateFromString(FactorDirection direction,
00165 const std::vector<FactorType> &factorOrder,
00166 const StringPiece &phraseString,
00167 Word **lhs)
00168 {
00169
00170 vector<StringPiece> annotatedWordVector;
00171 for (util::TokenIter<util::AnyCharacter, true> it(phraseString, "\t "); it; ++it) {
00172 annotatedWordVector.push_back(*it);
00173 }
00174
00175 if (annotatedWordVector.size() == 0) {
00176 if (lhs) {
00177 (*lhs) = NULL;
00178 }
00179 return;
00180 }
00181
00182
00183
00184
00185
00186 size_t numWords;
00187 const StringPiece &annotatedWord = annotatedWordVector.back();
00188 if (annotatedWord.size() >= 2
00189 && *annotatedWord.data() == '['
00190 && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
00191
00192 numWords = annotatedWordVector.size()-1;
00193
00194
00195 assert(lhs);
00196 (*lhs) = new Word(true);
00197 (*lhs)->CreateFromString(direction, factorOrder, annotatedWord.substr(1, annotatedWord.size() - 2), true);
00198 assert((*lhs)->IsNonTerminal());
00199 } else {
00200 numWords = annotatedWordVector.size();
00201 if (lhs) {
00202 (*lhs) = NULL;
00203 }
00204 }
00205
00206
00207 m_words.reserve(numWords);
00208
00209 for (size_t phrasePos = 0 ; phrasePos < numWords; phrasePos++) {
00210 StringPiece &annotatedWord = annotatedWordVector[phrasePos];
00211 bool isNonTerminal;
00212 if (annotatedWord.size() >= 2 && *annotatedWord.data() == '[' && annotatedWord.data()[annotatedWord.size() - 1] == ']') {
00213
00214 isNonTerminal = true;
00215
00216 size_t nextPos = annotatedWord.find('[', 1);
00217 UTIL_THROW_IF2(nextPos == string::npos,
00218 "Incorrect formatting of non-terminal. Should have 2 non-terms, eg. [X][X]. "
00219 << "Current string: " << annotatedWord);
00220
00221 if (direction == Input)
00222 annotatedWord = annotatedWord.substr(1, nextPos - 2);
00223 else
00224 annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
00225 } else {
00226 isNonTerminal = false;
00227 }
00228
00229 Word &word = AddWord();
00230 word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);
00231
00232 }
00233 }
00234
00235 int Phrase::Compare(const Phrase &other) const
00236 {
00237 #ifdef min
00238 #undef min
00239 #endif
00240 size_t thisSize = GetSize()
00241 ,compareSize = other.GetSize();
00242 if (thisSize != compareSize) {
00243 return (thisSize < compareSize) ? -1 : 1;
00244 }
00245
00246 for (size_t pos = 0 ; pos < thisSize ; pos++) {
00247 const Word &thisWord = GetWord(pos)
00248 ,&otherWord = other.GetWord(pos);
00249 int ret = Word::Compare(thisWord, otherWord);
00250
00251 if (ret != 0)
00252 return ret;
00253 }
00254
00255 return 0;
00256 }
00257
00258 size_t Phrase::hash() const
00259 {
00260 size_t seed = 0;
00261 for (size_t i = 0; i < GetSize(); ++i) {
00262 boost::hash_combine(seed, GetWord(i));
00263 }
00264 return seed;
00265 }
00266
00267 bool Phrase::operator== (const Phrase &other) const
00268 {
00269 size_t thisSize = GetSize()
00270 ,compareSize = other.GetSize();
00271 if (thisSize != compareSize) {
00272 return false;
00273 }
00274
00275 for (size_t pos = 0 ; pos < thisSize ; pos++) {
00276 const Word &thisWord = GetWord(pos)
00277 ,&otherWord = other.GetWord(pos);
00278 bool ret = thisWord == otherWord;
00279 if (!ret) {
00280 return false;
00281 }
00282 }
00283
00284 return true;
00285 }
00286
00287
00288 bool Phrase::Contains(const vector< vector<string> > &subPhraseVector
00289 , const vector<FactorType> &inputFactor) const
00290 {
00291 const size_t subSize = subPhraseVector.size()
00292 ,thisSize= GetSize();
00293 if (subSize > thisSize)
00294 return false;
00295
00296
00297 for (size_t currStartPos = 0 ; currStartPos < (thisSize - subSize + 1) ; currStartPos++) {
00298 bool match = true;
00299
00300 for (size_t currFactorIndex = 0 ; currFactorIndex < inputFactor.size() ; currFactorIndex++) {
00301 FactorType factorType = inputFactor[currFactorIndex];
00302 for (size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos++) {
00303 size_t currThisPos = currSubPos + currStartPos;
00304 const string &subStr = subPhraseVector[currSubPos][currFactorIndex];
00305 StringPiece thisStr = GetFactor(currThisPos, factorType)->GetString();
00306 if (subStr != thisStr) {
00307 match = false;
00308 break;
00309 }
00310 }
00311 if (!match)
00312 break;
00313 }
00314
00315 if (match)
00316 return true;
00317 }
00318 return false;
00319 }
00320
00321 bool Phrase::IsCompatible(const Phrase &inputPhrase) const
00322 {
00323 if (inputPhrase.GetSize() != GetSize()) {
00324 return false;
00325 }
00326
00327 const size_t size = GetSize();
00328
00329 const size_t maxNumFactors = MAX_NUM_FACTORS;
00330 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00331 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00332 FactorType factorType = static_cast<FactorType>(currFactor);
00333 const Factor *thisFactor = GetFactor(currPos, factorType)
00334 ,*inputFactor = inputPhrase.GetFactor(currPos, factorType);
00335 if (thisFactor != NULL && inputFactor != NULL && thisFactor != inputFactor)
00336 return false;
00337 }
00338 }
00339 return true;
00340
00341 }
00342
00343 bool Phrase::IsCompatible(const Phrase &inputPhrase, FactorType factorType) const
00344 {
00345 if (inputPhrase.GetSize() != GetSize()) {
00346 return false;
00347 }
00348 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00349 if (GetFactor(currPos, factorType) != inputPhrase.GetFactor(currPos, factorType))
00350 return false;
00351 }
00352 return true;
00353 }
00354
00355 bool Phrase::IsCompatible(const Phrase &inputPhrase, const std::vector<FactorType>& factorVec) const
00356 {
00357 if (inputPhrase.GetSize() != GetSize()) {
00358 return false;
00359 }
00360 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00361 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00362 i != factorVec.end(); ++i) {
00363 if (GetFactor(currPos, *i) != inputPhrase.GetFactor(currPos, *i))
00364 return false;
00365 }
00366 }
00367 return true;
00368 }
00369
00370 size_t Phrase::GetNumTerminals() const
00371 {
00372 size_t ret = 0;
00373
00374 for (size_t pos = 0; pos < GetSize(); ++pos) {
00375 if (!GetWord(pos).IsNonTerminal())
00376 ret++;
00377 }
00378 return ret;
00379 }
00380
00381 void Phrase::InitializeMemPool()
00382 {
00383 }
00384
00385 void Phrase::FinalizeMemPool()
00386 {
00387 }
00388
00389 void Phrase::OnlyTheseFactors(const FactorMask &factors)
00390 {
00391 for (unsigned int currFactor = 0 ; currFactor < MAX_NUM_FACTORS ; currFactor++) {
00392 if (!factors[currFactor]) {
00393 for (size_t pos = 0; pos < GetSize(); ++pos) {
00394 SetFactor(pos, currFactor, NULL);
00395 }
00396 }
00397 }
00398 }
00399
00400 void Phrase::InitStartEndWord()
00401 {
00402 FactorCollection &factorCollection = FactorCollection::Instance();
00403
00404 Word startWord(Input);
00405 const Factor *factor = factorCollection.AddFactor(Input, 0, BOS_);
00406 startWord.SetFactor(0, factor);
00407 PrependWord(startWord);
00408
00409 Word endWord(Input);
00410 factor = factorCollection.AddFactor(Input, 0, EOS_);
00411 endWord.SetFactor(0, factor);
00412 AddWord(endWord);
00413 }
00414
00415 size_t Phrase::Find(const Phrase &sought, int maxUnknown) const
00416 {
00417 if (GetSize() < sought.GetSize()) {
00418
00419 return NOT_FOUND;
00420 }
00421
00422 size_t maxStartPos = GetSize() - sought.GetSize();
00423 for (size_t startThisPos = 0; startThisPos <= maxStartPos; ++startThisPos) {
00424 size_t thisPos = startThisPos;
00425 int currUnknowns = 0;
00426 size_t soughtPos;
00427 for (soughtPos = 0; soughtPos < sought.GetSize(); ++soughtPos) {
00428 const Word &soughtWord = sought.GetWord(soughtPos);
00429 const Word &thisWord = GetWord(thisPos);
00430
00431 if (soughtWord == thisWord) {
00432 ++thisPos;
00433 } else if (soughtWord.IsOOV() && (maxUnknown < 0 || currUnknowns < maxUnknown)) {
00434
00435 ++currUnknowns;
00436 ++thisPos;
00437 } else {
00438 break;
00439 }
00440 }
00441
00442 if (soughtPos == sought.GetSize()) {
00443 return startThisPos;
00444 }
00445 }
00446
00447 return NOT_FOUND;
00448 }
00449
00450 TO_STRING_BODY(Phrase);
00451
00452
00453 ostream& operator<<(ostream& out, const Phrase& phrase)
00454 {
00455
00456 for (size_t pos = 0 ; pos < phrase.GetSize() ; pos++) {
00457 const Word &word = phrase.GetWord(pos);
00458 out << word;
00459 }
00460 return out;
00461 }
00462
00463 }
00464
00465