00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "util/check.hh"
00024 #include <algorithm>
00025 #include <sstream>
00026 #include <string>
00027 #include "memory.h"
00028 #include "FactorCollection.h"
00029 #include "Phrase.h"
00030 #include "StaticData.h"
00031
00032 #include "util/string_piece.hh"
00033 #include "util/tokenize_piece.hh"
00034
00035 using namespace std;
00036
00037 namespace Moses
00038 {
00039
00040 Phrase::Phrase(size_t reserveSize)
00041 {
00042 m_words.reserve(reserveSize);
00043 }
00044
00045 Phrase::Phrase(const vector< const Word* > &mergeWords)
00046 {
00047 m_words.reserve(mergeWords.size());
00048 for (size_t currPos = 0 ; currPos < mergeWords.size() ; currPos++) {
00049 AddWord(*mergeWords[currPos]);
00050 }
00051 }
00052
00053 Phrase::~Phrase()
00054 {
00055 }
00056
00057 void Phrase::MergeFactors(const Phrase ©)
00058 {
00059 CHECK(GetSize() == copy.GetSize());
00060 size_t size = GetSize();
00061 const size_t maxNumFactors = MAX_NUM_FACTORS;
00062 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00063 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00064 FactorType factorType = static_cast<FactorType>(currFactor);
00065 const Factor *factor = copy.GetFactor(currPos, factorType);
00066 if (factor != NULL)
00067 SetFactor(currPos, factorType, factor);
00068 }
00069 }
00070 }
00071
00072 void Phrase::MergeFactors(const Phrase ©, FactorType factorType)
00073 {
00074 CHECK(GetSize() == copy.GetSize());
00075 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00076 SetFactor(currPos, factorType, copy.GetFactor(currPos, factorType));
00077 }
00078
00079 void Phrase::MergeFactors(const Phrase ©, const std::vector<FactorType>& factorVec)
00080 {
00081 CHECK(GetSize() == copy.GetSize());
00082 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++)
00083 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00084 i != factorVec.end(); ++i) {
00085 SetFactor(currPos, *i, copy.GetFactor(currPos, *i));
00086 }
00087 }
00088
00089
00090 Phrase Phrase::GetSubString(const WordsRange &wordsRange) const
00091 {
00092 Phrase retPhrase(wordsRange.GetNumWordsCovered());
00093
00094 for (size_t currPos = wordsRange.GetStartPos() ; currPos <= wordsRange.GetEndPos() ; currPos++) {
00095 Word &word = retPhrase.AddWord();
00096 word = GetWord(currPos);
00097 }
00098
00099 return retPhrase;
00100 }
00101
00102 std::string Phrase::GetStringRep(const vector<FactorType> factorsToPrint) const
00103 {
00104 stringstream strme;
00105 for (size_t pos = 0 ; pos < GetSize() ; pos++) {
00106 strme << GetWord(pos).GetString(factorsToPrint, (pos != GetSize()-1));
00107 }
00108
00109 return strme.str();
00110 }
00111
00112 Word &Phrase::AddWord()
00113 {
00114 m_words.push_back(Word());
00115 return m_words.back();
00116 }
00117
00118 void Phrase::Append(const Phrase &endPhrase)
00119 {
00120
00121 for (size_t i = 0; i < endPhrase.GetSize(); i++) {
00122 AddWord(endPhrase.GetWord(i));
00123 }
00124 }
00125
00126 void Phrase::PrependWord(const Word &newWord)
00127 {
00128 AddWord();
00129
00130
00131 for (size_t pos = GetSize() - 1; pos >= 1; --pos) {
00132 const Word &word = m_words[pos - 1];
00133 m_words[pos] = word;
00134 }
00135
00136 m_words[0] = newWord;
00137 }
00138
00139 void Phrase::CreateFromString(const std::vector<FactorType> &factorOrder, const StringPiece &phraseString, const StringPiece &factorDelimiter)
00140 {
00141 FactorCollection &factorCollection = FactorCollection::Instance();
00142
00143 for (util::TokenIter<util::AnyCharacter, true> word_it(phraseString, util::AnyCharacter(" \t")); word_it; ++word_it) {
00144 Word &word = AddWord();
00145 size_t index = 0;
00146 for (util::TokenIter<util::MultiCharacter, false> factor_it(*word_it, util::MultiCharacter(factorDelimiter));
00147 factor_it && (index < factorOrder.size());
00148 ++factor_it, ++index) {
00149 word[factorOrder[index]] = factorCollection.AddFactor(*factor_it);
00150 }
00151 if (index != factorOrder.size()) {
00152 TRACE_ERR( "[ERROR] Malformed input: '" << *word_it << "'" << std::endl
00153 << "In '" << phraseString << "'" << endl
00154 << " Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl
00155 << " but instead received input with " << index << " factor(s).\n");
00156 abort();
00157 }
00158 }
00159 }
00160
00161 void Phrase::CreateFromStringNewFormat(FactorDirection direction
00162 , const std::vector<FactorType> &factorOrder
00163 , const std::string &phraseString
00164 , const std::string &
00165 , Word &lhs)
00166 {
00167
00168 vector<string> annotatedWordVector;
00169 Tokenize(annotatedWordVector, phraseString);
00170
00171
00172
00173
00174 m_words.reserve(annotatedWordVector.size()-1);
00175
00176 for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() - 1 ; phrasePos++) {
00177 string &annotatedWord = annotatedWordVector[phrasePos];
00178 bool isNonTerminal;
00179 if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") {
00180
00181 isNonTerminal = true;
00182
00183 size_t nextPos = annotatedWord.find("[", 1);
00184 CHECK(nextPos != string::npos);
00185
00186 if (direction == Input)
00187 annotatedWord = annotatedWord.substr(1, nextPos - 2);
00188 else
00189 annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
00190 } else {
00191 isNonTerminal = false;
00192 }
00193
00194 Word &word = AddWord();
00195 word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);
00196
00197 }
00198
00199
00200 string &annotatedWord = annotatedWordVector.back();
00201 CHECK(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
00202 annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);
00203
00204 lhs.CreateFromString(direction, factorOrder, annotatedWord, true);
00205 CHECK(lhs.IsNonTerminal());
00206 }
00207
00208 int Phrase::Compare(const Phrase &other) const
00209 {
00210 #ifdef min
00211 #undef min
00212 #endif
00213 size_t thisSize = GetSize()
00214 ,compareSize = other.GetSize();
00215 if (thisSize != compareSize) {
00216 return (thisSize < compareSize) ? -1 : 1;
00217 }
00218
00219 for (size_t pos = 0 ; pos < thisSize ; pos++) {
00220 const Word &thisWord = GetWord(pos)
00221 ,&otherWord = other.GetWord(pos);
00222 int ret = Word::Compare(thisWord, otherWord);
00223
00224 if (ret != 0)
00225 return ret;
00226 }
00227
00228 return 0;
00229 }
00230
00231
00232 bool Phrase::Contains(const vector< vector<string> > &subPhraseVector
00233 , const vector<FactorType> &inputFactor) const
00234 {
00235 const size_t subSize = subPhraseVector.size()
00236 ,thisSize= GetSize();
00237 if (subSize > thisSize)
00238 return false;
00239
00240
00241 for (size_t currStartPos = 0 ; currStartPos < (thisSize - subSize + 1) ; currStartPos++) {
00242 bool match = true;
00243
00244 for (size_t currFactorIndex = 0 ; currFactorIndex < inputFactor.size() ; currFactorIndex++) {
00245 FactorType factorType = inputFactor[currFactorIndex];
00246 for (size_t currSubPos = 0 ; currSubPos < subSize ; currSubPos++) {
00247 size_t currThisPos = currSubPos + currStartPos;
00248 const string &subStr = subPhraseVector[currSubPos][currFactorIndex]
00249 ,&thisStr = GetFactor(currThisPos, factorType)->GetString();
00250 if (subStr != thisStr) {
00251 match = false;
00252 break;
00253 }
00254 }
00255 if (!match)
00256 break;
00257 }
00258
00259 if (match)
00260 return true;
00261 }
00262 return false;
00263 }
00264
00265 bool Phrase::IsCompatible(const Phrase &inputPhrase) const
00266 {
00267 if (inputPhrase.GetSize() != GetSize()) {
00268 return false;
00269 }
00270
00271 const size_t size = GetSize();
00272
00273 const size_t maxNumFactors = MAX_NUM_FACTORS;
00274 for (size_t currPos = 0 ; currPos < size ; currPos++) {
00275 for (unsigned int currFactor = 0 ; currFactor < maxNumFactors ; currFactor++) {
00276 FactorType factorType = static_cast<FactorType>(currFactor);
00277 const Factor *thisFactor = GetFactor(currPos, factorType)
00278 ,*inputFactor = inputPhrase.GetFactor(currPos, factorType);
00279 if (thisFactor != NULL && inputFactor != NULL && thisFactor != inputFactor)
00280 return false;
00281 }
00282 }
00283 return true;
00284
00285 }
00286
00287 bool Phrase::IsCompatible(const Phrase &inputPhrase, FactorType factorType) const
00288 {
00289 if (inputPhrase.GetSize() != GetSize()) {
00290 return false;
00291 }
00292 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00293 if (GetFactor(currPos, factorType) != inputPhrase.GetFactor(currPos, factorType))
00294 return false;
00295 }
00296 return true;
00297 }
00298
00299 bool Phrase::IsCompatible(const Phrase &inputPhrase, const std::vector<FactorType>& factorVec) const
00300 {
00301 if (inputPhrase.GetSize() != GetSize()) {
00302 return false;
00303 }
00304 for (size_t currPos = 0 ; currPos < GetSize() ; currPos++) {
00305 for (std::vector<FactorType>::const_iterator i = factorVec.begin();
00306 i != factorVec.end(); ++i) {
00307 if (GetFactor(currPos, *i) != inputPhrase.GetFactor(currPos, *i))
00308 return false;
00309 }
00310 }
00311 return true;
00312 }
00313
00314 size_t Phrase::GetNumTerminals() const
00315 {
00316 size_t ret = 0;
00317
00318 for (size_t pos = 0; pos < GetSize(); ++pos) {
00319 if (!GetWord(pos).IsNonTerminal())
00320 ret++;
00321 }
00322 return ret;
00323 }
00324
00325 void Phrase::InitializeMemPool()
00326 {
00327 }
00328
00329 void Phrase::FinalizeMemPool()
00330 {
00331 }
00332
// NOTE(review): macro defined outside this file; presumably expands to the
// definition of a string-conversion member for Phrase — confirm at its definition.
TO_STRING_BODY(Phrase);
00334
00335
00336 ostream& operator<<(ostream& out, const Phrase& phrase)
00337 {
00338
00339 for (size_t pos = 0 ; pos < phrase.GetSize() ; pos++) {
00340 const Word &word = phrase.GetWord(pos);
00341 out << word;
00342 }
00343 return out;
00344 }
00345
00346 }
00347
00348