00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <fstream>
00024 #include <string>
00025 #include <iterator>
00026 #include <algorithm>
00027 #include <memory>
00028 #include <sys/stat.h>
00029 #include <stdlib.h>
00030 #include "util/file_piece.hh"
00031 #include "util/tokenize_piece.hh"
00032
00033 #include "moses/TranslationModel/PhraseDictionaryMemory.h"
00034 #include "moses/FactorCollection.h"
00035 #include "moses/Word.h"
00036 #include "moses/Util.h"
00037 #include "moses/InputFileStream.h"
00038 #include "moses/StaticData.h"
00039 #include "moses/WordsRange.h"
00040 #include "moses/UserMessage.h"
00041 #include "moses/SparsePhraseDictionaryFeature.h"
00042
00043 using namespace std;
00044
00045 namespace Moses
00046 {
00047
00048 namespace {
00049 void ParserDeath(const std::string &file, size_t line_num) {
00050 stringstream strme;
00051 strme << "Syntax error at " << file << ":" << line_num;
00052 UserMessage::Add(strme.str());
00053 abort();
00054 }
00055 template <class It> StringPiece GrabOrDie(It &it, const std::string &file, size_t line_num) {
00056 if (!it) ParserDeath(file, line_num);
00057 return *it++;
00058 }
00059 }
00060
00061 bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
00062 , const std::vector<FactorType> &output
00063 , const string &filePath
00064 , const vector<float> &weight
00065 , size_t tableLimit
00066 , const LMList &languageModels
00067 , float weightWP)
00068 {
00069 const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();
00070
00071 const StaticData &staticData = StaticData::Instance();
00072
00073 m_tableLimit = tableLimit;
00074
00075 util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);
00076
00077 size_t line_num = 0;
00078 size_t numElement = NOT_FOUND;
00079 const std::string& factorDelimiter = staticData.GetFactorDelimiter();
00080
00081 Phrase sourcePhrase(0);
00082 std::vector<float> scv;
00083 scv.reserve(m_numScoreComponent);
00084
00085 TargetPhraseCollection *preSourceNode = NULL;
00086 std::string preSourceString;
00087
00088 while(true) {
00089 ++line_num;
00090 StringPiece line;
00091 try {
00092 line = inFile.ReadLine();
00093 } catch (util::EndOfFileException &e) {
00094 break;
00095 }
00096
00097 util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
00098 StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
00099 StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
00100 StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));
00101
00102 bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
00103 if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
00104 TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
00105 continue;
00106 }
00107
00108
00109 std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
00110 targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);
00111
00112 scv.clear();
00113 for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
00114 char *err_ind;
00115
00116 scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
00117 if (err_ind == token->data()) {
00118 stringstream strme;
00119 strme << "Bad number " << token << " on line " << line_num;
00120 UserMessage::Add(strme.str());
00121 abort();
00122 }
00123 }
00124 if (scv.size() != m_numScoreComponent) {
00125
00126
00127 if (m_numScoreComponentMultiModel > 0 && scv.size() == m_numScoreComponentMultiModel && m_numScoreComponentMultiModel < m_numScoreComponent) {
00128 scv.resize(m_numScoreComponent);
00129 }
00130 else {
00131 stringstream strme;
00132 strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
00133 UserMessage::Add(strme.str());
00134 abort();
00135 }
00136 }
00137
00138
00139
00140 size_t consumed = 3;
00141 if (pipes) {
00142 targetPhrase->SetAlignmentInfo(*pipes++);
00143 ++consumed;
00144 }
00145
00146 ScoreComponentCollection sparse;
00147 if (pipes) pipes++;
00148 if (pipes) {
00149
00150 SparsePhraseDictionaryFeature* spdf =
00151 GetFeature()->GetSparsePhraseDictionaryFeature();
00152 if (spdf) {
00153 sparse.Assign(spdf,(pipes++)->as_string());
00154 }
00155 }
00156
00157
00158
00159 targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);
00160
00161
00162 for (; pipes; ++pipes, ++consumed) {}
00163 if (numElement != consumed) {
00164 if (numElement == NOT_FOUND) {
00165 numElement = consumed;
00166 } else {
00167 stringstream strme;
00168 strme << "Syntax error at " << filePath << ":" << line_num;
00169 UserMessage::Add(strme.str());
00170 abort();
00171 }
00172 }
00173
00174
00175
00176 sourcePhrase.Clear();
00177 sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
00178
00179 targetPhrase->SetSourcePhrase(sourcePhrase);
00180 if (preSourceString == sourcePhraseString && preSourceNode) {
00181 preSourceNode->Add(targetPhrase.release());
00182 } else {
00183 preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
00184 preSourceNode->Add(targetPhrase.release());
00185 preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
00186 }
00187 }
00188
00189
00190 m_collection.Sort(m_tableLimit);
00191
00192
00193
00194
00195
00196 return true;
00197 }
00198
00199 TargetPhraseCollection *PhraseDictionaryMemory::CreateTargetPhraseCollection(const Phrase &source)
00200 {
00201 const size_t size = source.GetSize();
00202
00203 PhraseDictionaryNode *currNode = &m_collection;
00204 for (size_t pos = 0 ; pos < size ; ++pos) {
00205 const Word& word = source.GetWord(pos);
00206 currNode = currNode->GetOrCreateChild(word);
00207 if (currNode == NULL)
00208 return NULL;
00209 }
00210
00211 return currNode->CreateTargetPhraseCollection();
00212 }
00213
00214 const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(const Phrase &source) const
00215 {
00216
00217 const size_t size = source.GetSize();
00218
00219 const PhraseDictionaryNode *currNode = &m_collection;
00220 for (size_t pos = 0 ; pos < size ; ++pos) {
00221 const Word& word = source.GetWord(pos);
00222 currNode = currNode->GetChild(word);
00223 if (currNode == NULL)
00224 return NULL;
00225 }
00226
00227 return currNode->GetTargetPhraseCollection();
00228 }
00229
00230 PhraseDictionaryMemory::~PhraseDictionaryMemory()
00231 {
00232 }
00233
00234 TO_STRING_BODY(PhraseDictionaryMemory);
00235
00236
00237 ostream& operator<<(ostream& out, const PhraseDictionaryMemory& phraseDict)
00238 {
00239 const PhraseDictionaryNode &coll = phraseDict.m_collection;
00240 PhraseDictionaryNode::const_iterator iter;
00241 for (iter = coll.begin() ; iter != coll.end() ; ++iter) {
00242 const Word &word = (*iter).first;
00243 out << word;
00244 }
00245 return out;
00246 }
00247
00248
00249 }
00250