00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <fstream>
00024 #include <string>
00025 #include <iterator>
00026 #include <algorithm>
00027 #include <memory>
00028 #include <sys/stat.h>
00029 #include <stdlib.h>
00030 #include "util/file_piece.hh"
00031 #include "util/tokenize_piece.hh"
00032
00033 #include "PhraseDictionaryMemory.h"
00034 #include "FactorCollection.h"
00035 #include "Word.h"
00036 #include "Util.h"
00037 #include "InputFileStream.h"
00038 #include "StaticData.h"
00039 #include "WordsRange.h"
00040 #include "UserMessage.h"
00041
00042 using namespace std;
00043
00044 namespace Moses
00045 {
00046
00047 namespace {
00048 void ParserDeath(const std::string &file, size_t line_num) {
00049 stringstream strme;
00050 strme << "Syntax error at " << file << ":" << line_num;
00051 UserMessage::Add(strme.str());
00052 abort();
00053 }
00054 template <class It> StringPiece GrabOrDie(It &it, const std::string &file, size_t line_num) {
00055 if (!it) ParserDeath(file, line_num);
00056 return *it++;
00057 }
00058 }
00059
00060 bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
00061 , const std::vector<FactorType> &output
00062 , const string &filePath
00063 , const vector<float> &weight
00064 , size_t tableLimit
00065 , const LMList &languageModels
00066 , float weightWP)
00067 {
00068 const StaticData &staticData = StaticData::Instance();
00069
00070 m_tableLimit = tableLimit;
00071
00072 util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);
00073
00074 size_t line_num = 0;
00075 size_t numElement = NOT_FOUND;
00076 const std::string& factorDelimiter = staticData.GetFactorDelimiter();
00077
00078 Phrase sourcePhrase(0);
00079 std::vector<float> scv;
00080 scv.reserve(m_numScoreComponent);
00081
00082 TargetPhraseCollection *preSourceNode = NULL;
00083 std::string preSourceString;
00084
00085 while(true) {
00086 ++line_num;
00087 StringPiece line;
00088 try {
00089 line = inFile.ReadLine();
00090 } catch (util::EndOfFileException &e) {
00091 break;
00092 }
00093
00094 util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
00095 StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
00096 StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
00097 StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));
00098
00099 bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
00100 if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
00101 TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
00102 continue;
00103 }
00104
00105
00106 std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase(Output));
00107 targetPhrase->SetSourcePhrase(&sourcePhrase);
00108 targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);
00109
00110 scv.clear();
00111 for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
00112 char *err_ind;
00113
00114 scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
00115 if (err_ind == token->data()) {
00116 stringstream strme;
00117 strme << "Bad number " << token << " on line " << line_num;
00118 UserMessage::Add(strme.str());
00119 abort();
00120 }
00121 }
00122 if (scv.size() != m_numScoreComponent) {
00123 stringstream strme;
00124 strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
00125 UserMessage::Add(strme.str());
00126 abort();
00127 }
00128
00129 targetPhrase->SetScore(m_feature, scv, weight, weightWP, languageModels);
00130
00131 size_t consumed = 3;
00132 if (pipes) {
00133 targetPhrase->SetAlignmentInfo(*pipes++);
00134 ++consumed;
00135 }
00136
00137 for (; pipes; ++pipes, ++consumed) {}
00138 if (numElement != consumed) {
00139 if (numElement == NOT_FOUND) {
00140 numElement = consumed;
00141 } else {
00142 stringstream strme;
00143 strme << "Syntax error at " << filePath << ":" << line_num;
00144 UserMessage::Add(strme.str());
00145 abort();
00146 }
00147 }
00148
00149
00150 if (preSourceString == sourcePhraseString && preSourceNode) {
00151 preSourceNode->Add(targetPhrase.release());
00152 } else {
00153 sourcePhrase.Clear();
00154 sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
00155 preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
00156 preSourceNode->Add(targetPhrase.release());
00157 preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
00158 }
00159 }
00160
00161
00162 m_collection.Sort(m_tableLimit);
00163
00164 return true;
00165 }
00166
00167 TargetPhraseCollection *PhraseDictionaryMemory::CreateTargetPhraseCollection(const Phrase &source)
00168 {
00169 const size_t size = source.GetSize();
00170
00171 PhraseDictionaryNode *currNode = &m_collection;
00172 for (size_t pos = 0 ; pos < size ; ++pos) {
00173 const Word& word = source.GetWord(pos);
00174 currNode = currNode->GetOrCreateChild(word);
00175 if (currNode == NULL)
00176 return NULL;
00177 }
00178
00179 return currNode->CreateTargetPhraseCollection();
00180 }
00181
00182 const TargetPhraseCollection *PhraseDictionaryMemory::GetTargetPhraseCollection(const Phrase &source) const
00183 {
00184
00185 const size_t size = source.GetSize();
00186
00187 const PhraseDictionaryNode *currNode = &m_collection;
00188 for (size_t pos = 0 ; pos < size ; ++pos) {
00189 const Word& word = source.GetWord(pos);
00190 currNode = currNode->GetChild(word);
00191 if (currNode == NULL)
00192 return NULL;
00193 }
00194
00195 return currNode->GetTargetPhraseCollection();
00196 }
00197
00198 PhraseDictionaryMemory::~PhraseDictionaryMemory()
00199 {
00200 }
00201
00202 TO_STRING_BODY(PhraseDictionaryMemory);
00203
00204
00205 ostream& operator<<(ostream& out, const PhraseDictionaryMemory& phraseDict)
00206 {
00207 const PhraseDictionaryNode &coll = phraseDict.m_collection;
00208 PhraseDictionaryNode::const_iterator iter;
00209 for (iter = coll.begin() ; iter != coll.end() ; ++iter) {
00210 const Word &word = (*iter).first;
00211 out << word;
00212 }
00213 return out;
00214 }
00215
00216
00217 }
00218