00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "LoaderStandard.h"
00021
00022 #include <fstream>
00023 #include <string>
00024 #include <iterator>
00025 #include <algorithm>
00026 #include <iostream>
00027 #include <sys/stat.h>
00028 #include <cstdlib>
00029 #include <boost/algorithm/string/predicate.hpp>
00030 #include "Trie.h"
00031 #include "moses/FactorCollection.h"
00032 #include "moses/Word.h"
00033 #include "moses/Util.h"
00034 #include "moses/InputFileStream.h"
00035 #include "moses/StaticData.h"
00036 #include "moses/Range.h"
00037 #include "moses/ChartTranslationOptionList.h"
00038 #include "moses/FactorCollection.h"
00039 #include "util/file_piece.hh"
00040 #include "util/string_piece.hh"
00041 #include "util/tokenize_piece.hh"
00042 #include "util/double-conversion/double-conversion.h"
00043 #include "util/exception.hh"
00044
00045 using namespace std;
00046 using namespace boost::algorithm;
00047
00048 namespace Moses
00049 {
00050
00051 bool
00052 RuleTableLoaderStandard::
00053 Load(AllOptions const& opts
00054 , const std::vector<FactorType> &input
00055 , const std::vector<FactorType> &output
00056 , const std::string &inFile
00057 , size_t tableLimit
00058 , RuleTableTrie &ruleTable)
00059 {
00060 return Load(opts, MosesFormat,input, output ,inFile ,tableLimit ,ruleTable);
00061 }
00062
00063 void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
00064 {
00065 vector<string> toks;
00066 Tokenize(toks, phrase, " ");
00067
00068 for (size_t i = 0; i < toks.size(); ++i) {
00069 string &tok = toks[i];
00070 if (starts_with(tok, "[") && ends_with(tok, "]")) {
00071
00072 vector<string> split = Tokenize(tok, ",");
00073 UTIL_THROW_IF2(split.size() != 2,
00074 "Incorrectly formmatted non-terminal: " << tok);
00075
00076 tok = "[X]" + split[0] + "]";
00077 size_t coIndex = Scan<size_t>(split[1]);
00078
00079 pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
00080 if (sourceTarget == 0) {
00081 alignPoint.first = i;
00082 } else {
00083 alignPoint.second = i;
00084 }
00085 }
00086 }
00087
00088 phrase = Join(" ", toks) + " [X]";
00089
00090 }
00091
00092 void ReformateHieroScore(string &scoreString)
00093 {
00094 vector<string> toks;
00095 Tokenize(toks, scoreString, " ");
00096
00097 for (size_t i = 0; i < toks.size(); ++i) {
00098 string &tok = toks[i];
00099 vector<string> nameValue = Tokenize(tok, "=");
00100 UTIL_THROW_IF2(nameValue.size() != 2,
00101 "Incorrectly formatted score: " << tok);
00102
00103 float score = Scan<float>(nameValue[1]);
00104 score = exp(-score);
00105 tok = SPrint(score);
00106 }
00107
00108 scoreString = Join(" ", toks);
00109 }
00110
00111 void ReformatHieroRule(const string &lineOrig, string &out)
00112 {
00113 vector<string> tokens;
00114 vector<float> scoreVector;
00115
00116 TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
00117
00118 string &sourcePhraseString = tokens[1]
00119 , &targetPhraseString = tokens[2]
00120 , &scoreString = tokens[3];
00121
00122 map<size_t, pair<size_t, size_t> > ntAlign;
00123 ReformatHieroRule(0, sourcePhraseString, ntAlign);
00124 ReformatHieroRule(1, targetPhraseString, ntAlign);
00125 ReformateHieroScore(scoreString);
00126
00127 util::StringStream align;
00128 map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
00129 for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
00130 const pair<size_t, size_t> &alignPoint = iterAlign->second;
00131 align << alignPoint.first << "-" << alignPoint.second << " ";
00132 }
00133
00134 util::StringStream ret;
00135 ret << sourcePhraseString << " ||| "
00136 << targetPhraseString << " ||| "
00137 << scoreString << " ||| "
00138 << align.str();
00139
00140 out = ret.str();
00141 }
00142
00143 bool RuleTableLoaderStandard::Load(AllOptions const& opts, FormatType format
00144 , const std::vector<FactorType> &input
00145 , const std::vector<FactorType> &output
00146 , const std::string &inFile
00147 , size_t
00148 , RuleTableTrie &ruleTable)
00149 {
00150 PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses":"Hiero") + " format");
00151
00152
00153
00154 string lineOrig;
00155 size_t count = 0;
00156
00157 std::ostream *progress = NULL;
00158 IFVERBOSE(1) progress = &std::cerr;
00159 util::FilePiece in(inFile.c_str(), progress);
00160
00161
00162 vector<float> scoreVector;
00163 StringPiece line;
00164 std::string hiero_before, hiero_after;
00165
00166 double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
00167
00168 while(true) {
00169 try {
00170 line = in.ReadLine();
00171 } catch (const util::EndOfFileException &e) {
00172 break;
00173 }
00174
00175 if (format == HieroFormat) {
00176 hiero_before.assign(line.data(), line.size());
00177 ReformatHieroRule(hiero_before, hiero_after);
00178 line = hiero_after;
00179 }
00180
00181 util::TokenIter<util::MultiCharacter> pipes(line, "|||");
00182 StringPiece sourcePhraseString(*pipes);
00183 StringPiece targetPhraseString(*++pipes);
00184 StringPiece scoreString(*++pipes);
00185
00186 StringPiece alignString;
00187 if (++pipes) {
00188 StringPiece temp(*pipes);
00189 alignString = temp;
00190 }
00191
00192 bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
00193 if (isLHSEmpty && !opts.unk.word_deletion_enabled) {
00194 TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
00195 continue;
00196 }
00197
00198 scoreVector.clear();
00199 for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
00200 int processed;
00201 float score = converter.StringToFloat(s->data(), s->length(), &processed);
00202 UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
00203 scoreVector.push_back(FloorScore(TransformScore(score)));
00204 }
00205 const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
00206 if (scoreVector.size() != numScoreComponents) {
00207 UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
00208 << numScoreComponents << ") of score components on line " << count);
00209 }
00210
00211
00212
00213
00214 Word *sourceLHS = NULL;
00215 Word *targetLHS;
00216
00217
00218 TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
00219 targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
00220
00221 Phrase sourcePhrase;
00222 sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
00223
00224
00225 targetPhrase->SetAlignmentInfo(alignString);
00226 targetPhrase->SetTargetLHS(targetLHS);
00227
00228 ++pipes;
00229
00230 if (++pipes) {
00231 StringPiece sparseString(*pipes);
00232 targetPhrase->SetSparseScore(&ruleTable, sparseString);
00233 }
00234
00235 if (++pipes) {
00236 StringPiece propertiesString(*pipes);
00237 targetPhrase->SetProperties(propertiesString);
00238 }
00239
00240 targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
00241 targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
00242
00243 TargetPhraseCollection::shared_ptr phraseColl
00244 = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
00245 *targetPhrase, sourceLHS);
00246 phraseColl->Add(targetPhrase);
00247
00248
00249 delete sourceLHS;
00250
00251 count++;
00252 }
00253
00254
00255 SortAndPrune(ruleTable);
00256
00257 return true;
00258 }
00259
00260 }