00001 #include "RuleTrieLoader.h"
00002
00003 #include <sys/stat.h>
00004 #include <cstdlib>
00005
00006 #include <fstream>
00007 #include <string>
00008 #include <iterator>
00009 #include <algorithm>
00010 #include <iostream>
00011 #include <cmath>
00012
00013 #include "moses/FactorCollection.h"
00014 #include "moses/Word.h"
00015 #include "moses/Util.h"
00016 #include "moses/Timer.h"
00017 #include "moses/InputFileStream.h"
00018 #include "moses/StaticData.h"
00019 #include "moses/Range.h"
00020 #include "moses/ChartTranslationOptionList.h"
00021 #include "moses/FactorCollection.h"
00022 #include "moses/Syntax/RuleTableFF.h"
00023 #include "util/file_piece.hh"
00024 #include "util/string_piece.hh"
00025 #include "util/tokenize_piece.hh"
00026 #include "util/double-conversion/double-conversion.h"
00027 #include "util/exception.hh"
00028
00029 #include "RuleTrie.h"
00030 #include "moses/parameters/AllOptions.h"
00031
00032 namespace Moses
00033 {
00034 namespace Syntax
00035 {
00036 namespace S2T
00037 {
00038
00039 bool RuleTrieLoader::Load(Moses::AllOptions const& opts,
00040 const std::vector<FactorType> &input,
00041 const std::vector<FactorType> &output,
00042 const std::string &inFile,
00043 const RuleTableFF &ff,
00044 RuleTrie &trie)
00045 {
00046 PrintUserTime(std::string("Start loading text phrase table. Moses format"));
00047
00048
00049
00050 std::size_t count = 0;
00051
00052 std::ostream *progress = NULL;
00053 IFVERBOSE(1) progress = &std::cerr;
00054 util::FilePiece in(inFile.c_str(), progress);
00055
00056
00057 std::vector<float> scoreVector;
00058 StringPiece line;
00059
00060 double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
00061
00062 while(true) {
00063 try {
00064 line = in.ReadLine();
00065 } catch (const util::EndOfFileException &e) {
00066 break;
00067 }
00068
00069 util::TokenIter<util::MultiCharacter> pipes(line, "|||");
00070 StringPiece sourcePhraseString(*pipes);
00071 StringPiece targetPhraseString(*++pipes);
00072 StringPiece scoreString(*++pipes);
00073
00074 StringPiece alignString;
00075 if (++pipes) {
00076 StringPiece temp(*pipes);
00077 alignString = temp;
00078 }
00079
00080 bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
00081 if (isLHSEmpty && !opts.unk.word_deletion_enabled) {
00082 TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
00083 continue;
00084 }
00085
00086 scoreVector.clear();
00087 for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
00088 int processed;
00089 float score = converter.StringToFloat(s->data(), s->length(), &processed);
00090 UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
00091 scoreVector.push_back(FloorScore(TransformScore(score)));
00092 }
00093 const size_t numScoreComponents = ff.GetNumScoreComponents();
00094 if (scoreVector.size() != numScoreComponents) {
00095 UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
00096 << numScoreComponents << ") of score components on line " << count);
00097 }
00098
00099
00100
00101
00102 Word *sourceLHS = NULL;
00103 Word *targetLHS;
00104
00105
00106 TargetPhrase *targetPhrase = new TargetPhrase(&ff);
00107 targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
00108
00109 Phrase sourcePhrase;
00110 sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
00111
00112
00113 targetPhrase->SetAlignmentInfo(alignString);
00114 targetPhrase->SetTargetLHS(targetLHS);
00115
00116 ++pipes;
00117
00118 if (++pipes) {
00119 StringPiece sparseString(*pipes);
00120 targetPhrase->SetSparseScore(&ff, sparseString);
00121 }
00122
00123 if (++pipes) {
00124 StringPiece propertiesString(*pipes);
00125 targetPhrase->SetProperties(propertiesString);
00126 }
00127
00128 targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
00129 targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());
00130
00131 TargetPhraseCollection::shared_ptr phraseColl
00132 = GetOrCreateTargetPhraseCollection(trie, sourcePhrase,
00133 *targetPhrase, sourceLHS);
00134 phraseColl->Add(targetPhrase);
00135
00136
00137 delete sourceLHS;
00138
00139 count++;
00140 }
00141
00142
00143 if (ff.GetTableLimit()) {
00144 SortAndPrune(trie, ff.GetTableLimit());
00145 }
00146
00147 return true;
00148 }
00149
00150 }
00151 }
00152 }