00001 #include "RuleTrieLoader.h"
00002
00003 #include <sys/stat.h>
00004
00005 #include <cmath>
00006 #include <cstdlib>
00007 #include <fstream>
00008 #include <string>
00009 #include <iterator>
00010 #include <algorithm>
00011 #include <iostream>
00012
00013 #include "moses/FactorCollection.h"
00014 #include "moses/Word.h"
00015 #include "moses/Util.h"
00016 #include "moses/InputFileStream.h"
00017 #include "moses/StaticData.h"
00018 #include "moses/Range.h"
00019 #include "moses/ChartTranslationOptionList.h"
00020 #include "moses/FactorCollection.h"
00021 #include "moses/Syntax/RuleTableFF.h"
00022 #include "util/file_piece.hh"
00023 #include "util/string_piece.hh"
00024 #include "util/tokenize_piece.hh"
00025 #include "util/double-conversion/double-conversion.h"
00026 #include "util/exception.hh"
00027
00028 #include "RuleTrie.h"
00029 #include "moses/parameters/AllOptions.h"
00030
00031 namespace Moses
00032 {
00033 namespace Syntax
00034 {
00035 namespace T2S
00036 {
00037
00038 bool RuleTrieLoader::Load(Moses::AllOptions const& opts,
00039 const std::vector<FactorType> &input,
00040 const std::vector<FactorType> &output,
00041 const std::string &inFile,
00042 const RuleTableFF &ff,
00043 RuleTrie &trie)
00044 {
00045 PrintUserTime(std::string("Start loading text phrase table. Moses format"));
00046
00047 std::size_t count = 0;
00048
00049 std::ostream *progress = NULL;
00050 IFVERBOSE(1) progress = &std::cerr;
00051 util::FilePiece in(inFile.c_str(), progress);
00052
00053
00054 std::vector<float> scoreVector;
00055 StringPiece line;
00056
00057 int noflags = double_conversion::StringToDoubleConverter::NO_FLAGS;
00058 double_conversion::StringToDoubleConverter
00059 converter(noflags, NAN, NAN, "inf", "nan");
00060
00061 while(true) {
00062 try {
00063 line = in.ReadLine();
00064 } catch (const util::EndOfFileException &e) {
00065 break;
00066 }
00067
00068 util::TokenIter<util::MultiCharacter> pipes(line, "|||");
00069 StringPiece sourcePhraseString(*pipes);
00070 StringPiece targetPhraseString(*++pipes);
00071 StringPiece scoreString(*++pipes);
00072
00073 StringPiece alignString;
00074 if (++pipes) {
00075 StringPiece temp(*pipes);
00076 alignString = temp;
00077 }
00078
00079 ++pipes;
00080
00081 bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
00082 if (isLHSEmpty && !opts.unk.word_deletion_enabled) {
00083 TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
00084 continue;
00085 }
00086
00087 scoreVector.clear();
00088 for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
00089 int processed;
00090 float score = converter.StringToFloat(s->data(), s->length(), &processed);
00091 UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
00092 scoreVector.push_back(FloorScore(TransformScore(score)));
00093 }
00094 const std::size_t numScoreComponents = ff.GetNumScoreComponents();
00095 if (scoreVector.size() != numScoreComponents) {
00096 UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
00097 << numScoreComponents << ") of score components on line " << count);
00098 }
00099
00100
00101
00102
00103 Word *sourceLHS = NULL;
00104 Word *targetLHS;
00105
00106
00107 TargetPhrase *targetPhrase = new TargetPhrase(&ff);
00108
00109 targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
00110
00111 Phrase sourcePhrase;
00112
00113 sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
00114
00115
00116 targetPhrase->SetAlignmentInfo(alignString);
00117 targetPhrase->SetTargetLHS(targetLHS);
00118
00119
00120
00121 if (++pipes) {
00122 StringPiece sparseString(*pipes);
00123 targetPhrase->SetSparseScore(&ff, sparseString);
00124 }
00125
00126 if (++pipes) {
00127 StringPiece propertiesString(*pipes);
00128 targetPhrase->SetProperties(propertiesString);
00129 }
00130
00131 targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
00132 targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());
00133
00134 TargetPhraseCollection::shared_ptr phraseColl
00135 = GetOrCreateTargetPhraseCollection(trie, *sourceLHS, sourcePhrase);
00136 phraseColl->Add(targetPhrase);
00137
00138
00139 delete sourceLHS;
00140
00141 count++;
00142 }
00143
00144
00145 if (ff.GetTableLimit()) {
00146 SortAndPrune(trie, ff.GetTableLimit());
00147 }
00148
00149 return true;
00150 }
00151
00152 }
00153 }
00154 }