00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <string>
00024 #include "util/check.hh"
00025 #include "PhraseDictionaryMemory.h"
00026 #include "DecodeStepTranslation.h"
00027 #include "DecodeStepGeneration.h"
00028 #include "GenerationDictionary.h"
00029 #include "DummyScoreProducers.h"
00030 #include "StaticData.h"
00031 #include "Util.h"
00032 #include "FactorCollection.h"
00033 #include "Timer.h"
00034 #include "LM/Factory.h"
00035 #include "LexicalReordering.h"
00036 #include "GlobalLexicalModel.h"
00037 #include "SentenceStats.h"
00038 #include "PhraseDictionary.h"
00039 #include "UserMessage.h"
00040 #include "TranslationOption.h"
00041 #include "DecodeGraph.h"
00042 #include "InputFileStream.h"
00043
00044 #ifdef HAVE_SYNLM
00045 #include "SyntacticLanguageModel.h"
00046 #endif
00047
00048 #ifdef WITH_THREADS
00049 #include <boost/thread.hpp>
00050 #endif
00051
00052 using namespace std;
00053
00054 namespace Moses
00055 {
// Returns the largest value found among x and all elements of y.
// When y is empty the result is simply x.
static size_t CalcMax(size_t x, const vector<size_t>& y)
{
  size_t largest = x;
  for (size_t idx = 0; idx < y.size(); ++idx) {
    const size_t candidate = y[idx];
    largest = (candidate > largest) ? candidate : largest;
  }
  return largest;
}
00063
// Returns the largest value found among x, all elements of y and all
// elements of z. Empty vectors contribute nothing to the result.
static size_t CalcMax(size_t x, const vector<size_t>& y, const vector<size_t>& z)
{
  const vector<size_t>* lists[2] = { &y, &z };
  size_t largest = x;
  for (size_t listIdx = 0; listIdx < 2; ++listIdx) {
    const vector<size_t>& current = *lists[listIdx];
    for (size_t idx = 0; idx < current.size(); ++idx) {
      if (current[idx] > largest) {
        largest = current[idx];
      }
    }
  }
  return largest;
}
00073
// Out-of-class definition of the static member StaticData::s_instance.
StaticData StaticData::s_instance;
00075
00076 StaticData::StaticData()
00077 :m_numLinkParams(1)
00078 ,m_fLMsLoaded(false)
00079 ,m_sourceStartPosMattersForRecombination(false)
00080 ,m_inputType(SentenceInput)
00081 ,m_numInputScores(0)
00082 ,m_detailedTranslationReportingFilePath()
00083 ,m_onlyDistinctNBest(false)
00084 ,m_factorDelimiter("|")
00085 ,m_lmEnableOOVFeature(false)
00086 ,m_isAlwaysCreateDirectTranslationOption(false)
00087 {
00088 m_maxFactorIdx[0] = 0;
00089 m_maxFactorIdx[1] = 0;
00090
00091 m_xmlBrackets.first="<";
00092 m_xmlBrackets.second=">";
00093
00094
00095 Phrase::InitializeMemPool();
00096 }
00097
00098 bool StaticData::LoadData(Parameter *parameter)
00099 {
00100 ResetUserTime();
00101 m_parameter = parameter;
00102
00103
00104 m_verboseLevel = 1;
00105 if (m_parameter->GetParam("verbose").size() == 1) {
00106 m_verboseLevel = Scan<size_t>( m_parameter->GetParam("verbose")[0]);
00107 }
00108
00109 m_parsingAlgorithm = (m_parameter->GetParam("parsing-algorithm").size() > 0) ?
00110 (ParsingAlgorithm) Scan<size_t>(m_parameter->GetParam("parsing-algorithm")[0]) : ParseCYKPlus;
00111
00112
00113 m_searchAlgorithm = (m_parameter->GetParam("search-algorithm").size() > 0) ?
00114 (SearchAlgorithm) Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) : Normal;
00115
00116 if (m_searchAlgorithm == ChartDecoding)
00117 LoadChartDecodingParameters();
00118 else
00119 LoadPhraseBasedParameters();
00120
00121
00122 if(m_parameter->GetParam("inputtype").size())
00123 m_inputType= (InputTypeEnum) Scan<int>(m_parameter->GetParam("inputtype")[0]);
00124 std::string s_it = "text input";
00125 if (m_inputType == 1) {
00126 s_it = "confusion net";
00127 }
00128 if (m_inputType == 2) {
00129 s_it = "word lattice";
00130 }
00131 VERBOSE(2,"input type is: "<<s_it<<"\n");
00132
00133 if(m_parameter->GetParam("recover-input-path").size()) {
00134 m_recoverPath = Scan<bool>(m_parameter->GetParam("recover-input-path")[0]);
00135 if (m_recoverPath && m_inputType == SentenceInput) {
00136 TRACE_ERR("--recover-input-path should only be used with confusion net or word lattice input!\n");
00137 m_recoverPath = false;
00138 }
00139 }
00140
00141 if(m_parameter->GetParam("sort-word-alignment").size()) {
00142 m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
00143 }
00144
00145
00146 if (m_parameter->GetParam("factor-delimiter").size() > 0) {
00147 m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
00148 }
00149
00150 SetBooleanParameter( &m_continuePartialTranslation, "continue-partial-translation", false );
00151
00152
00153 SetBooleanParameter( &m_UseAlignmentInfo, "use-alignment-info", false );
00154 SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
00155 SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
00156
00157 SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
00158
00159 if (!m_UseAlignmentInfo && m_PrintAlignmentInfo) {
00160 TRACE_ERR("--print-alignment-info should only be used together with \"--use-alignment-info true\". Continue forcing to false.\n");
00161 m_PrintAlignmentInfo=false;
00162 }
00163 if (!m_UseAlignmentInfo && m_PrintAlignmentInfoNbest) {
00164 TRACE_ERR("--print-alignment-info-in-n-best should only be used together with \"--use-alignment-info true\". Continue forcing to false.\n");
00165 m_PrintAlignmentInfoNbest=false;
00166 }
00167
00168 if (m_parameter->GetParam("alignment-output-file").size() > 0) {
00169 m_alignmentOutputFile = Scan<std::string>(m_parameter->GetParam("alignment-output-file")[0]);
00170 }
00171
00172
00173 if (m_parameter->GetParam("n-best-list").size() >= 2) {
00174 m_nBestFilePath = m_parameter->GetParam("n-best-list")[0];
00175 m_nBestSize = Scan<size_t>( m_parameter->GetParam("n-best-list")[1] );
00176 m_onlyDistinctNBest=(m_parameter->GetParam("n-best-list").size()>2 && m_parameter->GetParam("n-best-list")[2]=="distinct");
00177 } else if (m_parameter->GetParam("n-best-list").size() == 1) {
00178 UserMessage::Add(string("wrong format for switch -n-best-list file size"));
00179 return false;
00180 } else {
00181 m_nBestSize = 0;
00182 }
00183 if (m_parameter->GetParam("n-best-factor").size() > 0) {
00184 m_nBestFactor = Scan<size_t>( m_parameter->GetParam("n-best-factor")[0]);
00185 } else {
00186 m_nBestFactor = 20;
00187 }
00188
00189
00190 if (m_parameter->GetParam("lattice-samples").size() ==2 ) {
00191 m_latticeSamplesFilePath = m_parameter->GetParam("lattice-samples")[0];
00192 m_latticeSamplesSize = Scan<size_t>(m_parameter->GetParam("lattice-samples")[1]);
00193 } else if (m_parameter->GetParam("lattice-samples").size() != 0 ) {
00194 UserMessage::Add(string("wrong format for switch -lattice-samples file size"));
00195 return false;
00196 } else {
00197 m_latticeSamplesSize = 0;
00198 }
00199
00200
00201 if (m_parameter->GetParam("output-word-graph").size() == 2)
00202 m_outputWordGraph = true;
00203 else
00204 m_outputWordGraph = false;
00205
00206
00207 if (m_parameter->GetParam("output-search-graph").size() > 0) {
00208 if (m_parameter->GetParam("output-search-graph").size() != 1) {
00209 UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph file"));
00210 return false;
00211 }
00212 m_outputSearchGraph = true;
00213 }
00214
00215 else if (m_parameter->GetParam("output-search-graph-extended").size() > 0) {
00216 if (m_parameter->GetParam("output-search-graph-extended").size() != 1) {
00217 UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-extended file"));
00218 return false;
00219 }
00220 m_outputSearchGraph = true;
00221 m_outputSearchGraphExtended = true;
00222 } else
00223 m_outputSearchGraph = false;
00224 #ifdef HAVE_PROTOBUF
00225 if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
00226 if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
00227 UserMessage::Add(string("ERROR: wrong format for switch -output-search-graph-pb path"));
00228 return false;
00229 }
00230 m_outputSearchGraphPB = true;
00231 } else
00232 m_outputSearchGraphPB = false;
00233 #endif
00234 SetBooleanParameter( &m_unprunedSearchGraph, "unpruned-search-graph", true );
00235
00236
00237 SetBooleanParameter( &m_labeledNBestList, "labeled-n-best-list", true );
00238
00239
00240 SetBooleanParameter( &m_nBestIncludesAlignment, "include-alignment-in-n-best", false );
00241
00242
00243 SetBooleanParameter( &m_reportSegmentation, "report-segmentation", false );
00244
00245
00246 SetBooleanParameter( &m_reportAllFactors, "report-all-factors", false );
00247
00248
00249 SetBooleanParameter( &m_reportAllFactorsNBest, "report-all-factors-in-n-best", false );
00250
00251
00252 if (m_inputType == SentenceInput) {
00253 SetBooleanParameter( &m_useTransOptCache, "use-persistent-cache", true );
00254 m_transOptCacheMaxSize = (m_parameter->GetParam("persistent-cache-size").size() > 0)
00255 ? Scan<size_t>(m_parameter->GetParam("persistent-cache-size")[0]) : DEFAULT_MAX_TRANS_OPT_CACHE_SIZE;
00256 } else {
00257 m_useTransOptCache = false;
00258 }
00259
00260
00261
00262 const vector<string> &inputFactorVector = m_parameter->GetParam("input-factors");
00263 for(size_t i=0; i<inputFactorVector.size(); i++) {
00264 m_inputFactorOrder.push_back(Scan<FactorType>(inputFactorVector[i]));
00265 }
00266 if(m_inputFactorOrder.empty()) {
00267 UserMessage::Add(string("no input factor specified in config file"));
00268 return false;
00269 }
00270
00271
00272 const vector<string> &outputFactorVector = m_parameter->GetParam("output-factors");
00273 for(size_t i=0; i<outputFactorVector.size(); i++) {
00274 m_outputFactorOrder.push_back(Scan<FactorType>(outputFactorVector[i]));
00275 }
00276 if(m_outputFactorOrder.empty()) {
00277
00278 m_outputFactorOrder.push_back(0);
00279 }
00280
00281
00282 SetBooleanParameter( &m_wordDeletionEnabled, "phrase-drop-allowed", false );
00283
00284
00285 SetBooleanParameter(&m_disableDiscarding, "disable-discarding", false);
00286
00287
00288 SetBooleanParameter( &m_printAllDerivations , "print-all-derivations", false );
00289
00290
00291 if (m_parameter->isParamSpecified("translation-details")) {
00292 const vector<string> &args = m_parameter->GetParam("translation-details");
00293 if (args.size() == 1) {
00294 m_detailedTranslationReportingFilePath = args[0];
00295 } else {
00296 UserMessage::Add(string("the translation-details option requires exactly one filename argument"));
00297 return false;
00298 }
00299 }
00300
00301
00302 for (size_t i = 0; i < m_parameter->GetParam("weight-w").size(); ++i) {
00303 float weightWordPenalty = Scan<float>( m_parameter->GetParam("weight-w")[i] );
00304 m_wordPenaltyProducers.push_back(new WordPenaltyProducer(m_scoreIndexManager));
00305 m_allWeights.push_back(weightWordPenalty);
00306 }
00307
00308
00309 float weightUnknownWord = (m_parameter->GetParam("weight-u").size() > 0) ? Scan<float>(m_parameter->GetParam("weight-u")[0]) : 1;
00310 m_unknownWordPenaltyProducer = new UnknownWordPenaltyProducer(m_scoreIndexManager);
00311 m_allWeights.push_back(weightUnknownWord);
00312
00313
00314 m_maxDistortion = (m_parameter->GetParam("distortion-limit").size() > 0) ?
00315 Scan<int>(m_parameter->GetParam("distortion-limit")[0])
00316 : -1;
00317 SetBooleanParameter( &m_reorderingConstraint, "monotone-at-punctuation", false );
00318
00319
00320 m_maxHypoStackSize = (m_parameter->GetParam("stack").size() > 0)
00321 ? Scan<size_t>(m_parameter->GetParam("stack")[0]) : DEFAULT_MAX_HYPOSTACK_SIZE;
00322 m_minHypoStackDiversity = 0;
00323 if (m_parameter->GetParam("stack-diversity").size() > 0) {
00324 if (m_maxDistortion > 15) {
00325 UserMessage::Add("stack diversity > 0 is not allowed for distortion limits larger than 15");
00326 return false;
00327 }
00328 if (m_inputType == WordLatticeInput) {
00329 UserMessage::Add("stack diversity > 0 is not allowed for lattice input");
00330 return false;
00331 }
00332 m_minHypoStackDiversity = Scan<size_t>(m_parameter->GetParam("stack-diversity")[0]);
00333 }
00334
00335 m_beamWidth = (m_parameter->GetParam("beam-threshold").size() > 0) ?
00336 TransformScore(Scan<float>(m_parameter->GetParam("beam-threshold")[0]))
00337 : TransformScore(DEFAULT_BEAM_WIDTH);
00338 m_earlyDiscardingThreshold = (m_parameter->GetParam("early-discarding-threshold").size() > 0) ?
00339 TransformScore(Scan<float>(m_parameter->GetParam("early-discarding-threshold")[0]))
00340 : TransformScore(DEFAULT_EARLY_DISCARDING_THRESHOLD);
00341 m_translationOptionThreshold = (m_parameter->GetParam("translation-option-threshold").size() > 0) ?
00342 TransformScore(Scan<float>(m_parameter->GetParam("translation-option-threshold")[0]))
00343 : TransformScore(DEFAULT_TRANSLATION_OPTION_THRESHOLD);
00344
00345 m_maxNoTransOptPerCoverage = (m_parameter->GetParam("max-trans-opt-per-coverage").size() > 0)
00346 ? Scan<size_t>(m_parameter->GetParam("max-trans-opt-per-coverage")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
00347
00348 m_maxNoPartTransOpt = (m_parameter->GetParam("max-partial-trans-opt").size() > 0)
00349 ? Scan<size_t>(m_parameter->GetParam("max-partial-trans-opt")[0]) : DEFAULT_MAX_PART_TRANS_OPT_SIZE;
00350
00351 m_maxPhraseLength = (m_parameter->GetParam("max-phrase-length").size() > 0)
00352 ? Scan<size_t>(m_parameter->GetParam("max-phrase-length")[0]) : DEFAULT_MAX_PHRASE_LENGTH;
00353
00354 m_cubePruningPopLimit = (m_parameter->GetParam("cube-pruning-pop-limit").size() > 0)
00355 ? Scan<size_t>(m_parameter->GetParam("cube-pruning-pop-limit")[0]) : DEFAULT_CUBE_PRUNING_POP_LIMIT;
00356
00357 m_cubePruningDiversity = (m_parameter->GetParam("cube-pruning-diversity").size() > 0)
00358 ? Scan<size_t>(m_parameter->GetParam("cube-pruning-diversity")[0]) : DEFAULT_CUBE_PRUNING_DIVERSITY;
00359
00360 SetBooleanParameter(&m_cubePruningLazyScoring, "cube-pruning-lazy-scoring", false);
00361
00362
00363 SetBooleanParameter( &m_dropUnknown, "drop-unknown", false );
00364
00365 SetBooleanParameter( &m_lmEnableOOVFeature, "lmodel-oov-feature", false);
00366
00367
00368 SetBooleanParameter( &m_mbr, "minimum-bayes-risk", false );
00369 m_mbrSize = (m_parameter->GetParam("mbr-size").size() > 0) ?
00370 Scan<size_t>(m_parameter->GetParam("mbr-size")[0]) : 200;
00371 m_mbrScale = (m_parameter->GetParam("mbr-scale").size() > 0) ?
00372 Scan<float>(m_parameter->GetParam("mbr-scale")[0]) : 1.0f;
00373
00374
00375 SetBooleanParameter( &m_useLatticeMBR, "lminimum-bayes-risk", false );
00376 if (m_useLatticeMBR && m_mbr) {
00377 cerr << "Errror: Cannot use both n-best mbr and lattice mbr together" << endl;
00378 exit(1);
00379 }
00380
00381 if (m_useLatticeMBR) m_mbr = true;
00382
00383 m_lmbrPruning = (m_parameter->GetParam("lmbr-pruning-factor").size() > 0) ?
00384 Scan<size_t>(m_parameter->GetParam("lmbr-pruning-factor")[0]) : 30;
00385 m_lmbrThetas = Scan<float>(m_parameter->GetParam("lmbr-thetas"));
00386 SetBooleanParameter( &m_useLatticeHypSetForLatticeMBR, "lattice-hypo-set", false );
00387 m_lmbrPrecision = (m_parameter->GetParam("lmbr-p").size() > 0) ?
00388 Scan<float>(m_parameter->GetParam("lmbr-p")[0]) : 0.8f;
00389 m_lmbrPRatio = (m_parameter->GetParam("lmbr-r").size() > 0) ?
00390 Scan<float>(m_parameter->GetParam("lmbr-r")[0]) : 0.6f;
00391 m_lmbrMapWeight = (m_parameter->GetParam("lmbr-map-weight").size() >0) ?
00392 Scan<float>(m_parameter->GetParam("lmbr-map-weight")[0]) : 0.0f;
00393
00394
00395 SetBooleanParameter( &m_useConsensusDecoding, "consensus-decoding", false );
00396 if (m_useConsensusDecoding && m_mbr) {
00397 cerr<< "Error: Cannot use consensus decoding together with mbr" << endl;
00398 exit(1);
00399 }
00400 if (m_useConsensusDecoding) m_mbr=true;
00401
00402
00403 m_timeout_threshold = (m_parameter->GetParam("time-out").size() > 0) ?
00404 Scan<size_t>(m_parameter->GetParam("time-out")[0]) : -1;
00405 m_timeout = (GetTimeoutThreshold() == (size_t)-1) ? false : true;
00406
00407
00408 m_lmcache_cleanup_threshold = (m_parameter->GetParam("clean-lm-cache").size() > 0) ?
00409 Scan<size_t>(m_parameter->GetParam("clean-lm-cache")[0]) : 1;
00410
00411 m_threadCount = 1;
00412 const std::vector<std::string> &threadInfo = m_parameter->GetParam("threads");
00413 if (!threadInfo.empty()) {
00414 if (threadInfo[0] == "all") {
00415 #ifdef WITH_THREADS
00416 m_threadCount = boost::thread::hardware_concurrency();
00417 if (!m_threadCount) {
00418 UserMessage::Add("-threads all specified but Boost doesn't know how many cores there are");
00419 return false;
00420 }
00421 #else
00422 UserMessage::Add("-threads all specified but moses not built with thread support");
00423 return false;
00424 #endif
00425 } else {
00426 m_threadCount = Scan<int>(threadInfo[0]);
00427 if (m_threadCount < 1) {
00428 UserMessage::Add("Specify at least one thread.");
00429 return false;
00430 }
00431 #ifndef WITH_THREADS
00432 if (m_threadCount > 1) {
00433 UserMessage::Add(std::string("Error: Thread count of ") + threadInfo[0] + " but moses not built with thread support");
00434 return false;
00435 }
00436 #endif
00437 }
00438 }
00439
00440 m_startTranslationId = (m_parameter->GetParam("start-translation-id").size() > 0) ?
00441 Scan<long>(m_parameter->GetParam("start-translation-id")[0]) : 0;
00442
00443
00444 if(m_parameter->GetParam("constraint").size()) {
00445 if (m_parameter->GetParam("search-algorithm").size() > 0
00446 && Scan<size_t>(m_parameter->GetParam("search-algorithm")[0]) != 0) {
00447 cerr << "Can use -constraint only with stack-based search (-search-algorithm 0)" << endl;
00448 exit(1);
00449 }
00450 m_constraintFileName = m_parameter->GetParam("constraint")[0];
00451
00452 InputFileStream constraintFile(m_constraintFileName);
00453
00454 std::string line;
00455
00456 long sentenceID = GetStartTranslationId() - 1;
00457 while (getline(constraintFile, line)) {
00458 vector<string> vecStr = Tokenize(line, "\t");
00459
00460 if (vecStr.size() == 1) {
00461 sentenceID++;
00462 Phrase phrase(0);
00463 phrase.CreateFromString(GetOutputFactorOrder(), vecStr[0], GetFactorDelimiter());
00464 m_constraints.insert(make_pair(sentenceID,phrase));
00465 } else if (vecStr.size() == 2) {
00466 sentenceID = Scan<long>(vecStr[0]);
00467 Phrase phrase(0);
00468 phrase.CreateFromString(GetOutputFactorOrder(), vecStr[1], GetFactorDelimiter());
00469 m_constraints.insert(make_pair(sentenceID,phrase));
00470 } else {
00471 CHECK(false);
00472 }
00473 }
00474 }
00475
00476
00477 if (m_parameter->GetParam("xml-input").size() == 0) m_xmlInputType = XmlPassThrough;
00478 else if (m_parameter->GetParam("xml-input")[0]=="exclusive") m_xmlInputType = XmlExclusive;
00479 else if (m_parameter->GetParam("xml-input")[0]=="inclusive") m_xmlInputType = XmlInclusive;
00480 else if (m_parameter->GetParam("xml-input")[0]=="ignore") m_xmlInputType = XmlIgnore;
00481 else if (m_parameter->GetParam("xml-input")[0]=="pass-through") m_xmlInputType = XmlPassThrough;
00482 else {
00483 UserMessage::Add("invalid xml-input value, must be pass-through, exclusive, inclusive, or ignore");
00484 return false;
00485 }
00486
00487
00488 if (m_parameter->GetParam("xml-brackets").size() > 0) {
00489 std::vector<std::string> brackets = Tokenize(m_parameter->GetParam("xml-brackets")[0]);
00490 if(brackets.size()!=2) {
00491 cerr << "invalid xml-brackets value, must specify exactly 2 blank-delimited strings for XML tags opening and closing brackets" << endl;
00492 exit(1);
00493 }
00494 m_xmlBrackets.first= brackets[0];
00495 m_xmlBrackets.second=brackets[1];
00496 cerr << "XML tags opening and closing brackets for XML input are: " << m_xmlBrackets.first << " and " << m_xmlBrackets.second << endl;
00497 }
00498
00499 #ifdef HAVE_SYNLM
00500 if (m_parameter->GetParam("slmodel-file").size() > 0) {
00501 if (!LoadSyntacticLanguageModel()) return false;
00502 }
00503 #endif
00504
00505 if (!LoadLexicalReorderingModel()) return false;
00506 if (!LoadLanguageModels()) return false;
00507 if (!LoadGenerationTables()) return false;
00508 if (!LoadPhraseTables()) return false;
00509 if (!LoadGlobalLexicalModel()) return false;
00510 if (!LoadDecodeGraphs()) return false;
00511
00512
00513
00514 vector<string> tsConfig = m_parameter->GetParam("translation-systems");
00515 if (!tsConfig.size()) {
00516
00517 tsConfig.push_back(TranslationSystem::DEFAULT + " R * D * L * G *");
00518 }
00519
00520 if (m_wordPenaltyProducers.size() != tsConfig.size()) {
00521 UserMessage::Add(string("Mismatch between number of word penalties and number of translation systems"));
00522 return false;
00523 }
00524
00525 if (m_searchAlgorithm == ChartDecoding) {
00526
00527 m_distortionScoreProducers.assign(tsConfig.size(), NULL);
00528 } else {
00529 if (m_distortionScoreProducers.size() != tsConfig.size()) {
00530 UserMessage::Add(string("Mismatch between number of distortion scores and number of translation systems. Or [search-algorithm] has been set to a phrase-based algorithm when it should be chart decoding"));
00531 return false;
00532 }
00533 }
00534
00535 for (size_t i = 0; i < tsConfig.size(); ++i) {
00536 vector<string> config = Tokenize(tsConfig[i]);
00537 if (config.size() % 2 != 1) {
00538 UserMessage::Add(string("Incorrect number of fields in Translation System config. Should be an odd number"));
00539 }
00540 m_translationSystems.insert(pair<string, TranslationSystem>(config[0],
00541 TranslationSystem(config[0],m_wordPenaltyProducers[i],m_unknownWordPenaltyProducer,m_distortionScoreProducers[i])));
00542 for (size_t j = 1; j < config.size(); j += 2) {
00543 const string& id = config[j];
00544 const string& tables = config[j+1];
00545 set<size_t> tableIds;
00546 if (tables != "*") {
00547
00548 vector<string> tableIdStrings = Tokenize(tables,",");
00549 vector<size_t> tableIdList;
00550 Scan<size_t>(tableIdList, tableIdStrings);
00551 copy(tableIdList.begin(), tableIdList.end(), inserter(tableIds,tableIds.end()));
00552 }
00553 if (id == "D") {
00554 for (size_t k = 0; k < m_decodeGraphs.size(); ++k) {
00555 if (!tableIds.size() || tableIds.find(k) != tableIds.end()) {
00556 VERBOSE(2,"Adding decoder graph " << k << " to translation system " << config[0] << endl);
00557 m_translationSystems.find(config[0])->second.AddDecodeGraph(m_decodeGraphs[k],m_decodeGraphBackoff[k]);
00558 }
00559 }
00560 } else if (id == "R") {
00561 for (size_t k = 0; k < m_reorderModels.size(); ++k) {
00562 if (!tableIds.size() || tableIds.find(k) != tableIds.end()) {
00563 m_translationSystems.find(config[0])->second.AddReorderModel(m_reorderModels[k]);
00564 VERBOSE(2,"Adding reorder table " << k << " to translation system " << config[0] << endl);
00565 }
00566 }
00567 } else if (id == "G") {
00568 for (size_t k = 0; k < m_globalLexicalModels.size(); ++k) {
00569 if (!tableIds.size() || tableIds.find(k) != tableIds.end()) {
00570 m_translationSystems.find(config[0])->second.AddGlobalLexicalModel(m_globalLexicalModels[k]);
00571 VERBOSE(2,"Adding global lexical model " << k << " to translation system " << config[0] << endl);
00572 }
00573 }
00574 } else if (id == "L") {
00575 size_t lmid = 0;
00576 for (LMList::const_iterator k = m_languageModel.begin(); k != m_languageModel.end(); ++k, ++lmid) {
00577 if (!tableIds.size() || tableIds.find(lmid) != tableIds.end()) {
00578 m_translationSystems.find(config[0])->second.AddLanguageModel(*k);
00579 VERBOSE(2,"Adding language model " << lmid << " to translation system " << config[0] << endl);
00580 }
00581 }
00582 } else {
00583 UserMessage::Add(string("Incorrect translation system identifier: ") + id);
00584 return false;
00585 }
00586 }
00587
00588 m_translationSystems.find(config[0])->second.ConfigDictionaries();
00589
00590
00591
00592
00593 #ifdef HAVE_SYNLM
00594 if (m_syntacticLanguageModel != NULL) {
00595 m_translationSystems.find(config[0])->second.AddFeatureFunction(m_syntacticLanguageModel);
00596 }
00597 #endif
00598 }
00599
00600
00601 m_scoreIndexManager.InitFeatureNames();
00602
00603 return true;
00604 }
00605
00606 void StaticData::SetBooleanParameter( bool *parameter, string parameterName, bool defaultValue )
00607 {
00608
00609 *parameter = defaultValue;
00610 if (! m_parameter->isParamSpecified( parameterName ) ) {
00611 return;
00612 }
00613
00614
00615 if (m_parameter->GetParam( parameterName ).size() == 0) {
00616 *parameter = true;
00617 }
00618
00619
00620 else if (m_parameter->GetParam( parameterName ).size() == 1) {
00621 *parameter = Scan<bool>( m_parameter->GetParam( parameterName )[0]);
00622 }
00623 }
00624
// Destructor: releases the models and score producers held through raw
// pointers, clears the translation option cache and finalises the Phrase
// memory pool that the constructor initialised.
StaticData::~StaticData()
{
  // model collections
  RemoveAllInColl(m_phraseDictionary);
  RemoveAllInColl(m_generationDictionary);
  RemoveAllInColl(m_reorderModels);
  RemoveAllInColl(m_globalLexicalModels);

#ifdef HAVE_SYNLM
  delete m_syntacticLanguageModel;
#endif

  // decode graphs, per-system score producers and language models
  RemoveAllInColl(m_decodeGraphs);
  RemoveAllInColl(m_wordPenaltyProducers);
  RemoveAllInColl(m_distortionScoreProducers);
  m_languageModel.CleanUp();

  // cached translation options
  ClearTransOptionCache();

  // the single unknown-word penalty producer allocated in LoadData()
  delete m_unknownWordPenaltyProducer;

  // counterpart of Phrase::InitializeMemPool() in the constructor
  Phrase::FinalizeMemPool();

}
00654
00655 #ifdef HAVE_SYNLM
00656 bool StaticData::LoadSyntacticLanguageModel() {
00657 cerr << "Loading syntactic language models..." << std::endl;
00658
00659 const vector<float> weights = Scan<float>(m_parameter->GetParam("weight-slm"));
00660 const vector<string> files = m_parameter->GetParam("slmodel-file");
00661
00662 const FactorType factorType = (m_parameter->GetParam("slmodel-factor").size() > 0) ?
00663 TransformScore(Scan<int>(m_parameter->GetParam("slmodel-factor")[0]))
00664 : 0;
00665
00666 const size_t beamWidth = (m_parameter->GetParam("slmodel-beam").size() > 0) ?
00667 TransformScore(Scan<int>(m_parameter->GetParam("slmodel-beam")[0]))
00668 : 500;
00669
00670 if (files.size() < 1) {
00671 cerr << "No syntactic language model files specified!" << std::endl;
00672 return false;
00673 }
00674
00675
00676 if (weights.size() >= 1) {
00677
00678
00679
00680
00681
00682 m_syntacticLanguageModel = new SyntacticLanguageModel(files,weights,factorType,beamWidth);
00683
00684
00686
00687
00688
00689
00690
00691
00692
00693
00694
00696
00697
00698
00699 if (m_syntacticLanguageModel==NULL) {
00700 return false;
00701 }
00702
00703 }
00704
00705 return true;
00706
00707 }
00708 #endif
00709
00710 bool StaticData::LoadLexicalReorderingModel()
00711 {
00712 VERBOSE(1, "Loading lexical distortion models...");
00713 const vector<string> fileStr = m_parameter->GetParam("distortion-file");
00714 bool hasWeightlr = (m_parameter->GetParam("weight-lr").size() != 0);
00715 vector<string> weightsStr;
00716 if (hasWeightlr) {
00717 weightsStr = m_parameter->GetParam("weight-lr");
00718 } else {
00719 weightsStr = m_parameter->GetParam("weight-d");
00720 }
00721
00722 std::vector<float> weights;
00723 size_t w = 1;
00724 if (hasWeightlr) {
00725 w = 0;
00726 }
00727 size_t f = 0;
00728
00729 VERBOSE(1, "have " << fileStr.size() << " models" << std::endl);
00730 for(size_t j = 0; j < weightsStr.size(); ++j) {
00731 weights.push_back(Scan<float>(weightsStr[j]));
00732 }
00733
00734 for(size_t i = 0; i < fileStr.size(); ++i) {
00735 vector<string> spec = Tokenize<string>(fileStr[f], " ");
00736 ++f;
00737 if(spec.size() != 4) {
00738 UserMessage::Add("Invalid Lexical Reordering Model Specification: " + fileStr[f]);
00739 return false;
00740 }
00741
00742
00743
00744
00745
00746
00747
00748
00749 vector<FactorType> input, output;
00750 vector<string> inputfactors = Tokenize(spec[0],"-");
00751 if(inputfactors.size() == 2) {
00752 input = Tokenize<FactorType>(inputfactors[0],",");
00753 output = Tokenize<FactorType>(inputfactors[1],",");
00754 } else if(inputfactors.size() == 1) {
00755
00756 output = Tokenize<FactorType>(inputfactors[0],",");
00757 } else {
00758
00759 return false;
00760 }
00761
00762 string modelType = spec[1];
00763
00764
00765 std::vector<float> mweights;
00766 size_t numWeights = atoi(spec[2].c_str());
00767 for(size_t k = 0; k < numWeights; ++k, ++w) {
00768 if(w >= weights.size()) {
00769 UserMessage::Add("Lexicalized distortion model: Not enough weights, add to [weight-d]");
00770 return false;
00771 } else
00772 mweights.push_back(weights[w]);
00773 }
00774
00775 string filePath = spec[3];
00776
00777 m_reorderModels.push_back(new LexicalReordering(input, output, modelType, filePath, mweights));
00778 }
00779 return true;
00780 }
00781
00782 bool StaticData::LoadGlobalLexicalModel()
00783 {
00784 const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-lex"));
00785 const vector<string> &file = m_parameter->GetParam("global-lexical-file");
00786
00787 if (weight.size() != file.size()) {
00788 std::cerr << "number of weights and models for the global lexical model does not match ("
00789 << weight.size() << " != " << file.size() << ")" << std::endl;
00790 return false;
00791 }
00792
00793 for (size_t i = 0; i < weight.size(); i++ ) {
00794 vector<string> spec = Tokenize<string>(file[i], " ");
00795 if ( spec.size() != 2 ) {
00796 std::cerr << "wrong global lexical model specification: " << file[i] << endl;
00797 return false;
00798 }
00799 vector< string > factors = Tokenize(spec[0],"-");
00800 if ( factors.size() != 2 ) {
00801 std::cerr << "wrong factor definition for global lexical model: " << spec[0] << endl;
00802 return false;
00803 }
00804 vector<FactorType> inputFactors = Tokenize<FactorType>(factors[0],",");
00805 vector<FactorType> outputFactors = Tokenize<FactorType>(factors[1],",");
00806 m_globalLexicalModels.push_back( new GlobalLexicalModel( spec[1], weight[i], inputFactors, outputFactors ) );
00807 }
00808 return true;
00809 }
00810
00811 bool StaticData::LoadLanguageModels()
00812 {
00813 if (m_parameter->GetParam("lmodel-file").size() > 0) {
00814
00815 vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-l"));
00816
00817 for (size_t i = 0 ; i < weightAll.size() ; i++) {
00818 m_allWeights.push_back(weightAll[i]);
00819 }
00820
00821
00822 vector<int> LMdub = Scan<int>(m_parameter->GetParam("lmodel-dub"));
00823 if (m_parameter->GetParam("lmodel-dub").size() == 0) {
00824 for(size_t i=0; i<m_parameter->GetParam("lmodel-file").size(); i++)
00825 LMdub.push_back(0);
00826 }
00827
00828
00829 const vector<string> &lmVector = m_parameter->GetParam("lmodel-file");
00830
00831 map<string,LanguageModel*> languageModelsLoaded;
00832
00833 for(size_t i=0; i<lmVector.size(); i++) {
00834 LanguageModel* lm = NULL;
00835 if (languageModelsLoaded.find(lmVector[i]) != languageModelsLoaded.end()) {
00836 lm = languageModelsLoaded[lmVector[i]]->Duplicate(m_scoreIndexManager);
00837 } else {
00838 vector<string> token = Tokenize(lmVector[i]);
00839 if (token.size() != 4 && token.size() != 5 ) {
00840 UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
00841 return false;
00842 }
00843
00844 LMImplementation lmImplementation = static_cast<LMImplementation>(Scan<int>(token[0]));
00845
00846
00847 vector<FactorType> factorTypes = Tokenize<FactorType>(token[1], ",");
00848
00849
00850 size_t nGramOrder = Scan<int>(token[2]);
00851
00852 string &languageModelFile = token[3];
00853 if (token.size() == 5) {
00854 if (lmImplementation==IRST)
00855 languageModelFile += " " + token[4];
00856 else {
00857 UserMessage::Add("Expected format 'LM-TYPE FACTOR-TYPE NGRAM-ORDER filePath [mapFilePath (only for IRSTLM)]'");
00858 return false;
00859 }
00860 }
00861 IFVERBOSE(1)
00862 PrintUserTime(string("Start loading LanguageModel ") + languageModelFile);
00863
00864 lm = LanguageModelFactory::CreateLanguageModel(
00865 lmImplementation
00866 , factorTypes
00867 , nGramOrder
00868 , languageModelFile
00869 , m_scoreIndexManager
00870 , LMdub[i]);
00871 if (lm == NULL) {
00872 UserMessage::Add("no LM created. We probably don't have it compiled");
00873 return false;
00874 }
00875 languageModelsLoaded[lmVector[i]] = lm;
00876 }
00877
00878 m_languageModel.Add(lm);
00879 }
00880 }
00881
00882
00883 m_fLMsLoaded = true;
00884 IFVERBOSE(1)
00885 PrintUserTime("Finished loading LanguageModels");
00886 return true;
00887 }
00888
00889 bool StaticData::LoadGenerationTables()
00890 {
00891 if (m_parameter->GetParam("generation-file").size() > 0) {
00892 const vector<string> &generationVector = m_parameter->GetParam("generation-file");
00893 const vector<float> &weight = Scan<float>(m_parameter->GetParam("weight-generation"));
00894
00895 IFVERBOSE(1) {
00896 TRACE_ERR( "weight-generation: ");
00897 for (size_t i = 0 ; i < weight.size() ; i++) {
00898 TRACE_ERR( weight[i] << "\t");
00899 }
00900 TRACE_ERR(endl);
00901 }
00902 size_t currWeightNum = 0;
00903
00904 for(size_t currDict = 0 ; currDict < generationVector.size(); currDict++) {
00905 vector<string> token = Tokenize(generationVector[currDict]);
00906 vector<FactorType> input = Tokenize<FactorType>(token[0], ",")
00907 ,output = Tokenize<FactorType>(token[1], ",");
00908 m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], input, output);
00909 string filePath;
00910 size_t numFeatures;
00911
00912 numFeatures = Scan<size_t>(token[2]);
00913 filePath = token[3];
00914
00915 if (!FileExists(filePath) && FileExists(filePath + ".gz")) {
00916 filePath += ".gz";
00917 }
00918
00919 VERBOSE(1, filePath << endl);
00920
00921 m_generationDictionary.push_back(new GenerationDictionary(numFeatures, m_scoreIndexManager, input,output));
00922 CHECK(m_generationDictionary.back() && "could not create GenerationDictionary");
00923 if (!m_generationDictionary.back()->Load(filePath, Output)) {
00924 delete m_generationDictionary.back();
00925 return false;
00926 }
00927 for(size_t i = 0; i < numFeatures; i++) {
00928 CHECK(currWeightNum < weight.size());
00929 m_allWeights.push_back(weight[currWeightNum++]);
00930 }
00931 }
00932 if (currWeightNum != weight.size()) {
00933 TRACE_ERR( " [WARNING] config file has " << weight.size() << " generation weights listed, but the configuration for generation files indicates there should be " << currWeightNum << "!\n");
00934 }
00935 }
00936
00937 return true;
00938 }
00939
00940
00941 bool StaticData::LoadPhraseTables()
00942 {
00943 VERBOSE(2,"Creating phrase table features" << endl);
00944
00945
00946 CHECK(m_fLMsLoaded);
00947
00948 if (m_parameter->GetParam("ttable-file").size() > 0) {
00949
00950 vector<float> weightAll = Scan<float>(m_parameter->GetParam("weight-t"));
00951
00952 const vector<string> &translationVector = m_parameter->GetParam("ttable-file");
00953 vector<size_t> maxTargetPhrase = Scan<size_t>(m_parameter->GetParam("ttable-limit"));
00954
00955 if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
00956 VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
00957 for(size_t i = 1; i < translationVector.size(); i++)
00958 maxTargetPhrase.push_back(maxTargetPhrase[0]);
00959 } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
00960 stringstream strme;
00961 strme << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
00962 UserMessage::Add(strme.str());
00963 return false;
00964 }
00965
00966 size_t index = 0;
00967 size_t weightAllOffset = 0;
00968 bool oldFileFormat = false;
00969 for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
00970 vector<string> token = Tokenize(translationVector[currDict]);
00971
00972 if(currDict == 0 && token.size() == 4) {
00973 VERBOSE(1, "Warning: Phrase table specification in old 4-field format. Assuming binary phrase tables (type 1)!" << endl);
00974 oldFileFormat = true;
00975 }
00976
00977 if((!oldFileFormat && token.size() < 5) || (oldFileFormat && token.size() != 4)) {
00978 UserMessage::Add("invalid phrase table specification");
00979 return false;
00980 }
00981
00982 PhraseTableImplementation implementation = (PhraseTableImplementation) Scan<int>(token[0]);
00983 if(oldFileFormat) {
00984 token.push_back(token[3]);
00985 token[3] = token[2];
00986 token[2] = token[1];
00987 token[1] = token[0];
00988 token[0] = "1";
00989 implementation = Binary;
00990 } else
00991 implementation = (PhraseTableImplementation) Scan<int>(token[0]);
00992
00993 CHECK(token.size() >= 5);
00994
00995
00996 vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
00997 ,output = Tokenize<FactorType>(token[2], ",");
00998 m_maxFactorIdx[0] = CalcMax(m_maxFactorIdx[0], input);
00999 m_maxFactorIdx[1] = CalcMax(m_maxFactorIdx[1], output);
01000 m_maxNumFactors = std::max(m_maxFactorIdx[0], m_maxFactorIdx[1]) + 1;
01001 size_t numScoreComponent = Scan<size_t>(token[3]);
01002 string filePath= token[4];
01003
01004 CHECK(weightAll.size() >= weightAllOffset + numScoreComponent);
01005
01006
01007
01008 vector<float> weight;
01009
01010 if(currDict==0 && (m_inputType == ConfusionNetworkInput || m_inputType == WordLatticeInput)) {
01011
01012
01013
01014 m_numInputScores=m_parameter->GetParam("weight-i").size();
01015
01016 if (implementation == Binary)
01017 {
01018 for(unsigned k=0; k<m_numInputScores; ++k)
01019 weight.push_back(Scan<float>(m_parameter->GetParam("weight-i")[k]));
01020 }
01021
01022 if(m_parameter->GetParam("link-param-count").size())
01023 m_numLinkParams = Scan<size_t>(m_parameter->GetParam("link-param-count")[0]);
01024
01025
01026 if (implementation == Binary) {
01027 if (m_numLinkParams == m_numInputScores) {
01028 VERBOSE(1,"specified equal numbers of link parameters and insertion weights, not using non-epsilon 'real' word link count.\n");
01029 } else if ((m_numLinkParams + 1) == m_numInputScores) {
01030 VERBOSE(1,"WARN: "<< m_numInputScores << " insertion weights found and only "<< m_numLinkParams << " link parameters specified, applying non-epsilon 'real' word link count for last feature weight.\n");
01031 } else {
01032 stringstream strme;
01033 strme << "You specified " << m_numInputScores
01034 << " input weights (weight-i), but you specified " << m_numLinkParams << " link parameters (link-param-count)!";
01035 UserMessage::Add(strme.str());
01036 return false;
01037 }
01038 }
01039
01040 }
01041 if (!m_inputType) {
01042 m_numInputScores=0;
01043 }
01044
01045 size_t tableInputScores = (currDict == 0 && implementation == Binary) ? m_numInputScores : 0;
01046
01047 for (size_t currScore = 0 ; currScore < numScoreComponent; currScore++)
01048 weight.push_back(weightAll[weightAllOffset + currScore]);
01049
01050 if(weight.size() - tableInputScores != numScoreComponent) {
01051 stringstream strme;
01052 strme << "Your phrase table has " << numScoreComponent
01053 << " scores, but you specified " << (weight.size() - tableInputScores) << " weights!";
01054 UserMessage::Add(strme.str());
01055 return false;
01056 }
01057
01058 weightAllOffset += numScoreComponent;
01059 numScoreComponent += tableInputScores;
01060
01061 string targetPath, alignmentsFile;
01062 if (implementation == SuffixArray) {
01063 targetPath = token[5];
01064 alignmentsFile= token[6];
01065 }
01066
01067 CHECK(numScoreComponent==weight.size());
01068
01069 std::copy(weight.begin(),weight.end(),std::back_inserter(m_allWeights));
01070
01071
01072
01073 IFVERBOSE(1)
01074 PrintUserTime(string("Start loading PhraseTable ") + filePath);
01075 VERBOSE(1,"filePath: " << filePath <<endl);
01076
01077 PhraseDictionaryFeature* pdf = new PhraseDictionaryFeature(
01078 implementation
01079 , numScoreComponent
01080 , (currDict==0 ? m_numInputScores : 0)
01081 , input
01082 , output
01083 , filePath
01084 , weight
01085 , maxTargetPhrase[index]
01086 , targetPath, alignmentsFile);
01087
01088 m_phraseDictionary.push_back(pdf);
01089
01090
01091
01092
01093
01094 index++;
01095 }
01096 }
01097
01098 IFVERBOSE(1)
01099 PrintUserTime("Finished loading phrase tables");
01100 return true;
01101 }
01102
01103 void StaticData::LoadNonTerminals()
01104 {
01105 string defaultNonTerminals;
01106
01107 if (m_parameter->GetParam("non-terminals").size() == 0) {
01108 defaultNonTerminals = "X";
01109 } else {
01110 vector<std::string> tokens = Tokenize(m_parameter->GetParam("non-terminals")[0]);
01111 defaultNonTerminals = tokens[0];
01112 }
01113
01114 FactorCollection &factorCollection = FactorCollection::Instance();
01115
01116 m_inputDefaultNonTerminal.SetIsNonTerminal(true);
01117 const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals);
01118 m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);
01119
01120 m_outputDefaultNonTerminal.SetIsNonTerminal(true);
01121 const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals);
01122 m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
01123
01124
01125 if (m_parameter->GetParam("unknown-lhs").size() == 0) {
01126 UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
01127 m_unknownLHS.push_back(entry);
01128 } else {
01129 const string &filePath = m_parameter->GetParam("unknown-lhs")[0];
01130
01131 InputFileStream inStream(filePath);
01132 string line;
01133 while(getline(inStream, line)) {
01134 vector<string> tokens = Tokenize(line);
01135 CHECK(tokens.size() == 2);
01136 UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
01137 m_unknownLHS.push_back(entry);
01138 }
01139
01140 }
01141
01142 }
01143
01144 void StaticData::LoadChartDecodingParameters()
01145 {
01146 LoadNonTerminals();
01147
01148
01149 if (m_parameter->GetParam("source-label-overlap").size() > 0) {
01150 m_sourceLabelOverlap = (SourceLabelOverlap) Scan<int>(m_parameter->GetParam("source-label-overlap")[0]);
01151 } else {
01152 m_sourceLabelOverlap = SourceLabelOverlapAdd;
01153 }
01154
01155 m_ruleLimit = (m_parameter->GetParam("rule-limit").size() > 0)
01156 ? Scan<size_t>(m_parameter->GetParam("rule-limit")[0]) : DEFAULT_MAX_TRANS_OPT_SIZE;
01157 }
01158
01159 void StaticData::LoadPhraseBasedParameters()
01160 {
01161 const vector<string> distortionWeights = m_parameter->GetParam("weight-d");
01162 size_t distortionWeightCount = distortionWeights.size();
01163
01164
01165 if (!m_parameter->GetParam("weight-lr").size() && m_parameter->GetParam("distortion-file").size()) {
01166 distortionWeightCount = 1;
01167 }
01168 for (size_t i = 0; i < distortionWeightCount; ++i) {
01169 float weightDistortion = Scan<float>(distortionWeights[i]);
01170 m_distortionScoreProducers.push_back(new DistortionScoreProducer(m_scoreIndexManager));
01171 m_allWeights.push_back(weightDistortion);
01172 }
01173 }
01174
01175 bool StaticData::LoadDecodeGraphs()
01176 {
01177 const vector<string> &mappingVector = m_parameter->GetParam("mapping");
01178 const vector<size_t> &maxChartSpans = Scan<size_t>(m_parameter->GetParam("max-chart-span"));
01179
01180 DecodeStep *prev = 0;
01181 size_t prevDecodeGraphInd = 0;
01182 for(size_t i=0; i<mappingVector.size(); i++) {
01183 vector<string> token = Tokenize(mappingVector[i]);
01184 size_t decodeGraphInd;
01185 DecodeType decodeType;
01186 size_t index;
01187 if (token.size() == 2) {
01188 decodeGraphInd = 0;
01189 decodeType = token[0] == "T" ? Translate : Generate;
01190 index = Scan<size_t>(token[1]);
01191 } else if (token.size() == 3) {
01192
01193 decodeGraphInd = Scan<size_t>(token[0]);
01194
01195 CHECK(decodeGraphInd == prevDecodeGraphInd || decodeGraphInd == prevDecodeGraphInd + 1);
01196 if (decodeGraphInd > prevDecodeGraphInd) {
01197 prev = NULL;
01198 }
01199 decodeType = token[1] == "T" ? Translate : Generate;
01200 index = Scan<size_t>(token[2]);
01201 } else {
01202 UserMessage::Add("Malformed mapping!");
01203 CHECK(false);
01204 }
01205
01206 DecodeStep* decodeStep = NULL;
01207 switch (decodeType) {
01208 case Translate:
01209 if(index>=m_phraseDictionary.size()) {
01210 stringstream strme;
01211 strme << "No phrase dictionary with index "
01212 << index << " available!";
01213 UserMessage::Add(strme.str());
01214 CHECK(false);
01215 }
01216 decodeStep = new DecodeStepTranslation(m_phraseDictionary[index], prev);
01217 break;
01218 case Generate:
01219 if(index>=m_generationDictionary.size()) {
01220 stringstream strme;
01221 strme << "No generation dictionary with index "
01222 << index << " available!";
01223 UserMessage::Add(strme.str());
01224 CHECK(false);
01225 }
01226 decodeStep = new DecodeStepGeneration(m_generationDictionary[index], prev);
01227 break;
01228 case InsertNullFertilityWord:
01229 CHECK(!"Please implement NullFertilityInsertion.");
01230 break;
01231 }
01232
01233 CHECK(decodeStep);
01234 if (m_decodeGraphs.size() < decodeGraphInd + 1) {
01235 DecodeGraph *decodeGraph;
01236 if (m_searchAlgorithm == ChartDecoding) {
01237 size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
01238 decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
01239 } else {
01240 decodeGraph = new DecodeGraph(m_decodeGraphs.size());
01241 }
01242
01243 m_decodeGraphs.push_back(decodeGraph);
01244 }
01245
01246 m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
01247 prev = decodeStep;
01248 prevDecodeGraphInd = decodeGraphInd;
01249 }
01250
01251
01252
01253 for(size_t i=0; i<m_decodeGraphs.size(); i++) {
01254 m_decodeGraphBackoff.push_back( 0 );
01255 }
01256
01257 const vector<string> &backoffVector = m_parameter->GetParam("decoding-graph-backoff");
01258 for(size_t i=0; i<m_decodeGraphs.size() && i<backoffVector.size(); i++) {
01259 m_decodeGraphBackoff[i] = Scan<size_t>(backoffVector[i]);
01260 }
01261
01262 return true;
01263 }
01264
01265
01266 void StaticData::SetWeightsForScoreProducer(const ScoreProducer* sp, const std::vector<float>& weights)
01267 {
01268 const size_t id = sp->GetScoreBookkeepingID();
01269 const size_t begin = m_scoreIndexManager.GetBeginIndex(id);
01270 const size_t end = m_scoreIndexManager.GetEndIndex(id);
01271 CHECK(end - begin == weights.size());
01272 if (m_allWeights.size() < end)
01273 m_allWeights.resize(end);
01274 std::vector<float>::const_iterator weightIter = weights.begin();
01275 for (size_t i = begin; i < end; i++)
01276 m_allWeights[i] = *weightIter++;
01277 }
01278
01279 const TranslationOptionList* StaticData::FindTransOptListInCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase) const
01280 {
01281 std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
01282 #ifdef WITH_THREADS
01283 boost::mutex::scoped_lock lock(m_transOptCacheMutex);
01284 #endif
01285 std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter
01286 = m_transOptCache.find(key);
01287 if (iter == m_transOptCache.end())
01288 return NULL;
01289 iter->second.second = clock();
01290 return iter->second.first;
01291 }
01292
01293 void StaticData::ReduceTransOptCache() const
01294 {
01295 if (m_transOptCache.size() <= m_transOptCacheMaxSize) return;
01296 clock_t t = clock();
01297
01298
01299 priority_queue< clock_t > lastUsedTimes;
01300 std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iter;
01301 iter = m_transOptCache.begin();
01302 while( iter != m_transOptCache.end() ) {
01303 lastUsedTimes.push( iter->second.second );
01304 iter++;
01305 }
01306 for( size_t i=0; i < lastUsedTimes.size()-m_transOptCacheMaxSize/2; i++ )
01307 lastUsedTimes.pop();
01308 clock_t cutoffLastUsedTime = lastUsedTimes.top();
01309
01310
01311 iter = m_transOptCache.begin();
01312 while( iter != m_transOptCache.end() ) {
01313 if (iter->second.second < cutoffLastUsedTime) {
01314 std::map<std::pair<size_t, Phrase>, std::pair<TranslationOptionList*,clock_t> >::iterator iterRemove = iter++;
01315 delete iterRemove->second.first;
01316 m_transOptCache.erase(iterRemove);
01317 } else iter++;
01318 }
01319 VERBOSE(2,"Reduced persistent translation option cache in " << ((clock()-t)/(float)CLOCKS_PER_SEC) << " seconds." << std::endl);
01320 }
01321
01322 void StaticData::AddTransOptListToCache(const DecodeGraph &decodeGraph, const Phrase &sourcePhrase, const TranslationOptionList &transOptList) const
01323 {
01324 if (m_transOptCacheMaxSize == 0) return;
01325 std::pair<size_t, Phrase> key(decodeGraph.GetPosition(), sourcePhrase);
01326 TranslationOptionList* storedTransOptList = new TranslationOptionList(transOptList);
01327 #ifdef WITH_THREADS
01328 boost::mutex::scoped_lock lock(m_transOptCacheMutex);
01329 #endif
01330 m_transOptCache[key] = make_pair( storedTransOptList, clock() );
01331 ReduceTransOptCache();
01332 }
01333 void StaticData::ClearTransOptionCache() const {
01334 map<std::pair<size_t, Phrase>, std::pair< TranslationOptionList*, clock_t > >::iterator iterCache;
01335 for (iterCache = m_transOptCache.begin() ; iterCache != m_transOptCache.end() ; ++iterCache) {
01336 TranslationOptionList *transOptList = iterCache->second.first;
01337 delete transOptList;
01338 }
01339 }
01340
01341 }
01342
01343