00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
#include <algorithm>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <vector>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/program_options.hpp>
#include "Parameter.h"
#include "Util.h"
#include "InputFileStream.h"
#include "StaticData.h"
#include "util/string_stream.hh"
#include "util/exception.hh"
#include "util/random.hh"
00037
00038 #ifdef HAVE_XMLRPC_C
00039 #include <xmlrpc_server.h>
00040 #endif
00041
00042 using namespace std;
00043 using namespace boost::algorithm;
00044 namespace po = boost::program_options;
00045
00046 namespace Moses
00047 {
00048
00050 Parameter::Parameter()
00051 {
00053
00054 po::options_description main_opts("Main Options");
00055 AddParam(main_opts,"config", "f", "location of the configuration file");
00056 AddParam(main_opts,"input-file", "i", "location of the input file to be translated");
00057
00058 AddParam(main_opts,"verbose", "v", "verbosity level of the logging");
00059 AddParam(main_opts,"version", "show version of Moses and libraries used");
00060 AddParam(main_opts,"show-weights", "print feature weights and exit");
00061 AddParam(main_opts,"time-out", "seconds after which is interrupted (-1=no time-out, default is -1)");
00062 AddParam(main_opts,"segment-time-out", "seconds for single segment after which is interrupted (-1=no time-out, default is -1)");
00063
00065
00066 po::options_description factor_opts("General Factorization Options");
00067 AddParam(factor_opts,"factor-delimiter", "fd", "specify a different factor delimiter than the default");
00068
00069 AddParam(factor_opts,"mapping", "description of decoding steps");
00070 AddParam(factor_opts,"placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
00071
00073
00074 po::options_description search_opts("Search Options");
00075 string desc = "Which search algorithm to use.\n";
00076 desc += "0=normal stack (default)\n";
00077 desc += "1=cube pruning\n";
00078 desc += "3=chart (with cube pruning)\n";
00079 desc += "4=stack with batched lm requests\n";
00080 desc += "5=chart (with incremental search)\n";
00081 desc += "6=string-to-tree\n";
00082 desc += "7=tree-to-string\n";
00083 desc += "8=tree-to-string (SCFG-based)\n";
00084 desc += "9=forest-to-string";
00085 AddParam(search_opts,"search-algorithm", desc);
00086 AddParam(search_opts,"beam-threshold", "b", "threshold for threshold pruning");
00087 AddParam(search_opts,"early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
00088 AddParam(search_opts,"stack", "s", "maximum stack size for histogram pruning. 0 = unlimited stack size");
00089 AddParam(search_opts,"stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
00090
00091
00092 AddParam(search_opts,"weight-file", "wf", "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini");
00093 AddParam(search_opts,"weight", "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
00094
00095 AddParam(search_opts,"feature-overwrite", "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
00096
00097 po::options_description tune_opts("Options used in tuning.");
00098 AddParam(tune_opts,"weight-overwrite", "special parameter for mert. All on 1 line. Overrides weights specified in 'weights' argument");
00099 AddParam(tune_opts,"feature-add", "Add a feature function on the command line. Used by mira to add BLEU feature");
00100 AddParam(tune_opts,"weight-add", "Add weight for FF if it doesn't exist, i.e weights here are added 1st, and can be override by the ini file or on the command line. Used to specify initial weights for FF that was also specified on the copmmand line");
00101
00102
00103 AddParam(search_opts,"max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
00104 AddParam(search_opts,"max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
00105 AddParam(search_opts,"max-phrase-length", "maximum phrase length (default 20)");
00106 AddParam(search_opts,"translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
00107
00108
00109 AddParam(search_opts,"disable-discarding", "dd", "disable hypothesis discarding");
00110 AddParam(search_opts,"phrase-drop-allowed", "da", "if present, allow dropping of source words");
00111 AddParam(search_opts,"threads","th", "number of threads to use in decoding (defaults to single-threaded)");
00112
00113
00114 po::options_description disto_opts("Distortion options");
00115 AddParam(disto_opts,"distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
00116 AddParam(disto_opts,"monotone-at-punctuation", "mp", "do not reorder over punctuation");
00117 AddParam(disto_opts,"early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
00118 AddParam(disto_opts,"distortion", "configurations for each factorized/lexicalized reordering model.");
00119
00120
00121 po::options_description cube_opts("Cube pruning options.");
00122 AddParam(cube_opts,"cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
00123 AddParam(cube_opts,"cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
00124 AddParam(cube_opts,"cube-pruning-lazy-scoring", "cbls", "Don't fully score a hypothesis until it is popped");
00125 AddParam(cube_opts,"cube-pruning-deterministic-search", "cbds", "Break ties deterministically during search");
00126
00128
00129 po::options_description mbr_opts("Minimum Bayes Risk (MBR), Lattice MBR, and Consensus decoding");
00130
00131 AddParam(mbr_opts,"minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
00132 AddParam(mbr_opts,"mbr-size", "number of translation candidates considered in MBR decoding (default 200)");
00133 AddParam(mbr_opts,"mbr-scale", "scaling factor to convert log linear score probability in MBR decoding (default 1.0)");
00134
00135 AddParam(mbr_opts,"lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation");
00136 AddParam(mbr_opts,"consensus-decoding", "con", "use consensus decoding (De Nero et. al. 2009)");
00137
00138 po::options_description lmbr_opts("Options specific to Lattic MBR");
00139 AddParam(lmbr_opts,"lmbr-p", "unigram precision value for lattice mbr");
00140 AddParam(lmbr_opts,"lmbr-r", "ngram precision decay value for lattice mbr");
00141 AddParam(lmbr_opts,"lmbr-thetas", "theta(s) for lattice mbr calculation");
00142 AddParam(mbr_opts,"lmbr-map-weight", "weight given to map solution when doing lattice MBR (default 0)");
00143 AddParam(mbr_opts,"lmbr-pruning-factor", "average number of nodes/word wanted in pruned lattice");
00144 AddParam(mbr_opts,"lattice-hypo-set", "to use lattice as hypo set during lattice MBR");
00145
00147
00148 po::options_description oov_opts("OOV Handling Options");
00149 AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them");
00150 AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output");
00151 AddParam(oov_opts,"unknown-word-prefix", "prefix to unknwon word when marked (default: 'UNK')");
00152 AddParam(oov_opts,"unknown-word-suffix", "suffix to unknwon word when marked (default: '')");
00153 AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model");
00154 AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
00155 AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim");
00156
00158
00159 po::options_description input_opts("Input Format Options");
00160 AddParam(input_opts,"input-factors", "list of factors in the input");
00161 AddParam(input_opts,"inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
00162 AddParam(input_opts,"xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'");
00163 AddParam(input_opts,"xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
00164 AddParam(input_opts,"start-translation-id", "Id of 1st input. Default = 0");
00165 AddParam(input_opts,"alternate-weight-setting", "aws", "alternate set of weights to used per xml specification");
00166
00168
00169 po::options_description output_opts("Output Options");
00170 AddParam(output_opts,"report-all-factors", "report all factors in output, not just first");
00171 AddParam(output_opts,"output-factors", "list if factors in the output");
00172 AddParam(output_opts,"print-id", "prefix translations with id. Default if false");
00173 AddParam(output_opts,"print-passthrough", "output the sgml tag <passthrough> without any computation on that. Default is false");
00174 AddParam(output_opts,"print-passthrough-in-n-best", "output the sgml tag <passthrough> without any computation on that in each entry of the n-best-list. Default is false");
00175 AddParam(output_opts,"output-factors", "list of factors in the output");
00176 AddParam(output_opts,"print-all-derivations", "to print all derivations in search graph");
00177 AddParam(output_opts,"translation-details", "T", "for each best hypothesis, report translation details to the given file");
00178
00179 AddParam(output_opts,"output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
00180 AddParam(output_opts,"output-word-graph", "owg", "Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos");
00181 AddParam(output_opts,"tree-translation-details", "Ttree", "for each hypothesis, report translation details with tree fragment info to given file");
00182 AddParam(output_opts,"print-alignment-info", "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false");
00183 AddParam(output_opts,"alignment-output-file", "print output word alignments into given file");
00184 AddParam(output_opts,"sort-word-alignment", "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
00185 AddParam(output_opts,"report-segmentation", "t", "report phrase segmentation in the output");
00186 AddParam(output_opts,"report-segmentation-enriched", "tt", "report phrase segmentation in the output with additional information");
00187
00188
00189
00190 AddParam(output_opts,"translation-all-details", "Tall", "for all hypotheses, report translation details to the given file");
00191
00192 po::options_description osg_opts("Options for outputting search graphs");
00193 AddParam(osg_opts,"output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
00194 AddParam(osg_opts,"output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
00195 AddParam(osg_opts,"unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
00196 AddParam(osg_opts,"output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist");
00197 AddParam(output_opts,"include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
00198 #ifdef HAVE_PROTOBUF
00199 AddParam(osg_opts,"output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
00200 #endif
00201 AddParam(osg_opts,"output-search-graph-hypergraph", "DEPRECATED! Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'");
00202
00204
00205 po::options_description nbest_opts("N-best Options");
00206 AddParam(nbest_opts,"n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
00207
00208
00209 AddParam(nbest_opts,"labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
00210 AddParam(nbest_opts,"n-best-trees", "Write n-best target-side trees to n-best-list");
00211 AddParam(nbest_opts,"n-best-factor", "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
00212 AddParam(nbest_opts,"report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
00213 AddParam(nbest_opts,"lattice-samples", "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
00214 AddParam(nbest_opts,"include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
00215 AddParam(nbest_opts,"print-alignment-info-in-n-best",
00216 "Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false");
00217
00219
00220 po::options_description server_opts("Moses Server Options");
00221 AddParam(server_opts,"server", "Run moses as a translation server.");
00222 AddParam(server_opts,"daemon", "Run moses as a translation server in the background.");
00223 AddParam(server_opts,"server-port", "Port for moses server");
00224 AddParam(server_opts,"server-log", "Log destination for moses server");
00225 AddParam(server_opts,"serial", "Run server in serial mode, processing only one request at a time.");
00226
00227 AddParam(server_opts,"server-maxconn",
00228 "Max. No of simultaneous HTTP transactions allowed by the server.");
00229 AddParam(server_opts,"server-maxconn-backlog",
00230 "Max. No. of requests the OS will queue if the server is busy.");
00231 AddParam(server_opts,"server-keepalive-maxconn",
00232 "Max. No. of requests the server will accept on a single TCP connection.");
00233 AddParam(server_opts,"server-keepalive-timeout",
00234 "Max. number of seconds the server will keep a persistent connection alive.");
00235 AddParam(server_opts,"server-timeout",
00236 "Max. number of seconds the server will wait for a client to submit a request once a connection has been established.");
00237
00238
00239 AddParam(server_opts,"session-timeout",
00240 "Timeout for sessions, e.g. '2h30m' or 1d (=24h)");
00241 AddParam(server_opts,"session-cache-size", string("Max. number of sessions cached.")
00242 +"Least recently used session is dumped first.");
00243
00244 po::options_description irstlm_opts("IRSTLM Options");
00245 AddParam(irstlm_opts,"clean-lm-cache",
00246 "clean language model caches after N translations (default N=1)");
00247
00248 po::options_description chart_opts("Chart Decoding Options");
00249 AddParam(chart_opts,"max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
00250 AddParam(chart_opts,"non-terminals", "list of non-term symbols, space separated");
00251 AddParam(chart_opts,"rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
00252 AddParam(chart_opts,"source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
00253 AddParam(chart_opts,"unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
00254
00255 po::options_description misc_opts("Miscellaneous Options");
00256 AddParam(misc_opts,"mira", "do mira training");
00257 AddParam(misc_opts,"description", "Source language, target language, description");
00258 AddParam(misc_opts,"no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
00259 AddParam(misc_opts,"default-non-term-for-empty-range-only", "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)");
00260 AddParam(misc_opts,"s2t-parsing-algorithm", "Which S2T parsing algorithm to use. 0=recursive CYK+, 1=scope-3 (default = 0)");
00261
00262
00263 AddParam(misc_opts,"decoding-graph-backoff", "dpb", "only use subsequent decoding paths for unknown spans of given length");
00264 AddParam(misc_opts,"references", "Reference file(s) - used for bleu score feature");
00265 AddParam(misc_opts,"recover-input-path", "r", "(conf net/word lattice only) - recover input path corresponding to the best translation");
00266 AddParam(misc_opts,"link-param-count", "Number of parameters on word links when using confusion networks or lattices (default = 1)");
00267 AddParam(misc_opts,"feature-name-overwrite", "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3");
00268
00269 AddParam(misc_opts,"feature", "All the feature functions should be here");
00270 AddParam(misc_opts,"context-string",
00271 "A (tokenized) string containing context words for context-sensitive translation.");
00272 AddParam(misc_opts,"context-weights", "A key-value map for context-sensitive translation.");
00273 AddParam(misc_opts,"context-window",
00274 "Context window (in words) for context-sensitive translation: {+|-|+-}<number>.");
00275
00276
00277 po::options_description cpt_opts("Options when using compact phrase and reordering tables.");
00278 AddParam(cpt_opts,"minphr-memory", "Load phrase table in minphr format into memory");
00279 AddParam(cpt_opts,"minlexr-memory", "Load lexical reordering table in minlexr format into memory");
00280
00281 po::options_description spe_opts("Simulated Post-editing Options");
00282 AddParam(spe_opts,"spe-src", "Simulated post-editing. Source filename");
00283 AddParam(spe_opts,"spe-trg", "Simulated post-editing. Target filename");
00284 AddParam(spe_opts,"spe-aln", "Simulated post-editing. Alignment filename");
00285
00287
00288 po::options_description deprec_opts("Deprecated Options");
00289 AddParam(deprec_opts,"link-param-count", "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)");
00290 AddParam(deprec_opts,"weight-slm", "slm", "DEPRECATED. DO NOT USE. weight(s) for syntactic language model");
00291 AddParam(deprec_opts,"weight-bl", "bl", "DEPRECATED. DO NOT USE. weight for bleu score feature");
00292 AddParam(deprec_opts,"weight-d", "d", "DEPRECATED. DO NOT USE. weight(s) for distortion (reordering components)");
00293 AddParam(deprec_opts,"weight-dlm", "dlm", "DEPRECATED. DO NOT USE. weight for discriminative LM feature function (on top of sparse weights)");
00294 AddParam(deprec_opts,"weight-lr", "lr", "DEPRECATED. DO NOT USE. weight(s) for lexicalized reordering, if not included in weight-d");
00295 AddParam(deprec_opts,"weight-generation", "g", "DEPRECATED. DO NOT USE. weight(s) for generation components");
00296 AddParam(deprec_opts,"weight-i", "I", "DEPRECATED. DO NOT USE. weight(s) for word insertion - used for parameters from confusion network and lattice input links");
00297 AddParam(deprec_opts,"weight-l", "lm", "DEPRECATED. DO NOT USE. weight(s) for language models");
00298 AddParam(deprec_opts,"weight-lex", "lex", "DEPRECATED. DO NOT USE. weight for global lexical model");
00299 AddParam(deprec_opts,"weight-glm", "glm", "DEPRECATED. DO NOT USE. weight for global lexical feature, sparse producer");
00300 AddParam(deprec_opts,"weight-wt", "wt", "DEPRECATED. DO NOT USE. weight for word translation feature");
00301 AddParam(deprec_opts,"weight-pp", "pp", "DEPRECATED. DO NOT USE. weight for phrase pair feature");
00302 AddParam(deprec_opts,"weight-pb", "pb", "DEPRECATED. DO NOT USE. weight for phrase boundary feature");
00303 AddParam(deprec_opts,"weight-t", "tm", "DEPRECATED. DO NOT USE. weights for translation model components");
00304 AddParam(deprec_opts,"weight-p", "w", "DEPRECATED. DO NOT USE. weight for phrase penalty");
00305 AddParam(deprec_opts,"weight-w", "w", "DEPRECATED. DO NOT USE. weight for word penalty");
00306 AddParam(deprec_opts,"weight-u", "u", "DEPRECATED. DO NOT USE. weight for unknown word penalty");
00307 AddParam(deprec_opts,"weight-e", "e", "DEPRECATED. DO NOT USE. weight for word deletion");
00308 AddParam(deprec_opts,"text-type", "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
00309 AddParam(deprec_opts,"input-scores", "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)");
00310 AddParam(deprec_opts,"dlm-model", "DEPRECATED. DO NOT USE. Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary.");
00311 AddParam(deprec_opts,"generation-file", "DEPRECATED. DO NOT USE. location and properties of the generation table");
00312 AddParam(deprec_opts,"global-lexical-file", "gl", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation model file");
00313 AddParam(deprec_opts,"glm-feature", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation feature, sparse producer");
00314 AddParam(deprec_opts,"lmodel-file", "DEPRECATED. DO NOT USE. location and properties of the language models");
00315 AddParam(deprec_opts,"lmodel-dub", "DEPRECATED. DO NOT USE. dictionary upper bounds of language models");
00316 #ifdef HAVE_SYNLM
00317 AddParam(deprec_opts,"slmodel-file", "DEPRECATED. DO NOT USE. location of the syntactic language model file(s)");
00318 AddParam(deprec_opts,"slmodel-factor", "DEPRECATED. DO NOT USE. factor to use with syntactic language model");
00319 AddParam(deprec_opts,"slmodel-beam", "DEPRECATED. DO NOT USE. beam width to use with syntactic language model's parser");
00320 #endif
00321 AddParam(deprec_opts,"ttable-file", "DEPRECATED. DO NOT USE. location and properties of the translation tables");
00322 AddParam(deprec_opts,"phrase-pair-feature", "DEPRECATED. DO NOT USE. Source and target factors for phrase pair feature");
00323 AddParam(deprec_opts,"phrase-boundary-source-feature", "DEPRECATED. DO NOT USE. Source factors for phrase boundary feature");
00324 AddParam(deprec_opts,"phrase-boundary-target-feature", "DEPRECATED. DO NOT USE. Target factors for phrase boundary feature");
00325 AddParam(deprec_opts,"phrase-length-feature", "DEPRECATED. DO NOT USE. Count features for source length, target length, both of each phrase");
00326 AddParam(deprec_opts,"target-word-insertion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned target word");
00327 AddParam(deprec_opts,"source-word-deletion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned source word");
00328 AddParam(deprec_opts,"word-translation-feature", "DEPRECATED. DO NOT USE. Count feature for word translation according to word alignment");
00329
00330 po::options_description zombie_opts("Zombie Options");
00331 AddParam(zombie_opts,"distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
00332
00333
00334 mbr_opts.add(lmbr_opts);
00335 search_opts.add(cube_opts);
00336 search_opts.add(mbr_opts);
00337 search_opts.add(disto_opts);
00338 search_opts.add(chart_opts);
00339
00340 input_opts.add(spe_opts);
00341
00342 output_opts.add(nbest_opts);
00343 output_opts.add(osg_opts);
00344
00345 m_options.add(main_opts);
00346 m_options.add(server_opts);
00347 m_options.add(input_opts);
00348 m_options.add(search_opts);
00349 m_options.add(output_opts);
00350 m_options.add(oov_opts);
00351 m_options.add(factor_opts);
00352 m_options.add(cpt_opts);
00353 m_options.add(irstlm_opts);
00354 m_options.add(tune_opts);
00355 m_options.add(misc_opts);
00356 m_options.add(deprec_opts);
00357 m_options.add(zombie_opts);
00358
00359 }
00360
// Destructor: nothing to release explicitly; all members clean up themselves.
Parameter::~Parameter()
{
}
00364
00365 const PARAM_VEC *Parameter::GetParam(const std::string ¶mName) const
00366 {
00367 PARAM_MAP::const_iterator iter = m_setting.find( paramName );
00368 if (iter == m_setting.end()) {
00369 return NULL;
00370 } else {
00371 return &iter->second;
00372 }
00373
00374 }
00375
00377 void
00378 Parameter::
00379 AddParam(po::options_description& optgroup,
00380 string const& paramName,
00381 string const& description)
00382 {
00383 m_valid[paramName] = true;
00384 m_description[paramName] = description;
00385 optgroup.add_options()(paramName.c_str(), description.c_str());
00386 }
00387
00389 void
00390 Parameter::
00391 AddParam(po::options_description& optgroup,
00392 string const& paramName,
00393 string const& abbrevName,
00394 string const& description)
00395 {
00396 m_valid[paramName] = true;
00397 m_valid[abbrevName] = true;
00398 m_abbreviation[paramName] = abbrevName;
00399 m_fullname[abbrevName] = paramName;
00400 m_description[paramName] = description;
00401 string optname = paramName;
00402 if (abbrevName.size() == 1) {
00403 optname += string(",")+abbrevName;
00404
00405 }
00406 optgroup.add_options()(optname.c_str(),description.c_str());
00407 }
00408
00410 void
00411 Parameter::
00412 Explain()
00413 {
00414 cerr << "Usage:" << endl;
00415 cerr << m_options << endl;
00416
00417
00418
00419
00420
00421
00422
00423
00424
00425
00426
00427 }
00428
00432 bool
00433 Parameter::
00434 isOption(const char* token)
00435 {
00436 if (! token) return false;
00437 std::string tokenString(token);
00438 size_t length = tokenString.size();
00439 if (length <= 1) return false;
00440 if (!starts_with(tokenString, "-")) return false;
00441 if (tokenString.substr(1,1).find_first_not_of("0123456789") == 0) return true;
00442 return false;
00443 }
00444
00446 bool
00447 Parameter::
00448 LoadParam(const string &filePath)
00449 {
00450 const char *argv[] = {"executable", "-f", filePath.c_str() };
00451 return LoadParam(3, (char const**) argv);
00452 }
00453
00456 void show_version()
00457 {
00458 std::cout << "\nMoses code version (git tag or commit hash):\n "
00459 << MOSES_VERSION_ID << std::endl
00460 << "Libraries used:" << std::endl
00461 << " Boost version "
00462 << BOOST_VERSION / 100000 << "."
00463 << BOOST_VERSION / 100 % 1000 << "."
00464 << BOOST_VERSION % 100
00465 << std::endl;
00466 #ifdef HAVE_XMLRPC_C
00467 unsigned int major, minor, point;
00468 xmlrpc_server_version(&major, &minor, &point);
00469 std::cout << " Xmlrpc-c version "
00470 << major << "." << minor << "." << point << std::endl;
00471 #endif
00472 #ifdef HAVE_CMPH
00473
00474 std::cout << " CMPH (version unknown)" << std::endl;
00475 #endif
00476
00477 #ifdef MMT_VERSION_ID
00478 std::cout << string(20,'-')
00479 << "\nMMT extras version: " << MMT_VERSION_ID << std::endl;
00480 #endif
00481 }
00482
00484 bool
00485 Parameter::
00486 LoadParam(int argc, char const* xargv[])
00487 {
00488
00489
00490 char const* argv[argc+1];
00491 for (int i = 0; i < argc; ++i) {
00492 argv[i] = xargv[i];
00493 if (strlen(argv[i]) > 2 && argv[i][0] == '-' && argv[i][1] == '-')
00494 ++argv[i];
00495 if (!strcmp(argv[i],"-version")) {
00496 show_version();
00497 exit(0);
00498 }
00499 }
00500
00501
00502
00503
00504 string configPath;
00505 if ( (configPath = FindParam("-f", argc, argv)) == ""
00506 && (configPath = FindParam("-config", argc, argv)) == "") {
00507 PrintCredit();
00508 Explain();
00509 PrintFF();
00510
00511 cerr << endl;
00512 cerr << "No configuration file was specified. Use -config or -f";
00513 cerr << endl;
00514 return false;
00515 } else {
00516 if (!ReadConfigFile(configPath)) {
00517 std::cerr << "Could not read " << configPath;
00518 return false;
00519 }
00520 }
00521
00522
00523 for(PARAM_STRING::const_iterator iterParam = m_description.begin();
00524 iterParam != m_description.end(); iterParam++) {
00525 const string paramName = iterParam->first;
00526 OverwriteParam("-" + paramName, paramName, argc, argv);
00527 }
00528
00529
00530 for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
00531 iterParam != m_abbreviation.end(); iterParam++) {
00532 const string paramName = iterParam->first;
00533 const string paramShortName = iterParam->second;
00534 OverwriteParam("-" + paramShortName, paramName, argc, argv);
00535 }
00536
00537 AddFeaturesCmd();
00538
00539
00540 int verbose = 1;
00541 if (m_setting.find("verbose") != m_setting.end() &&
00542 m_setting["verbose"].size() > 0)
00543 verbose = Scan<int>(m_setting["verbose"][0]);
00544 if (verbose >= 1) {
00545 TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
00546 for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
00547 iterParam != m_setting.end(); iterParam++) {
00548 TRACE_ERR( "\t" << iterParam->first << ": ");
00549 for ( size_t i = 0; i < iterParam->second.size(); i++ )
00550 TRACE_ERR( iterParam->second[i] << " ");
00551 TRACE_ERR( endl);
00552 }
00553 }
00554
00555
00556 if ((GetParam("feature") || GetParam("weight"))
00557 && (GetParam("weight-slm") || GetParam("weight-bl") || GetParam("weight-d") ||
00558 GetParam("weight-dlm") || GetParam("weight-lrl") || GetParam("weight-generation") ||
00559 GetParam("weight-i") || GetParam("weight-l") || GetParam("weight-lex") ||
00560 GetParam("weight-glm") || GetParam("weight-wt") || GetParam("weight-pp") ||
00561 GetParam("weight-pb") || GetParam("weight-t") || GetParam("weight-w") ||
00562 GetParam("weight-p") ||
00563 GetParam("weight-u") || GetParam("weight-e") ||
00564 GetParam("dlm-mode") || GetParam("generation-file") || GetParam("global-lexical-file") ||
00565 GetParam("glm-feature") || GetParam("lmodel-file") || GetParam("lmodel-dub") ||
00566 GetParam("slmodel-file") || GetParam("slmodel-factor") ||
00567 GetParam("slmodel-beam") || GetParam("ttable-file") || GetParam("phrase-pair-feature") ||
00568 GetParam("phrase-boundary-source-feature") || GetParam("phrase-boundary-target-feature") || GetParam("phrase-length-feature") ||
00569 GetParam("target-word-insertion-feature") || GetParam("source-word-deletion-feature") || GetParam("word-translation-feature")
00570 )
00571 ) {
00572 UTIL_THROW(util::Exception, "Don't mix old and new ini file format");
00573 }
00574
00575
00576 if (GetParam("feature") == NULL) {
00577 ConvertWeightArgs();
00578 }
00579 CreateWeightsMap();
00580 WeightOverwrite();
00581
00582
00583 bool noErrorFlag = true;
00584 for (int i = 0 ; i < argc ; i++) {
00585 if (isOption(argv[i])) {
00586 string paramSwitch = (string) argv[i];
00587 string paramName = paramSwitch.substr(1);
00588 if (m_valid.find(paramName) == m_valid.end()) {
00589 std::cerr << "illegal switch: " << paramSwitch;
00590 noErrorFlag = false;
00591 }
00592 }
00593 }
00594
00595
00596
00597
00598 return Validate() && noErrorFlag;
00599 }
00600
00601 void
00602 Parameter::
00603 AddFeaturesCmd()
00604 {
00605 const PARAM_VEC *params = GetParam("feature-add");
00606 if (params) {
00607 PARAM_VEC::const_iterator iter;
00608 for (iter = params->begin(); iter != params->end(); ++iter) {
00609 const string &line = *iter;
00610 AddFeature(line);
00611 }
00612
00613 m_setting.erase("feature-add");
00614 }
00615 }
00616
00617 std::vector<float>
00618 Parameter::
00619 GetWeights(const std::string &name)
00620 {
00621 std::vector<float> ret = m_weights[name];
00622
00623
00624
00625
00626
00627
00628 return ret;
00629 }
00630
00631 void
00632 Parameter::
00633 SetWeight(const std::string &name, size_t ind, float weight)
00634 {
00635 PARAM_VEC &newWeights = m_setting["weight"];
00636 string line = name + SPrint(ind) + "= " + SPrint(weight);
00637 newWeights.push_back(line);
00638 }
00639
00640 void Parameter::SetWeight(const std::string &name, size_t ind, const vector<float> &weights)
00641 {
00642 PARAM_VEC &newWeights = m_setting["weight"];
00643 string line = name + SPrint(ind) + "=";
00644
00645 for (size_t i = 0; i < weights.size(); ++i) {
00646 line += " " + SPrint(weights[i]);
00647 }
00648 newWeights.push_back(line);
00649 }
00650
00651 void
00652 Parameter::
00653 AddWeight(const std::string &name, size_t ind,
00654 const std::vector<float> &weights)
00655 {
00656 PARAM_VEC &newWeights = m_setting["weight"];
00657
00658 string sought = name + SPrint(ind) + "=";
00659 for (size_t i = 0; i < newWeights.size(); ++i) {
00660 string &line = newWeights[i];
00661 if (line.find(sought) == 0) {
00662
00663 for (size_t i = 0; i < weights.size(); ++i) {
00664 line += " " + SPrint(weights[i]);
00665 }
00666 return;
00667 }
00668 }
00669
00670
00671 SetWeight(name, ind, weights);
00672 }
00673
00674 void
00675 Parameter::
00676 ConvertWeightArgsSingleWeight(const string &oldWeightName, const string &newWeightName)
00677 {
00678 size_t ind = 0;
00679 PARAM_MAP::iterator iterMap;
00680
00681 iterMap = m_setting.find(oldWeightName);
00682 if (iterMap != m_setting.end()) {
00683 const PARAM_VEC &weights = iterMap->second;
00684 for (size_t i = 0; i < weights.size(); ++i) {
00685 SetWeight(newWeightName, ind, Scan<float>(weights[i]));
00686 }
00687
00688 m_setting.erase(iterMap);
00689 }
00690 }
00691
void
Parameter::
ConvertWeightArgsPhraseModel(const string &oldWeightName)
{
  // Convert old-style phrase-table configuration ([ttable-file], [weight-i],
  // [weight-t], [ttable-limit]) into new-style feature lines + weights.
  const PARAM_VEC *params;

  // Old [weight-i] (input weights, e.g. for confusion networks/lattices)
  // implies an [input-scores] section; synthesize it if needed.
  params = GetParam("weight-i");
  if (params) {
    vector<float> inputWeights = Scan<float>(*params);
    PARAM_VEC &numInputScores = m_setting["input-scores"];
    if (inputWeights.size() == 1) {
      UTIL_THROW_IF2(numInputScores.size() != 0, "No [input-scores] section allowed");
      numInputScores.push_back("1");
      numInputScores.push_back("0");
    } else if (inputWeights.size() == 2) {
      UTIL_THROW_IF2(numInputScores.size() != 0, "No [input-scores] section allowed");
      numInputScores.push_back("1");
      numInputScores.push_back("1");
    }

    // Input weights are attached to the binary phrase dictionary at index 0.
    SetWeight("PhraseDictionaryBinary", 0, inputWeights);
  }

  VERBOSE(2,"Creating phrase table features" << endl);

  size_t numInputScores = 0;
  size_t numRealWordsInInput = 0;
  map<string, size_t> ptIndices;  // running index per phrase-table type

  // Read back [input-scores] (either user-supplied or synthesized above).
  params = GetParam("input-scores");
  if (params) {
    numInputScores = Scan<size_t>(params->at(0));

    if (params->size() > 1) {
      numRealWordsInInput = Scan<size_t>(params->at(1));
    }
  }

  // Convert each [ttable-file] line into a feature-function line.
  params = GetParam("ttable-file");
  if (params) {

    const vector<string> translationVector = *params;

    vector<size_t> maxTargetPhrase;
    params = GetParam("ttable-limit");
    if (params) {
      maxTargetPhrase = Scan<size_t>(*params);
    }

    // A single ttable-limit is broadcast to all tables; otherwise the
    // number of limits must cover the number of tables.
    if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
      VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
      for(size_t i = 1; i < translationVector.size(); i++)
        maxTargetPhrase.push_back(maxTargetPhrase[0]);
    } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
      std::cerr << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
      return;
    }

    // Old weights are consumed sequentially across all tables.
    const PARAM_VEC &oldWeights = m_setting[oldWeightName];

    size_t currOldInd = 0;
    for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
      util::StringStream ptLine;

      vector<string> token = Tokenize(translationVector[currDict]);

      if(currDict == 0 && token.size() == 4) {
        std::cerr << "Phrase table specification in old 4-field format. No longer supported";
        return;
      }
      UTIL_THROW_IF2(token.size() < 5, "Phrase table must have at least 5 scores");

      // Field 0: numeric implementation id -> feature-function class name.
      int implementation = Scan<int>(token[0]);

      string ptType;
      switch (implementation) {
      case 0:
        ptType = "PhraseDictionaryMemory";
        break;
      case 1:
        ptType = "PhraseDictionaryBinary";
        break;
      case 2:
        ptType = "PhraseDictionaryOnDisk";
        break;
      case 6:
        ptType = "PhraseDictionaryMemory";
        break;
      case 12:
        ptType = "PhraseDictionaryCompact";
        break;
      case 8:
        ptType = "PhraseDictionarySuffixArray";
        break;
      case 14:
        ptType = "PhraseDictionaryDynSuffixArray";
        break;
      case 15:
        ptType = "PhraseDictionaryDynamicCacheBased";
        break;
      default:
        // NOTE(review): unknown ids leave ptType empty and still emit a
        // feature line below — confirm this is intended.
        break;
      }

      // Assign a per-type running index (0 for first table of its type).
      size_t ptInd;
      if (ptIndices.find(ptType) == ptIndices.end()) {
        ptIndices[ptType] = 0;
        ptInd = 0;
      } else {
        ptInd = ++ptIndices[ptType];
      }

      // Number-of-features field position differs for the 4-field format.
      size_t numFFInd = (token.size() == 4) ? 2 : 3;
      size_t numFF = Scan<size_t>(token[numFFInd]);

      // Pull this table's weights off the shared old-weight stream.
      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
                       "Errors converting old phrase-table weights to new weights");
        float weight = Scan<float>(oldWeights[currOldInd]);
        weights[currFF] = weight;

        ++currOldInd;
      }

      // Append (rather than overwrite) in case input weights were already
      // attached to this table above.
      AddWeight(ptType, ptInd, weights);

      // Build the new-style feature line:
      // "<Type> input-factor=.. output-factor=.. path=.. num-features=.. table-limit=.."
      ptLine << ptType << " ";
      ptLine << "input-factor=" << token[1] << " ";
      ptLine << "output-factor=" << token[2] << " ";
      ptLine << "path=" << token[4] << " ";

      vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
                                 ,output = Tokenize<FactorType>(token[2], ",");
      size_t numScoreComponent = Scan<size_t>(token[3]);
      string filePath= token[4];

      if(currDict==0) {
        // The first table additionally carries the input scores.
        numScoreComponent += numInputScores + numRealWordsInInput;
      }

      ptLine << "num-features=" << numScoreComponent << " ";
      ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";

      // Suffix-array implementations carry two extra path fields.
      if (implementation == 8 || implementation == 14) {
        ptLine << "target-path=" << token[5] << " ";
        ptLine << "alignment-path=" << token[6] << " ";
      }

      AddFeature(ptLine.str());
    }
  }

  // Remove all consumed old-style sections.
  m_setting.erase("weight-i");
  m_setting.erase(oldWeightName);
  m_setting.erase("ttable-file");
  m_setting.erase("ttable-limit");

}
00864
00865 void
00866 Parameter::
00867 AddFeature(const std::string &line)
00868 {
00869 PARAM_VEC &features = m_setting["feature"];
00870 features.push_back(line);
00871 }
00872
void
Parameter::
ConvertWeightArgsDistortion()
{
  // Convert old-style distortion configuration ([weight-d], [distortion-file])
  // into new-style Distortion / LexicalReordering features + weights.
  const string oldWeightName = "weight-d";
  const string oldLexReordingName = "distortion-file";

  const PARAM_VEC *oldWeights = GetParam(oldWeightName);

  if (oldWeights) {
    // The plain distance-based Distortion feature only exists for
    // phrase-based search (algorithms "0" and "1"), not chart decoding.
    const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
    if (searchAlgo == NULL ||
        (searchAlgo->size() > 0
         && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
        )
       ) {
      // Old weight 0 is the distance-based distortion weight.
      AddFeature("Distortion");
      SetWeight("Distortion", 0, Scan<float>(oldWeights->at(0)));
    }

    // Remaining old weights (from index 1 on) belong to the lexicalized
    // reordering tables, consumed sequentially across all tables.
    size_t currOldInd = 1;
    const PARAM_VEC *lextable = GetParam(oldLexReordingName);

    for (size_t indTable = 0; lextable && indTable < lextable->size(); ++indTable) {
      const string &line = lextable->at(indTable);
      vector<string> toks = Tokenize(line);

      // Old format per table: <factors> <type> <numFeatures> <path>.
      size_t numFF = Scan<size_t>(toks[2]);

      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(oldWeights && currOldInd >= oldWeights->size(),
                       "Errors converting old distortion weights to new weights");
        float weight = Scan<float>(oldWeights->at(currOldInd));
        weights[currFF] = weight;

        ++currOldInd;
      }
      SetWeight("LexicalReordering", indTable, weights);

      // Build the new-style feature line for this table.
      util::StringStream strme;
      strme << "LexicalReordering "
            << "type=" << toks[1] << " ";

      // Factor spec "in-out" must have exactly an input and an output factor.
      vector<FactorType> factors = Tokenize<FactorType>(toks[0], "-");
      UTIL_THROW_IF2(factors.size() != 2,
                     "Error in old factor specification for lexicalized reordering model: "
                     << toks[0]);
      strme << "input-factor=" << factors[0]
            << " output-factor=" << factors[1] << " ";

      strme << "num-features=" << toks[2] << " ";
      strme << "path=" << toks[3];

      AddFeature(strme.str());
    }
  }

  // Remove consumed old-style sections.
  m_setting.erase(oldWeightName);
  m_setting.erase(oldLexReordingName);

}
00939
00940 void
00941 Parameter::
00942 ConvertWeightArgsLM()
00943 {
00944 const string oldWeightName = "weight-l";
00945 const string oldFeatureName = "lmodel-file";
00946 const PARAM_VEC *params;
00947
00948 bool isChartDecoding = true;
00949
00950 params = GetParam("search-algorithm");
00951 if (params == NULL ||
00952 (params->size() > 0
00953 && (Trim(params->at(0)) == "0" || Trim(params->at(0)) == "1")
00954 )
00955 ) {
00956 isChartDecoding = false;
00957 }
00958
00959 vector<int> oovWeights;
00960 params = GetParam("lmodel-oov-feature");
00961 if (params) {
00962 oovWeights = Scan<int>(*params);
00963 }
00964
00965 PARAM_MAP::iterator iterMap;
00966
00967 iterMap = m_setting.find(oldWeightName);
00968 if (iterMap != m_setting.end()) {
00969
00970 size_t currOldInd = 0;
00971 const PARAM_VEC &weights = iterMap->second;
00972 const PARAM_VEC &models = m_setting[oldFeatureName];
00973 for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) {
00974 const string &line = models[lmIndex];
00975 vector<string> modelToks = Tokenize(line);
00976
00977 int lmType = Scan<int>(modelToks[0]);
00978
00979 string newFeatureName;
00980 switch (lmType) {
00981 case 0:
00982 newFeatureName = "SRILM";
00983 break;
00984 case 1:
00985 newFeatureName = "IRSTLM";
00986 break;
00987 case 8:
00988 case 9:
00989 newFeatureName = "KENLM";
00990 break;
00991 default:
00992 UTIL_THROW2("Unkown language model type id:" << lmType);
00993 }
00994
00995 size_t numFF = 1;
00996 if (oovWeights.size() > lmIndex)
00997 numFF += oovWeights[lmIndex];
00998
00999 vector<float> weightsLM(numFF);
01000 for (size_t currFF = 0; currFF < numFF; ++currFF) {
01001 UTIL_THROW_IF2(currOldInd >= weights.size(),
01002 "Errors converting old LM weights to new weights");
01003 weightsLM[currFF] = Scan<float>(weights[currOldInd]);
01004 if (isChartDecoding) {
01005 weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
01006 }
01007
01008 ++currOldInd;
01009 }
01010
01011 SetWeight(newFeatureName, lmIndex, weightsLM);
01012
01013 string featureLine = newFeatureName + " "
01014 + "factor=" + modelToks[1] + " "
01015 + "order=" + modelToks[2] + " "
01016 + "num-features=" + SPrint(numFF) + " ";
01017 if (lmType == 9) {
01018 featureLine += "load=lazy ";
01019 }
01020
01021 if(oovWeights.size() > lmIndex)
01022 featureLine += "oov-feature=1 ";
01023
01024 featureLine += "path=" + modelToks[3];
01025
01026 AddFeature(featureLine);
01027 }
01028
01029 m_setting.erase(iterMap);
01030 }
01031
01032 m_setting.erase(oldFeatureName);
01033 }
01034
void
Parameter::
ConvertWeightArgsGeneration(const std::string &oldWeightName, const std::string &newWeightName)
{
  // Convert old-style generation configuration ([weight-generation],
  // [generation-file]) into new-style Generation features + weights.
  string oldFeatureName = "generation-file";

  // operator[] inserts an empty entry if [weight-generation] is absent;
  // it is erased again at the end of this function.
  PARAM_VEC &oldWeights = m_setting[oldWeightName];

  if (oldWeights.size() > 0) {
    // Old weights are consumed sequentially across all generation tables.
    size_t currOldInd = 0;
    PARAM_VEC &models = m_setting[oldFeatureName];

    for (size_t indTable = 0; indTable < models.size(); ++indTable) {
      // Old format per table: <inFactor> <outFactor> <numFeatures> <path>.
      string &line = models[indTable];
      vector<string> modelToks = Tokenize(line);

      size_t numFF = Scan<size_t>(modelToks[2]);

      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
                       "Errors converting old generation weights to new weights");
        float weight = Scan<float>(oldWeights[currOldInd]);
        weights[currFF] = weight;

        ++currOldInd;
      }
      SetWeight(newWeightName, indTable, weights);

      // NOTE(review): the feature line hardcodes "Generation" rather than
      // using newWeightName — harmless for the only current caller
      // (which passes "Generation"), but confirm before reusing.
      util::StringStream strme;
      strme << "Generation "
            << "input-factor=" << modelToks[0] << " "
            << "output-factor=" << modelToks[1] << " "
            << "num-features=" << modelToks[2] << " "
            << "path=" << modelToks[3];
      AddFeature(strme.str());
    }
  }

  // Remove consumed old-style sections.
  m_setting.erase(oldWeightName);
  m_setting.erase(oldFeatureName);
}
01078
01079 void
01080 Parameter::
01081 ConvertWeightArgsWordPenalty()
01082 {
01083 const std::string oldWeightName = "weight-w";
01084 const std::string newWeightName = "WordPenalty";
01085
01086 bool isChartDecoding = true;
01087 const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
01088 if (searchAlgo == NULL ||
01089 (searchAlgo->size() > 0
01090 && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
01091 )
01092 ) {
01093 isChartDecoding = false;
01094 }
01095
01096 PARAM_MAP::iterator iterMap;
01097
01098 iterMap = m_setting.find(oldWeightName);
01099 if (iterMap != m_setting.end()) {
01100 const PARAM_VEC &weights = iterMap->second;
01101 for (size_t i = 0; i < weights.size(); ++i) {
01102 float weight = Scan<float>(weights[i]);
01103 if (isChartDecoding) {
01104 weight *= 0.434294482;
01105 }
01106 SetWeight(newWeightName, i, weight);
01107 }
01108
01109 m_setting.erase(iterMap);
01110 }
01111
01112 }
01113
01114 void
01115 Parameter::
01116 ConvertPhrasePenalty()
01117 {
01118 string oldWeightName = "weight-p";
01119 const PARAM_VEC *params = GetParam(oldWeightName);
01120 if (params) {
01121 UTIL_THROW_IF2(params->size() != 1,
01122 "There should be only 1 phrase-penalty weight");
01123 float weight = Scan<float>(params->at(0));
01124 AddFeature("PhrasePenalty");
01125 SetWeight("PhrasePenalty", 0, weight);
01126
01127 m_setting.erase(oldWeightName);
01128 }
01129 }
01130
void
Parameter::
ConvertWeightArgs()
{
  // Master conversion from old-style (per-section) weight configuration to
  // the new-style feature/weight format. The call order below matters:
  // each Convert* consumes and erases its old sections, and the phrase
  // model conversion must run last (it also consumes [weight-i]).

  // Discriminative LMs cannot be converted automatically.
  UTIL_THROW_IF2( m_setting.count("weight-dlm") != 0,
                  "Can't handle discr LM. must do it manually 'cos of bigram/n-gram split");

  // Warn (but do not abort) when both formats appear together.
  if (m_setting.count("weight") &&
      (m_setting.count("weight-i") || m_setting.count("weight-t") || m_setting.count("weight-w") ||
       m_setting.count("weight-l") || m_setting.count("weight-u") || m_setting.count("weight-lex") ||
       m_setting.count("weight-generation") || m_setting.count("weight-lr") || m_setting.count("weight-d")
      )) {
    cerr << "Do not mix old and new format for specify weights";
  }

  ConvertWeightArgsWordPenalty();
  ConvertWeightArgsLM();
  ConvertWeightArgsSingleWeight("weight-slm", "SyntacticLM");
  ConvertWeightArgsSingleWeight("weight-u", "UnknownWordPenalty");
  ConvertWeightArgsGeneration("weight-generation", "Generation");
  ConvertWeightArgsDistortion();

  // Simple one-section-to-one-feature conversions.
  ConvertWeightArgsSingleWeight("weight-lr", "LexicalReordering");
  ConvertWeightArgsSingleWeight("weight-bl", "BleuScoreFeature");
  ConvertWeightArgsSingleWeight("weight-glm", "GlobalLexicalModel");
  ConvertWeightArgsSingleWeight("weight-wt", "WordTranslationFeature");
  ConvertWeightArgsSingleWeight("weight-pp", "PhrasePairFeature");
  ConvertWeightArgsSingleWeight("weight-pb", "PhraseBoundaryFeature");

  ConvertWeightArgsSingleWeight("weight-e", "WordDeletion");
  ConvertWeightArgsSingleWeight("weight-lex", "GlobalLexicalReordering");

  ConvertPhrasePenalty();

  // These two features always exist in the new format.
  AddFeature("WordPenalty");
  AddFeature("UnknownWordPenalty");

  ConvertWeightArgsPhraseModel("weight-t");

}
01174
void
Parameter::
CreateWeightsMap()
{
  // Parse weights from both sections. [weight] is processed second on
  // purpose so its entries overwrite duplicates coming from [weight-add].
  CreateWeightsMap(m_setting["weight-add"]);
  CreateWeightsMap(m_setting["weight"]);
}
01182
01183 void
01184 Parameter::
01185 CreateWeightsMap(const PARAM_VEC &vec)
01186 {
01187 for (size_t i = 0; i < vec.size(); ++i) {
01188 const string &line = vec[i];
01189 vector<string> toks = Tokenize(line);
01190 UTIL_THROW_IF2(toks.size() < 2,
01191 "Error in format of weights: " << line);
01192
01193 string name = toks[0];
01194 name = name.substr(0, name.size() - 1);
01195
01196 vector<float> weights(toks.size() - 1);
01197 for (size_t i = 1; i < toks.size(); ++i) {
01198 float weight = Scan<float>(toks[i]);
01199 weights[i - 1] = weight;
01200 }
01201 m_weights[name] = weights;
01202 }
01203 }
01204
void
Parameter::
WeightOverwrite()
{
  // Apply [weight-overwrite]: a single line of the form
  //   "Name1= w w ... Name2= w x w ..."
  // where each "Name=" token starts a new feature's weight list, and a
  // literal "x" keeps the corresponding previously-parsed weight.
  PARAM_VEC &vec = m_setting["weight-overwrite"];

  if (vec.size() == 0)
    return;

  UTIL_THROW_IF2(vec.size() != 1,
                 "weight-overwrite should only be on 1 line");

  string name("");
  vector<float> weights;
  vector<string> toks = Tokenize(vec[0]);
  size_t cnt = 0;                               // position within the current feature's weights
  const std::vector<float>* oldWeights = NULL;  // previous weights of the current feature, if any
  for (size_t i = 0; i < toks.size(); ++i) {
    const string &tok = toks[i];

    if (ends_with(tok, "=")) {
      // A new feature starts: first flush the weights collected so far.
      if (name != "") {
        m_weights[name] = weights;
        weights.clear();
      }

      // Remember the feature's previous weights so "x" can copy from them.
      name = tok.substr(0, tok.size() - 1);
      std::map<std::string, std::vector<float> >::const_iterator found = m_weights.find(name);
      if (found!=m_weights.end()) {
        oldWeights = &(found->second);
      } else {
        oldWeights = NULL;
      }
      cnt = 0;
    } else {
      // A weight value: either keep the old one ("x") or parse a float.
      if (toks[i] == "x") {
        UTIL_THROW_IF2(!oldWeights || cnt>=oldWeights->size(),
                       "Keeping previous weight failed in weight-overwrite");
        weights.push_back(oldWeights->at(cnt));
      } else {
        float weight = Scan<float>(toks[i]);
        weights.push_back(weight);
      }
      ++cnt;
    }
  }

  // Flush the final feature's weights.
  if (name != "") {
    m_weights[name] = weights;
  }

}
01262
bool
Parameter::
Validate()
{
  // Check the parsed configuration for consistency. Returns true when no
  // problems were found; problems are reported to stderr.
  // NOTE(review): the m_setting[...] lookups below use operator[], which
  // inserts empty entries for missing keys as a side effect — confirm
  // nothing later relies on those keys being absent.
  bool noErrorFlag = true;

  // Every parameter name must be a registered/valid switch.
  PARAM_MAP::const_iterator iterParams;
  for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
    const std::string &key = iterParams->first;

    if (m_valid.find(key) == m_valid.end()) {
      std::cerr << "Unknown parameter " << key;
      noErrorFlag = false;
    }
  }

  // If LM upper bounds are given, there must be one per LM file.
  if (m_setting["lmodel-dub"].size() > 0) {
    if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) {
      std::cerr << "Config and parameters specify "
                << static_cast<int>(m_setting["lmodel-file"].size())
                << " language model files (lmodel-file), but "
                << static_cast<int>(m_setting["lmodel-dub"].size())
                << " LM upperbounds (lmodel-dub)"
                << endl;
      noErrorFlag = false;
    }
  }

  // The input file, when given, must exist.
  if (noErrorFlag && m_setting["input-file"].size() == 1) {
    noErrorFlag = FileExists(m_setting["input-file"][0]);
    if (!noErrorFlag) {
      std::cerr << endl << "Input file " << m_setting["input-file"][0] << " does not exist";
    }
  }

  // Generation tables (field 3 of each line) must exist, possibly gzipped.
  if (noErrorFlag) {
    std::vector<std::string> ext;

    ext.push_back("");
    ext.push_back(".gz");
    noErrorFlag = FilesExist("generation-file", 3, ext);
  }

  // Reordering tables must exist in plain, gzipped, or binarized form.
  if (noErrorFlag) {
    std::vector<std::string> ext;

    ext.push_back("");
    ext.push_back(".gz");

    ext.push_back(".binlexr.idx");

    ext.push_back(".minlexr");
    noErrorFlag = FilesExist("distortion-file", 3, ext);
  }
  return noErrorFlag;
}
01323
01325 bool
01326 Parameter::
01327 FilesExist(const string ¶mName, int fieldNo,
01328 std::vector<std::string> const& extensions)
01329 {
01330 typedef std::vector<std::string> StringVec;
01331 StringVec::const_iterator iter;
01332
01333 PARAM_MAP::const_iterator iterParam = m_setting.find(paramName);
01334 if (iterParam == m_setting.end()) {
01335
01336 return true;
01337 }
01338 const StringVec &pathVec = (*iterParam).second;
01339 for (iter = pathVec.begin() ; iter != pathVec.end() ; ++iter) {
01340 StringVec vec = Tokenize(*iter);
01341
01342 size_t tokenizeIndex;
01343 if (fieldNo == -1)
01344 tokenizeIndex = vec.size() - 1;
01345 else
01346 tokenizeIndex = static_cast<size_t>(fieldNo);
01347
01348 if (tokenizeIndex >= vec.size()) {
01349 std::cerr << "Expected at least " << (tokenizeIndex+1) << " tokens per entry in '"
01350 << paramName << "', but only found "
01351 << vec.size();
01352 return false;
01353 }
01354 const string &pathStr = vec[tokenizeIndex];
01355
01356 bool fileFound=0;
01357 for(size_t i=0; i<extensions.size() && !fileFound; ++i) {
01358 fileFound|=FileExists(pathStr + extensions[i]);
01359 }
01360 if(!fileFound) {
01361 std::cerr << "File " << pathStr << " does not exist";
01362 return false;
01363 }
01364 }
01365 return true;
01366 }
01367
01369
01370
01371 string
01372 Parameter::
01373 FindParam(const string ¶mSwitch, int argc, char const* argv[])
01374 {
01375 for (int i = 0 ; i < argc ; i++) {
01376 if (string(argv[i]) == paramSwitch) {
01377 if (i+1 < argc) {
01378 return argv[i+1];
01379 } else {
01380 std::cerr << "Option " << paramSwitch << " requires a parameter!";
01381
01382 }
01383 }
01384 }
01385 return "";
01386 }
01387
01393 void
01394 Parameter::
01395 OverwriteParam(const string ¶mSwitch, const string ¶mName,
01396 int argc, char const* argv[])
01397 {
01398 int startPos = -1;
01399 for (int i = 0 ; i < argc ; i++) {
01400 if (string(argv[i]) == paramSwitch) {
01401 startPos = i+1;
01402 break;
01403 }
01404 }
01405 if (startPos < 0)
01406 return;
01407
01408 int index = 0;
01409 m_setting[paramName];
01410 while (startPos < argc && (!isOption(argv[startPos]))) {
01411 if (m_setting[paramName].size() > (size_t)index)
01412 m_setting[paramName][index] = argv[startPos];
01413 else
01414 m_setting[paramName].push_back(argv[startPos]);
01415 index++;
01416 startPos++;
01417 }
01418 }
01419
01420
01422 bool
01423 Parameter::
01424 ReadConfigFile(const string &filePath )
01425 {
01426 InputFileStream inFile(filePath);
01427 string line, paramName;
01428 while(getline(inFile, line)) {
01429
01430 size_t comPos = line.find_first_of("#");
01431 if (comPos != string::npos)
01432 line = line.substr(0, comPos);
01433
01434 line = Trim(line);
01435
01436 if (line.size() == 0) {
01437
01438 } else if (line[0]=='[') {
01439
01440 for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
01441 if (line[currPos] == ']') {
01442 paramName = line.substr(1, currPos - 1);
01443 break;
01444 }
01445 }
01446 } else {
01447
01448 m_setting[paramName].push_back(line);
01449 }
01450 }
01451 return true;
01452 }
01453
01454 struct Credit {
01455 string name, contact, currentPursuits, areaResponsibility;
01456 int sortId;
01457
01458 Credit(string name, string contact, string currentPursuits, string areaResponsibility) {
01459 this->name = name ;
01460 this->contact = contact ;
01461 this->currentPursuits = currentPursuits ;
01462 this->areaResponsibility = areaResponsibility;
01463 this->sortId = util::rand_excl(1000);
01464 }
01465
01466 bool operator<(const Credit &other) const {
01467
01468
01469
01470
01471
01472
01473
01474
01475 return sortId < other.sortId;
01476 }
01477
01478 };
01479
01480 std::ostream& operator<<(std::ostream &os, const Credit &credit)
01481 {
01482 os << credit.name;
01483 if (credit.contact != "")
01484 os << "\t contact: " << credit.contact;
01485 if (credit.currentPursuits != "")
01486 os << " " << credit.currentPursuits;
01487 if (credit.areaResponsibility != "")
01488 os << " I'll answer question on: " << credit.areaResponsibility;
01489 return os;
01490 }
01491
void
Parameter::
PrintCredit()
{
  // Print license text and a randomly-shuffled list of contributors.
  vector<Credit> everyone;
  // NOTE(review): srand() is called here, but Credit uses util::rand_excl
  // for its shuffle key — confirm this seeding is still needed.
  srand ( time(NULL) );

  everyone.push_back(Credit("Nicola Bertoldi"
                            , "911"
                            , ""
                            , "scripts & other stuff"));
  everyone.push_back(Credit("Ondrej Bojar"
                            , ""
                            , "czech this out!"
                            , ""));
  everyone.push_back(Credit("Chris Callison-Burch"
                            , "anytime, anywhere"
                            , "international playboy"
                            , ""));
  everyone.push_back(Credit("Alexandra Constantin"
                            , ""
                            , "eu sunt varza"
                            , ""));
  everyone.push_back(Credit("Brooke Cowan"
                            , "brooke@csail.mit.edu"
                            , "if you're going to san francisco, be sure to wear a flower in your hair"
                            , ""));
  everyone.push_back(Credit("Chris Dyer"
                            , "can't. i'll be out driving my mustang"
                            , "driving my mustang"
                            , ""));
  everyone.push_back(Credit("Marcello Federico"
                            , "federico at itc at it"
                            , "Researcher at ITC-irst, Trento, Italy"
                            , "IRST language model"));
  everyone.push_back(Credit("Evan Herbst"
                            , "Small college in upstate New York"
                            , ""
                            , ""));
  everyone.push_back(Credit("Philipp Koehn"
                            , "only between 2 and 4am"
                            , ""
                            , "Nothing fazes this dude"));
  everyone.push_back(Credit("Christine Moran"
                            , "weird building at MIT"
                            , ""
                            , ""));
  everyone.push_back(Credit("Wade Shen"
                            , "via morse code"
                            , "buying another laptop"
                            , ""));
  everyone.push_back(Credit("Richard Zens"
                            , "richard at aachen dot de"
                            , ""
                            , "ambiguous source input, confusion networks, confusing source code"));
  everyone.push_back(Credit("Hieu Hoang", "http://www.hoang.co.uk/hieu/"
                            , "phd student at Edinburgh Uni. Original Moses developer"
                            , "general queries/ flames on Moses."));

  // Sorting by each Credit's random sortId shuffles the list.
  sort(everyone.begin(), everyone.end());


  cerr << "Moses - A beam search decoder for phrase-based statistical machine translation models" << endl
       << "Copyright (C) 2006 University of Edinburgh" << endl << endl

       << "This library is free software; you can redistribute it and/or" << endl
       << "modify it under the terms of the GNU Lesser General Public" << endl
       << "License as published by the Free Software Foundation; either" << endl
       << "version 2.1 of the License, or (at your option) any later version." << endl << endl

       << "This library is distributed in the hope that it will be useful," << endl
       << "but WITHOUT ANY WARRANTY; without even the implied warranty of" << endl
       << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU" << endl
       << "Lesser General Public License for more details." << endl << endl

       << "You should have received a copy of the GNU Lesser General Public" << endl
       << "License along with this library; if not, write to the Free Software" << endl
       << "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA" << endl << endl
       << "***********************************************************************" << endl << endl
       << "Built on " << __DATE__ << " at " __TIME__ << endl << endl
       << "WHO'S FAULT IS THIS GODDAM SOFTWARE:" << endl;

  // Emit one credit per line.
  ostream_iterator<Credit> out(cerr, "\n");
  copy(everyone.begin(), everyone.end(), out);
  cerr << endl << endl;
}
01578
01582 void
01583 Parameter::
01584 OverwriteParam(const string ¶mName, PARAM_VEC values)
01585 {
01586 VERBOSE(2,"Overwriting parameter " << paramName);
01587
01588 m_setting[paramName];
01589 if (m_setting[paramName].size() > 1) {
01590 VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
01591 UTIL_THROW_IF2(m_setting[paramName].size() != values.size(),
01592 "Number of weight override for " << paramName
01593 << " is not the same as the original number of weights");
01594 } else {
01595 VERBOSE(2," (the parameter does not have previous values)");
01596 m_setting[paramName].resize(values.size());
01597 }
01598 VERBOSE(2," with the following values:");
01599 int i=0;
01600 for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++) {
01601 m_setting[paramName][i] = *iter;
01602 VERBOSE(2, " " << *iter);
01603 }
01604 VERBOSE(2, std::endl);
01605 }
01606
void
Parameter::
PrintFF() const
{
  // Delegate to the global feature registry to list all registered
  // feature functions.
  StaticData::Instance().GetFeatureRegistry().PrintFF();
}
01613
01614 std::set<std::string>
01615 Parameter::
01616 GetWeightNames() const
01617 {
01618 std::set<std::string> ret;
01619 std::map<std::string, std::vector<float> >::const_iterator iter;
01620 for (iter = m_weights.begin(); iter != m_weights.end(); ++iter) {
01621 const string &key = iter->first;
01622 ret.insert(key);
01623 }
01624 return ret;
01625 }
01626
01627 void
01628 Parameter::
01629 Save(const std::string path)
01630 {
01631 ofstream file;
01632 file.open(path.c_str());
01633
01634 PARAM_MAP::const_iterator iterOuter;
01635 for (iterOuter = m_setting.begin(); iterOuter != m_setting.end(); ++iterOuter) {
01636 const std::string §ionName = iterOuter->first;
01637 file << "[" << sectionName << "]" << endl;
01638
01639 const PARAM_VEC &values = iterOuter->second;
01640
01641 PARAM_VEC::const_iterator iterInner;
01642 for (iterInner = values.begin(); iterInner != values.end(); ++iterInner) {
01643 const std::string &value = *iterInner;
01644 file << value << endl;
01645 }
01646
01647 file << endl;
01648 }
01649
01650
01651 file.close();
01652 }
01653
01654 template<>
01655 void
01656 Parameter::
01657 SetParameter<bool>(bool ¶meter, std::string const& parameterName,
01658 bool const& defaultValue) const
01659 {
01660 const PARAM_VEC *params = GetParam(parameterName);
01661
01662
01663 parameter = defaultValue;
01664 if (params == NULL) {
01665 return;
01666 }
01667
01668
01669 if (params->size() == 0) {
01670 parameter = true;
01671 }
01672
01673 else if (params->size() == 1) {
01674 parameter = Scan<bool>( params->at(0));
01675 }
01676 }
01677
void
Parameter::
SetParameter(bool& var, std::string const& name)
{
  // Convenience overload: boolean switches default to false when absent.
  SetParameter(var,name,false);
}
01684
01685 }
01686
01687