00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <algorithm>
00021 #include <cstdlib>
00022 #include <ctime>
00023 #include <string>
00024 #include <vector>
00025 #include <map>
00026
00027 #include <boost/program_options.hpp>
00028 #include <boost/algorithm/string.hpp>
00029
00030 #ifdef MPI_ENABLE
00031 #include <boost/mpi.hpp>
00032 namespace mpi = boost::mpi;
00033 #endif
00034
00035 #include "Main.h"
00036 #include "Optimiser.h"
00037 #include "Hildreth.h"
00038 #include "HypothesisQueue.h"
00039 #include "moses/StaticData.h"
00040 #include "moses/ChartTrellisPathList.h"
00041 #include "moses/ChartTrellisPath.h"
00042 #include "moses/ScoreComponentCollection.h"
00043 #include "moses/ThreadPool.h"
00044 #include "moses/LexicalReordering.h"
00045 #include "mert/BleuScorer.h"
00046 #include "moses/FeatureVector.h"
00047
00048 #include "moses/FF/WordTranslationFeature.h"
00049 #include "moses/FF/PhrasePairFeature.h"
00050 #include "moses/FF/WordPenaltyProducer.h"
00051
00052 using namespace Mira;
00053 using namespace std;
00054 using namespace Moses;
00055 namespace po = boost::program_options;
00056
00057 int main(int argc, char** argv)
00058 {
00059 size_t rank = 0;
00060 size_t size = 1;
00061 #ifdef MPI_ENABLE
00062 mpi::environment env(argc,argv);
00063 mpi::communicator world;
00064 rank = world.rank();
00065 size = world.size();
00066 #endif
00067
00068 bool help;
00069 int verbosity;
00070 string mosesConfigFile;
00071 string inputFile;
00072 vector<string> referenceFiles;
00073 vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
00074
00075 size_t epochs;
00076 string learner;
00077 bool shuffle;
00078 size_t mixingFrequency;
00079 size_t weightDumpFrequency;
00080 string weightDumpStem;
00081 bool scale_margin, scale_margin_precision;
00082 bool scale_update, scale_update_precision;
00083 size_t n;
00084 size_t batchSize;
00085 bool distinctNbest;
00086 bool accumulateWeights;
00087 float historySmoothing;
00088 bool scaleByInputLength, scaleByAvgInputLength;
00089 bool scaleByInverseLength, scaleByAvgInverseLength;
00090 float scaleByX;
00091 float slack;
00092 bool averageWeights;
00093 bool weightConvergence;
00094 float learning_rate;
00095 float mira_learning_rate;
00096 float perceptron_learning_rate;
00097 string decoder_settings;
00098 float min_weight_change;
00099 bool normaliseWeights, normaliseMargin;
00100 bool print_feature_values;
00101 bool historyBleu ;
00102 bool sentenceBleu;
00103 bool perceptron_update;
00104 bool hope_fear;
00105 bool model_hope_fear;
00106 int hope_n, fear_n;
00107 size_t bleu_smoothing_scheme;
00108 float min_oracle_bleu;
00109 float minBleuRatio, maxBleuRatio;
00110 bool boost;
00111 bool decode_hope, decode_fear, decode_model;
00112 string decode_filename;
00113 bool batchEqualsShard;
00114 bool sparseAverage, dumpMixedWeights, sparseNoAverage;
00115 int featureCutoff;
00116 bool pruneZeroWeights;
00117 bool printFeatureCounts, printNbestWithFeatures;
00118 bool avgRefLength;
00119 bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
00120 float scale_lm_factor, scale_wp_factor;
00121 bool kbest;
00122 string moses_src;
00123 float sigmoidParam;
00124 float bleuWeight, bleuWeight_hope, bleuWeight_fear;
00125 bool bleu_weight_lm, bleu_weight_lm_adjust;
00126 float bleu_weight_lm_factor;
00127 bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
00128 float l1_lambda, l2_lambda;
00129 bool most_violated, most_violated_reg, all_violated, max_bleu_diff, one_against_all;
00130 bool feature_confidence, signed_counts;
00131 float decay_core, decay_sparse, core_r0, sparse_r0;
00132 bool selective, summed;
00133 float bleu_weight_fear_factor;
00134 bool hildreth;
00135 float add2lm;
00136 bool realBleu, disableBleuFeature;
00137 bool rescaleSlack;
00138 bool makePairs;
00139 bool debug;
00140 bool reg_on_every_mix;
00141 size_t continue_epoch;
00142 bool modelPlusBleu, simpleHistoryBleu;
00143 po::options_description desc("Allowed options");
00144 desc.add_options()
00145 ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
00146 ("freq-reg", po::value<bool>(®_on_every_mix)->default_value(false), "Regularize after every weight mixing")
00147 ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
00148 ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
00149 ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
00150 ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
00151 ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
00152 ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
00153 ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
00154 ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
00155 ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
00156 ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
00157 ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
00158 ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
00159 ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
00160 ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
00161 ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
00162
00163 ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
00164 ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
00165 ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
00166
00167 ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
00168 ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
00169
00170 ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
00171 ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
00172 ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
00173 ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
00174 ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
00175 ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
00176 ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
00177 ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
00178 ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
00179 ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
00180 ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
00181 ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
00182 ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
00183 ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
00184 ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
00185 ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
00186 ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
00187 ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
00188 ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
00189 ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
00190 ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
00191 ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
00192 ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
00193 ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
00194 ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
00195 ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
00196 ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
00197 ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
00198 ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
00199 ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
00200 ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
00201 ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
00202 ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
00203 ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
00204 ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
00205 ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
00206 ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
00207 ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
00208 ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
00209 ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
00210 ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
00211 ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
00212 ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
00213 ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
00214 ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
00215 ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
00216 ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
00217 ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
00218 ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
00219 ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
00220 ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
00221 ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
00222 ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
00223 ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
00224 ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
00225 ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
00226 ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
00227 ("kbest", po::value<bool>(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations")
00228
00229 ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
00230 ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length")
00231 ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
00232 ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
00233 ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
00234 ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
00235 ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
00236 ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
00237 ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
00238 ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
00239 ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
00240 ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
00241 ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
00242 ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
00243 ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
00244 ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
00245 ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
00246 ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
00247 ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
00248 ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
00249 ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
00250 ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
00251 ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
00252
00253 po::options_description cmdline_options;
00254 cmdline_options.add(desc);
00255 po::variables_map vm;
00256 po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
00257 po::notify(vm);
00258
00259 if (help) {
00260 std::cout << "Usage: " + string(argv[0])
00261 + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
00262 std::cout << desc << std::endl;
00263 return 0;
00264 }
00265
00266 const StaticData &staticData = StaticData::Instance();
00267
00268 bool trainWithMultipleFolds = false;
00269 if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
00270 if (rank == 0)
00271 cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
00272 trainWithMultipleFolds = true;
00273 }
00274
00275 if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
00276 cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
00277 exit(1);
00278 }
00279
00280 if ((sparseAverage || sparseNoAverage) && averageWeights) {
00281 cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
00282 exit(1);
00283 }
00284
00285 if (trainWithMultipleFolds) {
00286 if (!mosesConfigFilesFolds.size()) {
00287 cerr << "Error: No moses ini files specified for training with folds" << endl;
00288 exit(1);
00289 }
00290
00291 if (!inputFilesFolds.size()) {
00292 cerr << "Error: No input files specified for training with folds" << endl;
00293 exit(1);
00294 }
00295
00296 if (!referenceFilesFolds.size()) {
00297 cerr << "Error: No reference files specified for training with folds" << endl;
00298 exit(1);
00299 }
00300 } else {
00301 if (mosesConfigFile.empty()) {
00302 cerr << "Error: No moses ini file specified" << endl;
00303 return 1;
00304 }
00305
00306 if (inputFile.empty()) {
00307 cerr << "Error: No input file specified" << endl;
00308 return 1;
00309 }
00310
00311 if (!referenceFiles.size()) {
00312 cerr << "Error: No reference files specified" << endl;
00313 return 1;
00314 }
00315 }
00316
00317
00318 vector<string> inputSentences;
00319 size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
00320 size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
00321 vector<vector<string> > inputSentencesFolds(inputSize);
00322 vector<vector<string> > referenceSentences(refSize);
00323
00324
00325 size_t coresPerFold = 0, myFold = 0;
00326 if (trainWithMultipleFolds) {
00327 if (mosesConfigFilesFolds.size() > size) {
00328 cerr << "Number of cores has to be a multiple of the number of folds" << endl;
00329 exit(1);
00330 }
00331 coresPerFold = size/mosesConfigFilesFolds.size();
00332 if (size % coresPerFold > 0) {
00333 cerr << "Number of cores has to be a multiple of the number of folds" << endl;
00334 exit(1);
00335 }
00336
00337 if (rank == 0)
00338 cerr << "Number of cores per fold: " << coresPerFold << endl;
00339 myFold = rank/coresPerFold;
00340 cerr << "Rank " << rank << ", my fold: " << myFold << endl;
00341 }
00342
00343
00344 if (trainWithMultipleFolds) {
00345 if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
00346 cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
00347 exit(1);
00348 }
00349 VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);
00350
00351 if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
00352 cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
00353 exit(1);
00354 }
00355 if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
00356 cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
00357 << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
00358 exit(1);
00359 }
00360 VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
00361 } else {
00362 if (!loadSentences(inputFile, inputSentences)) {
00363 cerr << "Error: Failed to load input sentences from " << inputFile << endl;
00364 return 1;
00365 }
00366
00367 for (size_t i = 0; i < referenceFiles.size(); ++i) {
00368 if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
00369 cerr << "Error: Failed to load reference sentences from "
00370 << referenceFiles[i] << endl;
00371 return 1;
00372 }
00373 if (referenceSentences[i].size() != inputSentences.size()) {
00374 cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
00375 << referenceSentences[i].size() << ") length of reference file " << i
00376 << endl;
00377 return 1;
00378 }
00379 }
00380 }
00381
00382 if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
00383 scaleByInputLength = false;
00384
00385 if (historyBleu || simpleHistoryBleu) {
00386 sentenceBleu = false;
00387 cerr << "Using history Bleu. " << endl;
00388 }
00389
00390 if (kbest) {
00391 realBleu = true;
00392 disableBleuFeature = true;
00393 cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
00394 }
00395
00396
00397
00398 boost::trim(decoder_settings);
00399 decoder_settings += " -mira -distinct-nbest -references";
00400 if (trainWithMultipleFolds) {
00401 decoder_settings += " ";
00402 decoder_settings += referenceFilesFolds[myFold];
00403 } else {
00404 for (size_t i=0; i < referenceFiles.size(); ++i) {
00405 decoder_settings += " ";
00406 decoder_settings += referenceFiles[i];
00407 }
00408 }
00409
00410 vector<string> decoder_params;
00411 boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
00412
00413 string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
00414 VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
00415 MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
00416 decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
00417 scaleByInverseLength, scaleByAvgInverseLength,
00418 scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
00419 bool chartDecoding = staticData.IsChart();
00420
00421
00422 vector<size_t> order;
00423 if (trainWithMultipleFolds) {
00424 for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
00425 order.push_back(i);
00426 }
00427 } else {
00428 if (rank == 0) {
00429 for (size_t i = 0; i < inputSentences.size(); ++i) {
00430 order.push_back(i);
00431 }
00432 }
00433 }
00434
00435
00436 Optimiser* optimiser = NULL;
00437 if (learner == "mira") {
00438 if (rank == 0) {
00439 cerr << "Optimising using Mira" << endl;
00440 cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
00441 cerr << "selective: " << selective << endl;
00442 if (normaliseMargin)
00443 cerr << "sigmoid parameter: " << sigmoidParam << endl;
00444 }
00445 optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision,
00446 scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam);
00447 learning_rate = mira_learning_rate;
00448 perceptron_update = false;
00449 } else if (learner == "perceptron") {
00450 if (rank == 0) {
00451 cerr << "Optimising using Perceptron" << endl;
00452 }
00453 optimiser = new Perceptron();
00454 learning_rate = perceptron_learning_rate;
00455 perceptron_update = true;
00456 model_hope_fear = false;
00457 hope_fear = false;
00458 n = 1;
00459 hope_n = 1;
00460 fear_n = 1;
00461 } else {
00462 cerr << "Error: Unknown optimiser: " << learner << endl;
00463 return 1;
00464 }
00465
00466
00467 if (batchSize > 1 && perceptron_update) {
00468 batchSize = 1;
00469 cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
00470 }
00471
00472 if (hope_n == -1)
00473 hope_n = n;
00474 if (fear_n == -1)
00475 fear_n = n;
00476
00477 if (model_hope_fear || kbest)
00478 hope_fear = false;
00479 if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
00480 cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
00481 return 1;
00482 }
00483
00484 #ifdef MPI_ENABLE
00485 if (!trainWithMultipleFolds)
00486 mpi::broadcast(world, order, 0);
00487 #endif
00488
00489
00490 vector<size_t> shard;
00491 if (trainWithMultipleFolds) {
00492 size_t shardSize = order.size()/coresPerFold;
00493 size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
00494 size_t shardEnd = shardStart + shardSize;
00495 if (rank % coresPerFold == coresPerFold - 1) {
00496 shardEnd = order.size();
00497 shardSize = shardEnd - shardStart;
00498 }
00499 VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
00500 VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
00501 shard.resize(shardSize);
00502 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00503 batchSize = 1;
00504 } else {
00505 size_t shardSize = order.size() / size;
00506 size_t shardStart = (size_t) (shardSize * rank);
00507 size_t shardEnd = (size_t) (shardSize * (rank + 1));
00508 if (rank == size - 1) {
00509 shardEnd = order.size();
00510 shardSize = shardEnd - shardStart;
00511 }
00512 VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
00513 VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
00514 shard.resize(shardSize);
00515 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00516 if (batchEqualsShard)
00517 batchSize = shardSize;
00518 }
00519
00520
00521 const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
00522 ScoreComponentCollection initialWeights = decoder->getWeights();
00523
00524 if (add2lm != 0) {
00525 const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00526 for (size_t i = 0; i < statefulFFs.size(); ++i) {
00527 const StatefulFeatureFunction *ff = statefulFFs[i];
00528 const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
00529
00530 if (lm) {
00531 float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
00532 initialWeights.Assign(lm, lmWeight);
00533 cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
00534 }
00535 }
00536 }
00537
00538 if (normaliseWeights) {
00539 initialWeights.L1Normalise();
00540 cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
00541 }
00542
00543 decoder->setWeights(initialWeights);
00544
00545
00546 if (bleu_weight_lm) {
00547 float lmSum = 0;
00548 const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00549 for (size_t i = 0; i < statefulFFs.size(); ++i) {
00550 const StatefulFeatureFunction *ff = statefulFFs[i];
00551 const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
00552
00553 if (lm) {
00554 lmSum += abs(initialWeights.GetScoreForProducer(lm));
00555 }
00556 }
00557
00558 bleuWeight = lmSum * bleu_weight_lm_factor;
00559 cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
00560 }
00561
00562 if (bleuWeight_hope == -1) {
00563 bleuWeight_hope = bleuWeight;
00564 }
00565 if (bleuWeight_fear == -1) {
00566 bleuWeight_fear = bleuWeight;
00567 }
00568 bleuWeight_fear *= bleu_weight_fear_factor;
00569 cerr << "Bleu weight: " << bleuWeight << endl;
00570 cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
00571
00572 if (decode_hope || decode_fear || decode_model) {
00573 size_t decode = 1;
00574 if (decode_fear) decode = 2;
00575 if (decode_model) decode = 3;
00576 decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
00577 }
00578
00579
00580 ScoreComponentCollection cumulativeWeights;
00581 ScoreComponentCollection cumulativeWeightsBinary;
00582 size_t numberOfUpdates = 0;
00583 size_t numberOfUpdatesThisEpoch = 0;
00584
00585 time_t now;
00586 time(&now);
00587 cerr << "Rank " << rank << ", " << ctime(&now);
00588
00589 float avgInputLength = 0;
00590 float sumOfInputs = 0;
00591 size_t numberOfInputs = 0;
00592
00593 ScoreComponentCollection mixedWeights;
00594 ScoreComponentCollection mixedWeightsPrevious;
00595 ScoreComponentCollection mixedWeightsBeforePrevious;
00596 ScoreComponentCollection mixedAverageWeights;
00597 ScoreComponentCollection mixedAverageWeightsPrevious;
00598 ScoreComponentCollection mixedAverageWeightsBeforePrevious;
00599
00600 bool stop = false;
00601
00602 float epsilon = 0.0001;
00603
00604
00605 ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
00606 featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
00607 cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
00608
00609 for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
00610 if (shuffle) {
00611 if (trainWithMultipleFolds || rank == 0) {
00612 cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
00613 RandomIndex rindex;
00614 random_shuffle(order.begin(), order.end(), rindex);
00615 }
00616
00617 #ifdef MPI_ENABLE
00618 if (!trainWithMultipleFolds)
00619 mpi::broadcast(world, order, 0);
00620 #endif
00621
00622
00623 if (trainWithMultipleFolds) {
00624 size_t shardSize = order.size()/coresPerFold;
00625 size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
00626 size_t shardEnd = shardStart + shardSize;
00627 if (rank % coresPerFold == coresPerFold - 1) {
00628 shardEnd = order.size();
00629 shardSize = shardEnd - shardStart;
00630 }
00631 VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
00632 VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
00633 shard.resize(shardSize);
00634 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00635 batchSize = 1;
00636 } else {
00637 size_t shardSize = order.size()/size;
00638 size_t shardStart = (size_t) (shardSize * rank);
00639 size_t shardEnd = (size_t) (shardSize * (rank + 1));
00640 if (rank == size - 1) {
00641 shardEnd = order.size();
00642 shardSize = shardEnd - shardStart;
00643 }
00644 VERBOSE(1, "Shard size: " << shardSize << endl);
00645 VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
00646 shard.resize(shardSize);
00647 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00648 if (batchEqualsShard)
00649 batchSize = shardSize;
00650 }
00651 }
00652
00653
00654
00655
00656 numberOfUpdatesThisEpoch = 0;
00657
00658 if (!accumulateWeights) {
00659 cumulativeWeights.ZeroAll();
00660 cumulativeWeightsBinary.ZeroAll();
00661 }
00662
00663
00664 size_t weightMixingThisEpoch = 0;
00665 size_t weightEpochDump = 0;
00666
00667 size_t shardPosition = 0;
00668 vector<size_t>::const_iterator sid = shard.begin();
while (sid != shard.end()) {

// Per-batch n-best containers: outer index = batch position,
// inner vector = entries of that sentence's n-best list.
vector<vector<ScoreComponentCollection> > featureValues;
vector<vector<float> > bleuScores;
vector<vector<float> > modelScores;

// Separate hope/fear containers for the hope-fear style updates.
vector<vector<ScoreComponentCollection> > featureValuesHope;
vector<vector<ScoreComponentCollection> > featureValuesFear;
vector<vector<float> > bleuScoresHope;
vector<vector<float> > bleuScoresFear;
vector<vector<float> > modelScoresHope;
vector<vector<float> > modelScoresFear;

// Current decoder weights at the start of this batch.
ScoreComponentCollection mosesWeights = decoder->getWeights();
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);

if (historyBleu || simpleHistoryBleu) {
decoder->printBleuFeatureHistory(cerr);
}

// Oracle (hope) statistics and data needed to update the Bleu history.
vector<float> oracleBleuScores;
vector<float> oracleModelScores;
vector<vector<const Word*> > oneBests;
vector<ScoreComponentCollection> oracleFeatureValues;
vector<size_t> inputLengths;
vector<size_t> ref_ids;
size_t actualBatchSize = 0;

vector<size_t>::const_iterator current_sid_start = sid;
size_t examples_in_batch = 0;
bool skip_example = false;
// Collect up to batchSize examples (or until the shard is exhausted).
for (size_t batchPosition = 0; batchPosition < batchSize && sid
!= shard.end(); ++batchPosition) {
string input;
if (trainWithMultipleFolds)
input = inputSentencesFolds[myFold][*sid];
else
input = inputSentences[*sid];

// Parse the raw input line into a Moses Sentence.
// NOTE(review): 'sentence' is allocated with new and no matching delete
// is visible in this scope — possible per-example leak; verify ownership.
Moses::Sentence *sentence = new Sentence();
stringstream in(input + "\n");
const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
sentence->Read(in,inputFactorOrder);
cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
sentence->Print(cerr);
cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
size_t current_input_length = (*sentence).GetSize();

// During the first epoch, maintain a running average input length used
// for Bleu scaling.
if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
sumOfInputs += current_input_length;
++numberOfInputs;
avgInputLength = sumOfInputs/numberOfInputs;
decoder->setAvgInputLength(avgInputLength);
cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
}

// Seed this batch position with empty slots in whichever containers the
// selected training mode uses.
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newScores;
if (model_hope_fear) {
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);
}
if (hope_fear || perceptron_update) {
featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newScores);
bleuScoresFear.push_back(newScores);
modelScoresHope.push_back(newScores);
modelScoresFear.push_back(newScores);
if (historyBleu || simpleHistoryBleu || debug_model) {
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);
}
}
if (kbest) {
// kbest mode uses both the plain and the hope/fear containers.
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);


featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newScores);
bleuScoresFear.push_back(newScores);
modelScoresHope.push_back(newScores);
modelScoresFear.push_back(newScores);
}

size_t ref_length;
float avg_ref_length;

// Optional weight diagnostics.
if (print_weights)
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
if (print_core_weights) {
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
mosesWeights.PrintCoreFeatures();
cerr << endl;
}
00773
00774
// Sanity-check every language-model weight: a non-positive LM weight is
// considered invalid and is reset to a small positive value (0.1).
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
const StatefulFeatureFunction *ff = statefulFFs[i];
// Only stateful FFs that are actually language models are inspected.
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);

if (lm) {
float lmWeight = mosesWeights.GetScoreForProducer(lm);
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
if (lmWeight <= 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
mosesWeights.Assign(lm, 0.1);
cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
}
}
}
00790
00791
cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
// --- Hope/fear (and perceptron) decoding for this example ---
if (hope_fear || perceptron_update) {
// HOPE: n-best wrt model score + Bleu (Bleu weight +1.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
"best hope translations" << endl;
vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> oracle = outputHope[0];
decoder->cleanup(chartDecoding);
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
avg_ref_length = ref_length;
float hope_length_ratio = (float)oracle.size()/ref_length;
int oracleSize = (int)oracle.size();
cerr << endl;

// Count sparse features that fired in the hope translation.
featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();

// Normalise the hope Bleu into a "precision" value, using whichever
// scaling scheme is active, and pass it to the optimiser if requested.
float precision = bleuScoresHope[batchPosition][0];
if (historyBleu || simpleHistoryBleu) {
precision /= decoder->getTargetLengthHistory();
} else {
if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength());
precision /= scaleByX;
}
if (scale_margin_precision || scale_update_precision) {
if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
((MiraOptimiser*) optimiser)->setPrecision(precision);
}
}

// MODEL: 1-best wrt pure model score, needed for debugging and for the
// Bleu history update.
vector<const Word*> bestModel;
if (debug_model || historyBleu || simpleHistoryBleu) {

cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
bestModel = outputModel[0];
decoder->cleanup(chartDecoding);
cerr << endl;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
}

// FEAR: n-best wrt model score - Bleu (Bleu weight -1.0).
float fear_length_ratio = 0;
float bleuRatioHopeFear = 0;
int fearSize = 0;
cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> fear = outputFear[0];
decoder->cleanup(chartDecoding);
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
avg_ref_length += ref_length;
avg_ref_length /= 2;
fear_length_ratio = (float)fear.size()/ref_length;
fearSize = (int)fear.size();
cerr << endl;
// Fear words are owned here; free them immediately.
// NOTE(review): the hope ('oracle') words are not freed in this scope —
// verify they are released elsewhere.
for (size_t i = 0; i < fear.size(); ++i)
delete fear[i];

// Count sparse features that fired in the fear translation.
featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();

// --- Skip heuristics: drop degenerate hope/fear pairs ---
bool skip = false;
// Hope/fear Bleu ratio bounds.
bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
skip = true;
if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
skip = true;

// With Bleu history active, hope must dominate model, and model must
// dominate fear; violations (beyond epsilon) invalidate the example.
if (historyBleu || simpleHistoryBleu) {
if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
skip = true;
}
}
if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
skip = true;
}
}
}
// Fear must never out-Bleu hope.
if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {

skip = true;
if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
} else {
cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
}
}
}

if (skip) {
// Discard this example's n-best data; its batch slot stays empty.
cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
featureValuesHope[batchPosition].clear();
featureValuesFear[batchPosition].clear();
bleuScoresHope[batchPosition].clear();
bleuScoresFear[batchPosition].clear();
if (historyBleu || simpleHistoryBleu || debug_model) {
featureValues[batchPosition].clear();
bleuScores[batchPosition].clear();
}
} else {
examples_in_batch++;

// Remember what is needed for the Bleu history update after the batch.
if (historyBleu || simpleHistoryBleu) {
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
oneBests.push_back(bestModel);
}
}
}
// --- Combined model/hope/fear decoding: all three n-best lists are
// appended into the same per-batch containers ---
if (model_hope_fear) {
// HOPE n-best (Bleu weight +1.0); its first entry is the oracle.
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");


inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
decoder->cleanup(chartDecoding);


cerr << endl;

// Record the oracle entry for the loss computation later on.
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);

// MODEL n-best (Bleu weight 0.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
if (historyBleu || simpleHistoryBleu) {
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> bestModel = outputModel[0];
oneBests.push_back(bestModel);
// NOTE(review): inputLengths/ref_ids were already pushed above for this
// example; with history active they are pushed a second time here —
// confirm updateHistory() expects that.
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
} else {
decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
}
decoder->cleanup(chartDecoding);


cerr << endl;

// FEAR n-best (Bleu weight -1.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
decoder->cleanup(chartDecoding);



examples_in_batch++;
}
// --- kbest mode: decode one n-best list wrt model score, then select
// hope/fear pairs from it according to the chosen strategy ---
if (kbest) {

cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
if (historyBleu || simpleHistoryBleu) {
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> bestModel = outputModel[0];
oneBests.push_back(bestModel);
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
} else {
decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
}
decoder->cleanup(chartDecoding);


cerr << endl;

examples_in_batch++;

// Priority queues used by the max-Bleu-diff strategy below.
HypothesisQueue queueHope(hope_n);
HypothesisQueue queueFear(fear_n);
cerr << endl;
// Strategy 1: violated-constraint based pair selection.
if (most_violated || all_violated || one_against_all) {
float bleuHope = -1000;
float bleuFear = 1000;
// NOTE: indices initialised to (size_t)-1 as "not found" sentinels.
size_t indexHope = -1;
size_t indexFear = -1;

vector<float> bleuHopeList;
vector<float> bleuFearList;
vector<float> indexHopeList;
vector<float> indexFearList;

if (most_violated)
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
else if (all_violated)
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
else
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";

// Find the hope entry: highest Bleu, breaking (near-)ties by higher
// model score.
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) {
if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {

bleuHope = bleuScores[batchPosition][i];
indexHope = i;
}
}
} else if (bleuScores[batchPosition][i] > bleuHope) {
bleuHope = bleuScores[batchPosition][i];
indexHope = i;
}
}

// Scan for fear candidates: entries whose margin (model diff) is
// smaller than their Bleu loss, i.e. violated constraints.
float currentViolation = 0;
float minimum_bleu_diff = 0.01;
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
float bleuDiff = bleuHope - bleuScores[batchPosition][i];
float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
if (bleuDiff > epsilon) {
if (one_against_all && bleuDiff > minimum_bleu_diff) {
// one_against_all: pair hope with every sufficiently-worse entry.
cerr << ".. adding pair";
bleuHopeList.push_back(bleuHope);
bleuFearList.push_back(bleuScores[batchPosition][i]);
indexHopeList.push_back(indexHope);
indexFearList.push_back(i);
} else if (modelDiff < bleuDiff) {
float diff = bleuDiff - modelDiff;
if (diff > epsilon) {
if (all_violated) {
cerr << ".. adding pair";
bleuHopeList.push_back(bleuHope);
bleuFearList.push_back(bleuScores[batchPosition][i]);
indexHopeList.push_back(indexHope);
indexFearList.push_back(i);
} else if (most_violated && diff > currentViolation) {
// Track the single worst violation.
currentViolation = diff;
bleuFear = bleuScores[batchPosition][i];
indexFear = i;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
}
}
}
}
}

if (most_violated) {
if (currentViolation > 0) {
// Commit the most-violated hope/fear pair for this example.
cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
bleuScoresHope[batchPosition].push_back(bleuHope);
bleuScoresFear[batchPosition].push_back(bleuFear);
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
float modelScoreHope = modelScores[batchPosition][indexHope];
float modelScoreFear = modelScores[batchPosition][indexFear];
if (most_violated_reg) {
// Regularise: shrink the violation by moving both model scores
// a quarter of the violation towards each other.
float reg = currentViolation/4;
modelScoreHope += abs(reg);
modelScoreFear -= abs(reg);
float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
}
modelScoresHope[batchPosition].push_back(modelScoreHope);
modelScoresFear[batchPosition].push_back(modelScoreFear);

featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
} else {
// No violated constraint: nothing to learn from this example.
cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
skip_example = 1;
}
} else cerr << endl;
}
// Strategy 2: pick hope/fear by (model+)Bleu extremes via priority queues.
if (max_bleu_diff) {
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
float hopeScore = bleuScores[batchPosition][i];
if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
BleuIndexPair hope(hopeScore, i);
queueHope.Push(hope);

// Fear score is the negated Bleu (optionally plus model score).
float fearScore = -1*(bleuScores[batchPosition][i]);
if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
BleuIndexPair fear(fearScore, i);
queueFear.Push(fear);
}
skip_example = 0;
}
cerr << endl;

// Cross-product of the top hope_n hope and top fear_n fear entries.
vector<BleuIndexPair> hopeList, fearList;
for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
for (size_t i=0; i<hopeList.size(); ++i) {

size_t indexHope = hopeList[i].second;
float bleuHope = bleuScores[batchPosition][indexHope];
for (size_t j=0; j<fearList.size(); ++j) {

size_t indexFear = fearList[j].second;
float bleuFear = bleuScores[batchPosition][indexFear];
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
bleuScoresHope[batchPosition].push_back(bleuHope);
bleuScoresFear[batchPosition].push_back(bleuFear);
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
float modelScoreHope = modelScores[batchPosition][indexHope];
float modelScoreFear = modelScores[batchPosition][indexFear];

modelScoresHope[batchPosition].push_back(modelScoreHope);
modelScoresFear[batchPosition].push_back(modelScoreFear);

featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
}
}
if (!makePairs)
cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
}

// Advance to the next example in the shard.
++sid;
++actualBatchSize;
++shardPosition;
}
01144
// --- Run the optimiser on the collected batch (if non-empty) ---
if (examples_in_batch == 0 || (kbest && skip_example)) {
cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
} else {
// Losses = oracle Bleu minus each hypothesis' Bleu (model_hope_fear only).
vector<vector<float> > losses(actualBatchSize);
if (model_hope_fear) {

for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
}
}
}

// The Bleu score feature must not influence the update: zero its weight.
vector<FeatureFunction*>::const_iterator iter;
const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
mosesWeights.Assign(*iter, 0);
break;
}
}

// Optionally rescale the LM feature scores before the update.
if (scale_lm) {
cerr << "scale lm" << endl;
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
const StatefulFeatureFunction *ff = statefulFFs[i];
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);

if (lm) {

if (model_hope_fear) {
scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
} else {
scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
}
}
}
}

// Optionally rescale the word-penalty feature scores.
if (scale_wp) {

WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer();


if (model_hope_fear) {
scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
} else {
scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
}
}

// Diagnostic dump of the feature vectors entering the optimiser.
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
if (model_hope_fear) printFeatureValues(featureValues);
else {
cerr << "hope: " << endl;
printFeatureValues(featureValuesHope);
cerr << "fear: " << endl;
printFeatureValues(featureValuesFear);
}
}

// Apply either per-feature (confidence-based) or fixed learning rates.
if (feature_confidence) {
cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
if (model_hope_fear) {
applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
} else {
applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
}
} else {

cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
if (core_r0 != 1.0 || sparse_r0 != 1.0) {
if (model_hope_fear) {
applyLearningRates(featureValues, core_r0, sparse_r0);
} else {
applyLearningRates(featureValuesHope, core_r0, sparse_r0);
applyLearningRates(featureValuesFear, core_r0, sparse_r0);
}
}
}

// --- Dispatch to the appropriate optimiser variant ---
// update_status: 0 = weights updated, non-zero = no update performed.
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
size_t update_status = 1;
ScoreComponentCollection weightUpdate;
if (perceptron_update) {
vector<vector<float> > dummy1;
update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
} else if (hope_fear) {
// Only update when the oracle Bleu clears the configured minimum.
if (bleuScoresHope[0][0] >= min_oracle_bleu) {
// Single hope/fear pair without Hildreth: closed-form update.
if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
} else
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
} else
update_status = 1;
} else if (kbest) {
if (selective)
update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(
weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
else if (summed)
update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(
weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs);
else {
if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
modelScoresFear[0][0], learning_rate, rank, epoch);
} else {
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
}
}
} else {
// model_hope_fear: full MIRA update with explicit oracle/losses.
update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
}

// --- Apply the computed weight update, if any ---
if (update_status == 0) {

if (debug)
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;

if (feature_confidence) {
// Track per-feature update counts to adapt the learning rates.
confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);


featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
}

// Apply the update to the current weights.
mosesWeights.PlusEquals(weightUpdate);

if (normaliseWeights)
mosesWeights.L1Normalise();

// Accumulate for weight averaging; the binary vector counts, per
// feature, how often it carried a non-zero weight (sparse averaging).
cumulativeWeights.PlusEquals(mosesWeights);
if (sparseAverage) {
ScoreComponentCollection binary;
binary.SetToBinaryOf(mosesWeights);
cumulativeWeightsBinary.PlusEquals(binary);
}

++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
// Decode with the running average instead of the raw weights.
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
}

mosesWeights = averageWeights;
}

// Hand the (possibly averaged) weights back to the decoder.
decoder->setWeights(mosesWeights);

}

// Update the pseudo-document Bleu history with the model 1-bests.
if (historyBleu || simpleHistoryBleu) {
for (size_t i = 0; i < oneBests.size(); ++i)
cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
deleteTranslations(oneBests);
}
}
01342
01343
// --- Decide whether it is time to mix (and later dump) weights ---
// The schedule is expressed in shard positions: mix/dump every
// (shardSize / frequency) examples.
size_t generalShardSize;
if (trainWithMultipleFolds)
generalShardSize = order.size()/coresPerFold;
else
generalShardSize = order.size()/size;

size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);

// --- Mix (average) weights across MPI processes ---
if (mix) {
#ifdef MPI_ENABLE
cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
// Sum everyone's weights onto rank 0.
mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);

// For sparse averaging, also sum the per-feature non-zero indicators
// so each sparse weight is divided by the number of processes that
// actually had it.
ScoreComponentCollection totalBinary;
if (sparseAverage) {
ScoreComponentCollection binary;
binary.SetToBinaryOf(mosesWeights);
mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
}
if (rank == 0) {
// Turn the sum into an average (core-only, sparse-aware, or plain).
if (sparseNoAverage)
mixedWeights.CoreDivideEquals(size);
else if (sparseAverage)
mixedWeights.DivideEquals(totalBinary);
else
mixedWeights.DivideEquals(size);




if (normaliseWeights) {
mixedWeights.L1Normalise();
}

++weightMixingThisEpoch;

// Optional pruning/regularisation, performed on rank 0 only.
if (pruneZeroWeights) {
size_t pruned = mixedWeights.PruneZeroWeightFeatures();
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " zero-weighted features pruned from mixedWeights." << endl;

pruned = cumulativeWeights.PruneZeroWeightFeatures();
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
}

// Feature cutoff applies only at the last mixing of the epoch.
if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " features pruned from mixedWeights." << endl;

pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " features pruned from cumulativeWeights." << endl;
}

// L1/L2 regularisation at the last mixing (or every mixing if asked).
if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
if (l1_regularize) {
size_t pruned;
if (l1_reg_sparse)
pruned = mixedWeights.SparseL1Regularize(l1_lambda);
else
pruned = mixedWeights.L1Regularize(l1_lambda);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
}
if (l2_regularize) {
if (l2_reg_sparse)
mixedWeights.SparseL2Regularize(l2_lambda);
else
mixedWeights.L2Regularize(l2_lambda);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
}
}
}

// Distribute the mixed weights back to every process and decode with them.
mpi::broadcast(world, mixedWeights, 0);
decoder->setWeights(mixedWeights);
mosesWeights = mixedWeights;




#endif
#ifndef MPI_ENABLE
// Single-process build: "mixing" is just a copy.
mixedWeights = mosesWeights;
#endif
}
01443
01444
01445 if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
01446
01447
01448 ostringstream filename;
01449 if (epoch < 10)
01450 filename << weightDumpStem << "_mixed_0" << epoch;
01451 else
01452 filename << weightDumpStem << "_mixed_" << epoch;
01453
01454 if (weightDumpFrequency > 1)
01455 filename << "_" << weightEpochDump;
01456
01457 mixedWeights.Save(filename.str());
01458 cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01459 }
01460 if (dumpMixedWeights) {
01461 if (mix && rank == 0 && !weightDumpStem.empty()) {
01462
01463 ostringstream filename;
01464 if (epoch < 10)
01465 filename << weightDumpStem << "_0" << epoch;
01466 else
01467 filename << weightDumpStem << "_" << epoch;
01468
01469 if (weightDumpFrequency > 1)
01470 filename << "_" << weightEpochDump;
01471
01472 cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01473 mixedWeights.Save(filename.str());
01474 ++weightEpochDump;
01475 }
01476 } else {
01477 if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
01478 cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
01479 ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
01480 bool proceed = false;
01481 if (accumulateWeights) {
01482 if (numberOfUpdates > 0) {
01483 tmpAverageWeights.DivideEquals(numberOfUpdates);
01484 proceed = true;
01485 }
01486 } else {
01487 if (numberOfUpdatesThisEpoch > 0) {
01488 if (sparseNoAverage)
01489 tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
01490 else if (sparseAverage)
01491 tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
01492 else
01493 tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
01494 proceed = true;
01495 }
01496 }
01497
01498 if (proceed) {
01499 #ifdef MPI_ENABLE
01500
01501 mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
01502 ScoreComponentCollection totalBinary;
01503 if (sparseAverage) {
01504 ScoreComponentCollection binary;
01505 binary.SetToBinaryOf(mosesWeights);
01506 mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
01507 }
01508 #endif
01509 #ifndef MPI_ENABLE
01510 mixedAverageWeights = tmpAverageWeights;
01511
01512 ScoreComponentCollection totalBinary;
01513 #endif
01514 if (rank == 0 && !weightDumpStem.empty()) {
01515
01516 if (sparseNoAverage)
01517 mixedAverageWeights.CoreDivideEquals(size);
01518 else if (sparseAverage)
01519 mixedAverageWeights.DivideEquals(totalBinary);
01520 else
01521 mixedAverageWeights.DivideEquals(size);
01522
01523
01524 if (normaliseWeights) {
01525 mixedAverageWeights.L1Normalise();
01526 }
01527
01528
01529 ostringstream filename;
01530 if (epoch < 10) {
01531 filename << weightDumpStem << "_0" << epoch;
01532 } else {
01533 filename << weightDumpStem << "_" << epoch;
01534 }
01535
01536 if (weightDumpFrequency > 1) {
01537 filename << "_" << weightEpochDump;
01538 }
01539
01540
01541
01542
01543
01544
01545
01546 cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01547 mixedAverageWeights.Save(filename.str());
01548 ++weightEpochDump;
01549
01550 if (weightEpochDump == weightDumpFrequency) {
01551 if (l1_regularize) {
01552 size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
01553 cerr << "Rank " << rank << ", epoch " << epoch << ", "
01554 << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
01555
01556 }
01557 if (l2_regularize) {
01558 mixedAverageWeights.SparseL2Regularize(l2_lambda);
01559 cerr << "Rank " << rank << ", epoch " << epoch << ", "
01560 << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
01561 }
01562
01563 if (l1_regularize || l2_regularize) {
01564 filename << "_reg";
01565 cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01566 mixedAverageWeights.Save(filename.str());
01567 }
01568 }
01569
01570 if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
01571
01572 stringstream s1, s2;
01573 s1 << "sparse_feature_hope_counts" << "_" << epoch;
01574 s2 << "sparse_feature_fear_counts" << "_" << epoch;
01575 ofstream sparseFeatureCountsHope(s1.str().c_str());
01576 ofstream sparseFeatureCountsFear(s2.str().c_str());
01577
01578 mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
01579 mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
01580 sparseFeatureCountsHope.close();
01581 sparseFeatureCountsFear.close();
01582 }
01583 }
01584 }
01585 }
01586 }
01587 }
01588 cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;
01589
01590 if (historyBleu || simpleHistoryBleu) {
01591 cerr << "Bleu feature history after epoch " << epoch << endl;
01592 decoder->printBleuFeatureHistory(cerr);
01593 }
01594
01595
01596
01597 size_t sumUpdates;
01598 size_t *sendbuf_uint, *recvbuf_uint;
01599 sendbuf_uint = (size_t *) malloc(sizeof(size_t));
01600 recvbuf_uint = (size_t *) malloc(sizeof(size_t));
01601 #ifdef MPI_ENABLE
01602 sendbuf_uint[0] = numberOfUpdatesThisEpoch;
01603 recvbuf_uint[0] = 0;
01604 MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
01605 sumUpdates = recvbuf_uint[0];
01606 #endif
01607 #ifndef MPI_ENABLE
01608 sumUpdates = numberOfUpdatesThisEpoch;
01609 #endif
01610 if (rank == 0 && sumUpdates == 0) {
01611 cerr << "\nNo weight updates during this epoch.. stopping." << endl;
01612 stop = true;
01613 #ifdef MPI_ENABLE
01614 mpi::broadcast(world, stop, 0);
01615 #endif
01616 }
01617
01618 if (!stop) {
01619
01620 if (weightConvergence) {
01621 bool reached = true;
01622 if (rank == 0 && (epoch >= 2)) {
01623 ScoreComponentCollection firstDiff, secondDiff;
01624 if (dumpMixedWeights) {
01625 firstDiff = mixedWeights;
01626 firstDiff.MinusEquals(mixedWeightsPrevious);
01627 secondDiff = mixedWeights;
01628 secondDiff.MinusEquals(mixedWeightsBeforePrevious);
01629 } else {
01630 firstDiff = mixedAverageWeights;
01631 firstDiff.MinusEquals(mixedAverageWeightsPrevious);
01632 secondDiff = mixedAverageWeights;
01633 secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
01634 }
01635 VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
01636 VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
01637
01638
01639
01640 if (firstDiff.GetLInfNorm() >= min_weight_change)
01641 reached = false;
01642 if (secondDiff.GetLInfNorm() >= min_weight_change)
01643 reached = false;
01644 if (reached) {
01645
01646 stop = true;
01647 cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
01648 ScoreComponentCollection dummy;
01649 ostringstream endfilename;
01650 endfilename << "stopping";
01651 dummy.Save(endfilename.str());
01652 }
01653 }
01654
01655 mixedWeightsBeforePrevious = mixedWeightsPrevious;
01656 mixedWeightsPrevious = mixedWeights;
01657 mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
01658 mixedAverageWeightsPrevious = mixedAverageWeights;
01659 #ifdef MPI_ENABLE
01660 mpi::broadcast(world, stop, 0);
01661 #endif
01662 }
01663 }
01664 }
01665
01666 #ifdef MPI_ENABLE
01667 MPI_Finalize();
01668 #endif
01669
01670 time(&now);
01671 cerr << "Rank " << rank << ", " << ctime(&now);
01672
01673 if (rank == 0) {
01674 ScoreComponentCollection dummy;
01675 ostringstream endfilename;
01676 endfilename << "finished";
01677 dummy.Save(endfilename.str());
01678 }
01679
01680 delete decoder;
01681 exit(0);
01682 }
01683
// Read one sentence per line from 'filename' into 'sentences'.
// Returns false if the file cannot be opened, true otherwise.
bool loadSentences(const string& filename, vector<string>& sentences)
{
  ifstream input(filename.c_str());
  if (!input)
    return false;
  for (string line; getline(input, line); )
    sentences.push_back(line);
  return true;
}
01694
// Decide whether to mix/dump at the current shard position.
// A base of 0 disables the check entirely. With a batch size > 1 the exact
// multiple of the base may fall on any position covered by the batch just
// processed, so every position in
// [shard_position - actual_batch_size + 1, shard_position] is tested.
// (Positions never underflow: position 0 always satisfies the modulo test.)
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0) return false;  // was 'return 0' in a bool function
  if (actual_batch_size > 1) {
    size_t numberSubtracts = actual_batch_size;
    do {
      if (shard_position % mix_or_dump_base == 0)
        return true;
      --shard_position;
      --numberSubtracts;
    } while (numberSubtracts > 0);
    return false;
  }
  return (shard_position % mix_or_dump_base) == 0;
}
01714
01715 void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
01716 {
01717 for (size_t i = 0; i < featureValues.size(); ++i) {
01718 for (size_t j = 0; j < featureValues[i].size(); ++j) {
01719 cerr << featureValues[i][j] << endl;
01720 }
01721 }
01722 cerr << endl;
01723 }
01724
01725 void deleteTranslations(vector<vector<const Word*> > &translations)
01726 {
01727 for (size_t i = 0; i < translations.size(); ++i) {
01728 for (size_t j = 0; j < translations[i].size(); ++j) {
01729 delete translations[i][j];
01730 }
01731 }
01732 }
01733
01734 void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
01735 {
01736 if (decode == 1)
01737 cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
01738 else if (decode == 2)
01739 cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
01740 else
01741 cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;
01742
01743
01744 vector<size_t> order;
01745 for (size_t i = 0; i < inputSentences.size(); ++i)
01746 order.push_back(i);
01747
01748 vector<size_t> shard;
01749 float shardSize = (float) (order.size()) / size;
01750 size_t shardStart = (size_t) (shardSize * rank);
01751 size_t shardEnd = (size_t) (shardSize * (rank + 1));
01752 if (rank == size - 1) {
01753 shardEnd = inputSentences.size();
01754 shardSize = shardEnd - shardStart;
01755 }
01756 VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
01757 VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
01758 shard.resize(shardSize);
01759 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
01760
01761
01762 stringstream fname;
01763 fname << filename << ".rank" << rank;
01764 filename = fname.str();
01765 ostringstream filename_nbest;
01766 filename_nbest << filename << "." << n << "best";
01767 ofstream out(filename.c_str());
01768 ofstream nbest_out((filename_nbest.str()).c_str());
01769 if (!out) {
01770 ostringstream msg;
01771 msg << "Unable to open " << fname.str();
01772 throw runtime_error(msg.str());
01773 }
01774 if (!nbest_out) {
01775 ostringstream msg;
01776 msg << "Unable to open " << filename_nbest;
01777 throw runtime_error(msg.str());
01778 }
01779
01780 for (size_t i = 0; i < shard.size(); ++i) {
01781 size_t sid = shard[i];
01782 string& input = inputSentences[sid];
01783
01784 vector<vector<ScoreComponentCollection> > dummyFeatureValues;
01785 vector<vector<float> > dummyBleuScores;
01786 vector<vector<float> > dummyModelScores;
01787
01788 vector<ScoreComponentCollection> newFeatureValues;
01789 vector<float> newScores;
01790 dummyFeatureValues.push_back(newFeatureValues);
01791 dummyBleuScores.push_back(newScores);
01792 dummyModelScores.push_back(newScores);
01793
01794 float factor = 0.0;
01795 if (decode == 1) factor = 1.0;
01796 if (decode == 2) factor = -1.0;
01797 cerr << "Rank " << rank << ", translating sentence " << sid << endl;
01798 bool realBleu = false;
01799 vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
01800 dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
01801 cerr << endl;
01802 decoder->cleanup(StaticData::Instance().IsChart());
01803
01804 for (size_t i = 0; i < nbestOutput.size(); ++i) {
01805 vector<const Word*> output = nbestOutput[i];
01806 stringstream translation;
01807 for (size_t k = 0; k < output.size(); ++k) {
01808 Word* w = const_cast<Word*>(output[k]);
01809 translation << w->GetString(0);
01810 translation << " ";
01811 }
01812
01813 if (i == 0)
01814 out << translation.str() << endl;
01815 nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
01816 " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
01817 }
01818 }
01819
01820 out.close();
01821 nbest_out.close();
01822 cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;
01823
01824 #ifdef MPI_ENABLE
01825 MPI_Finalize();
01826 #endif
01827
01828 time_t now;
01829 time(&now);
01830 cerr << "Rank " << rank << ", " << ctime(&now);
01831
01832 delete decoder;
01833 exit(0);
01834 }
01835
01836 void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
01837 {
01838 for (size_t i=0; i<featureValues.size(); ++i)
01839 for (size_t j=0; j<featureValues[i].size(); ++j)
01840 featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
01841 }
01842
01843 void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
01844 {
01845 for (size_t i=0; i<featureValues.size(); ++i)
01846 for (size_t j=0; j<featureValues[i].size(); ++j)
01847 featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
01848 }
01849
01850 void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
01851 {
01852 string name = sp->GetScoreProducerDescription();
01853
01854
01855 float featureScore;
01856 for (size_t i=0; i<featureValues.size(); ++i) {
01857 for (size_t j=0; j<featureValues[i].size(); ++j) {
01858 featureScore = featureValues[i][j].GetScoreForProducer(sp);
01859 featureValues[i][j].Assign(sp, featureScore*scaling_factor);
01860
01861 }
01862 }
01863 }
01864
01865 void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
01866 {
01867 string name = sp->GetScoreProducerDescription();
01868
01869
01870 for (size_t i=0; i<featureValues.size(); ++i) {
01871 for (size_t j=0; j<featureValues[i].size(); ++j) {
01872 vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
01873 for (size_t k=0; k<featureScores.size(); ++k)
01874 featureScores[k] *= scaling_factor;
01875 featureValues[i][j].Assign(sp, featureScores);
01876
01877 }
01878 }
01879 }