00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <algorithm>
00021 #include <cstdlib>
00022 #include <ctime>
00023 #include <string>
00024 #include <vector>
00025 #include <map>
00026
00027 #include <boost/program_options.hpp>
00028 #include <boost/algorithm/string.hpp>
00029
00030 #ifdef MPI_ENABLE
00031 #include <boost/mpi.hpp>
00032 namespace mpi = boost::mpi;
00033 #endif
00034
00035 #include "Main.h"
00036 #include "Optimiser.h"
00037 #include "Hildreth.h"
00038 #include "HypothesisQueue.h"
00039 #include "moses/StaticData.h"
00040 #include "moses/ChartTrellisPathList.h"
00041 #include "moses/ChartTrellisPath.h"
00042 #include "moses/ScoreComponentCollection.h"
00043 #include "moses/ThreadPool.h"
00044 #include "moses/LexicalReordering.h"
00045 #include "mert/BleuScorer.h"
00046 #include "moses/FeatureVector.h"
00047
00048 #include "moses/FF/WordTranslationFeature.h"
00049 #include "moses/FF/PhrasePairFeature.h"
00050 #include "moses/FF/WordPenaltyProducer.h"
00051
00052 using namespace Mira;
00053 using namespace std;
00054 using namespace Moses;
00055 namespace po = boost::program_options;
00056
00057 int main(int argc, char** argv)
00058 {
00059 size_t rank = 0;
00060 size_t size = 1;
00061 #ifdef MPI_ENABLE
00062 mpi::environment env(argc,argv);
00063 mpi::communicator world;
00064 rank = world.rank();
00065 size = world.size();
00066 #endif
00067
00068 bool help;
00069 int verbosity;
00070 string mosesConfigFile;
00071 string inputFile;
00072 vector<string> referenceFiles;
00073 vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
00074
00075 size_t epochs;
00076 string learner;
00077 bool shuffle;
00078 size_t mixingFrequency;
00079 size_t weightDumpFrequency;
00080 string weightDumpStem;
00081 bool scale_margin, scale_margin_precision;
00082 bool scale_update, scale_update_precision;
00083 size_t n;
00084 size_t batchSize;
00085 bool distinctNbest;
00086 bool accumulateWeights;
00087 float historySmoothing;
00088 bool scaleByInputLength, scaleByAvgInputLength;
00089 bool scaleByInverseLength, scaleByAvgInverseLength;
00090 float scaleByX;
00091 float slack;
00092 bool averageWeights;
00093 bool weightConvergence;
00094 float learning_rate;
00095 float mira_learning_rate;
00096 float perceptron_learning_rate;
00097 string decoder_settings;
00098 float min_weight_change;
00099 bool normaliseWeights, normaliseMargin;
00100 bool print_feature_values;
00101 bool historyBleu ;
00102 bool sentenceBleu;
00103 bool perceptron_update;
00104 bool hope_fear;
00105 bool model_hope_fear;
00106 int hope_n, fear_n;
00107 size_t bleu_smoothing_scheme;
00108 float min_oracle_bleu;
00109 float minBleuRatio, maxBleuRatio;
00110 bool boost;
00111 bool decode_hope, decode_fear, decode_model;
00112 string decode_filename;
00113 bool batchEqualsShard;
00114 bool sparseAverage, dumpMixedWeights, sparseNoAverage;
00115 int featureCutoff;
00116 bool pruneZeroWeights;
00117 bool printFeatureCounts, printNbestWithFeatures;
00118 bool avgRefLength;
00119 bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
00120 float scale_lm_factor, scale_wp_factor;
00121 bool kbest;
00122 string moses_src;
00123 float sigmoidParam;
00124 float bleuWeight, bleuWeight_hope, bleuWeight_fear;
00125 bool bleu_weight_lm, bleu_weight_lm_adjust;
00126 float bleu_weight_lm_factor;
00127 bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
00128 float l1_lambda, l2_lambda;
00129 bool most_violated, most_violated_reg, all_violated, max_bleu_diff, one_against_all;
00130 bool feature_confidence, signed_counts;
00131 float decay_core, decay_sparse, core_r0, sparse_r0;
00132 bool selective, summed;
00133 float bleu_weight_fear_factor;
00134 bool hildreth;
00135 float add2lm;
00136 bool realBleu, disableBleuFeature;
00137 bool rescaleSlack;
00138 bool makePairs;
00139 bool debug;
00140 bool reg_on_every_mix;
00141 size_t continue_epoch;
00142 bool modelPlusBleu, simpleHistoryBleu;
00143 po::options_description desc("Allowed options");
00144 desc.add_options()
00145 ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
00146 ("freq-reg", po::value<bool>(®_on_every_mix)->default_value(false), "Regularize after every weight mixing")
00147 ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
00148 ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
00149 ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
00150 ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
00151 ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
00152 ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
00153 ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
00154 ("disable-bleu-feature", po::value<bool>(&disableBleuFeature)->default_value(false), "Disable the Bleu feature")
00155 ("real-bleu", po::value<bool>(&realBleu)->default_value(false), "Compute real sentence Bleu on complete translations")
00156 ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
00157 ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
00158 ("selective", po::value<bool>(&selective)->default_value(false), "Build constraints for every feature")
00159 ("summed", po::value<bool>(&summed)->default_value(false), "Sum up all constraints")
00160 ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
00161 ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
00162
00163 ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
00164 ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
00165 ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
00166
00167 ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
00168 ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
00169
00170 ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(false), "Make bleu weight depend on lm weight")
00171 ("adjust-bw", po::value<bool>(&bleu_weight_lm_adjust)->default_value(false), "Adjust bleu weight when lm weight changes")
00172 ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
00173 ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
00174 ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
00175 ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
00176 ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
00177 ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
00178 ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
00179 ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
00180 ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
00181 ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
00182 ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
00183 ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
00184 ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
00185 ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
00186 ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
00187 ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
00188 ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
00189 ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
00190 ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
00191 ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
00192 ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
00193 ("fear-n", po::value<int>(&fear_n)->default_value(1), "Number of fear translations used")
00194 ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
00195 ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
00196 ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
00197 ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
00198 ("hope-n", po::value<int>(&hope_n)->default_value(2), "Number of hope translations used")
00199 ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
00200 ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
00201 ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
00202 ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
00203 ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
00204 ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
00205 ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
00206 ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
00207 ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
00208 ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
00209 ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
00210 ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
00211 ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
00212 ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(1), "How often per epoch to mix weights, when using mpi")
00213 ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
00214 ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
00215 ("nbest,n", po::value<size_t>(&n)->default_value(1), "Number of translations in n-best list")
00216 ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
00217 ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
00218 ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
00219 ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
00220 ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
00221 ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
00222 ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
00223 ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
00224 ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
00225 ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
00226 ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
00227 ("kbest", po::value<bool>(&kbest)->default_value(false), "Select hope/fear pairs from a list of nbest translations")
00228
00229 ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
00230 ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(false), "Scale BLEU by (history of) input length")
00231 ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
00232 ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
00233 ("scale-by-x", po::value<float>(&scaleByX)->default_value(1), "Scale the BLEU score by value x")
00234 ("scale-lm", po::value<bool>(&scale_lm)->default_value(false), "Scale the language model feature")
00235 ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(2), "Scale the language model feature by this factor")
00236 ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
00237 ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
00238 ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
00239 ("scale-margin-precision", po::value<bool>(&scale_margin_precision)->default_value(0), "Scale margin by precision of oracle")
00240 ("scale-update", po::value<bool>(&scale_update)->default_value(0), "Scale update by Bleu score of oracle")
00241 ("scale-update-precision", po::value<bool>(&scale_update_precision)->default_value(0), "Scale update by precision of oracle")
00242 ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
00243 ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
00244 ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
00245 ("slack", po::value<float>(&slack)->default_value(0.01), "Use slack in optimiser")
00246 ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
00247 ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
00248 ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
00249 ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
00250 ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(1), "How often per epoch to dump weights (mpi)")
00251 ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
00252
00253 po::options_description cmdline_options;
00254 cmdline_options.add(desc);
00255 po::variables_map vm;
00256 po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
00257 po::notify(vm);
00258
00259 if (help) {
00260 std::cout << "Usage: " + string(argv[0])
00261 + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
00262 std::cout << desc << std::endl;
00263 return 0;
00264 }
00265
00266 const StaticData &staticData = StaticData::Instance();
00267
00268 bool trainWithMultipleFolds = false;
00269 if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
00270 if (rank == 0)
00271 cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
00272 trainWithMultipleFolds = true;
00273 }
00274
00275 if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
00276 cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
00277 exit(1);
00278 }
00279
00280 if ((sparseAverage || sparseNoAverage) && averageWeights) {
00281 cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
00282 exit(1);
00283 }
00284
00285 if (trainWithMultipleFolds) {
00286 if (!mosesConfigFilesFolds.size()) {
00287 cerr << "Error: No moses ini files specified for training with folds" << endl;
00288 exit(1);
00289 }
00290
00291 if (!inputFilesFolds.size()) {
00292 cerr << "Error: No input files specified for training with folds" << endl;
00293 exit(1);
00294 }
00295
00296 if (!referenceFilesFolds.size()) {
00297 cerr << "Error: No reference files specified for training with folds" << endl;
00298 exit(1);
00299 }
00300 } else {
00301 if (mosesConfigFile.empty()) {
00302 cerr << "Error: No moses ini file specified" << endl;
00303 return 1;
00304 }
00305
00306 if (inputFile.empty()) {
00307 cerr << "Error: No input file specified" << endl;
00308 return 1;
00309 }
00310
00311 if (!referenceFiles.size()) {
00312 cerr << "Error: No reference files specified" << endl;
00313 return 1;
00314 }
00315 }
00316
00317
00318 vector<string> inputSentences;
00319 size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
00320 size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
00321 vector<vector<string> > inputSentencesFolds(inputSize);
00322 vector<vector<string> > referenceSentences(refSize);
00323
00324
00325 size_t coresPerFold = 0, myFold = 0;
00326 if (trainWithMultipleFolds) {
00327 if (mosesConfigFilesFolds.size() > size) {
00328 cerr << "Number of cores has to be a multiple of the number of folds" << endl;
00329 exit(1);
00330 }
00331 coresPerFold = size/mosesConfigFilesFolds.size();
00332 if (size % coresPerFold > 0) {
00333 cerr << "Number of cores has to be a multiple of the number of folds" << endl;
00334 exit(1);
00335 }
00336
00337 if (rank == 0)
00338 cerr << "Number of cores per fold: " << coresPerFold << endl;
00339 myFold = rank/coresPerFold;
00340 cerr << "Rank " << rank << ", my fold: " << myFold << endl;
00341 }
00342
00343
00344 if (trainWithMultipleFolds) {
00345 if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
00346 cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
00347 exit(1);
00348 }
00349 VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);
00350
00351 if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
00352 cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
00353 exit(1);
00354 }
00355 if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
00356 cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
00357 << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
00358 exit(1);
00359 }
00360 VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
00361 } else {
00362 if (!loadSentences(inputFile, inputSentences)) {
00363 cerr << "Error: Failed to load input sentences from " << inputFile << endl;
00364 return 1;
00365 }
00366
00367 for (size_t i = 0; i < referenceFiles.size(); ++i) {
00368 if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
00369 cerr << "Error: Failed to load reference sentences from "
00370 << referenceFiles[i] << endl;
00371 return 1;
00372 }
00373 if (referenceSentences[i].size() != inputSentences.size()) {
00374 cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
00375 << referenceSentences[i].size() << ") length of reference file " << i
00376 << endl;
00377 return 1;
00378 }
00379 }
00380 }
00381
00382 if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
00383 scaleByInputLength = false;
00384
00385 if (historyBleu || simpleHistoryBleu) {
00386 sentenceBleu = false;
00387 cerr << "Using history Bleu. " << endl;
00388 }
00389
00390 if (kbest) {
00391 realBleu = true;
00392 disableBleuFeature = true;
00393 cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
00394 }
00395
00396
00397
00398 boost::trim(decoder_settings);
00399 decoder_settings += " -mira -distinct-nbest -references";
00400 if (trainWithMultipleFolds) {
00401 decoder_settings += " ";
00402 decoder_settings += referenceFilesFolds[myFold];
00403 } else {
00404 for (size_t i=0; i < referenceFiles.size(); ++i) {
00405 decoder_settings += " ";
00406 decoder_settings += referenceFiles[i];
00407 }
00408 }
00409
00410 vector<string> decoder_params;
00411 boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
00412
00413 string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
00414 VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
00415 MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
00416 decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
00417 scaleByInverseLength, scaleByAvgInverseLength,
00418 scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
00419 bool chartDecoding = staticData.IsChart();
00420
00421
00422 vector<size_t> order;
00423 if (trainWithMultipleFolds) {
00424 for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
00425 order.push_back(i);
00426 }
00427 } else {
00428 if (rank == 0) {
00429 for (size_t i = 0; i < inputSentences.size(); ++i) {
00430 order.push_back(i);
00431 }
00432 }
00433 }
00434
00435
00436 Optimiser* optimiser = NULL;
00437 if (learner == "mira") {
00438 if (rank == 0) {
00439 cerr << "Optimising using Mira" << endl;
00440 cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
00441 cerr << "selective: " << selective << endl;
00442 if (normaliseMargin)
00443 cerr << "sigmoid parameter: " << sigmoidParam << endl;
00444 }
00445 optimiser = new MiraOptimiser(slack, scale_margin, scale_margin_precision,
00446 scale_update, scale_update_precision, boost, normaliseMargin, sigmoidParam);
00447 learning_rate = mira_learning_rate;
00448 perceptron_update = false;
00449 } else if (learner == "perceptron") {
00450 if (rank == 0) {
00451 cerr << "Optimising using Perceptron" << endl;
00452 }
00453 optimiser = new Perceptron();
00454 learning_rate = perceptron_learning_rate;
00455 perceptron_update = true;
00456 model_hope_fear = false;
00457 hope_fear = false;
00458 n = 1;
00459 hope_n = 1;
00460 fear_n = 1;
00461 } else {
00462 cerr << "Error: Unknown optimiser: " << learner << endl;
00463 return 1;
00464 }
00465
00466
00467 if (batchSize > 1 && perceptron_update) {
00468 batchSize = 1;
00469 cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
00470 }
00471
00472 if (hope_n == -1)
00473 hope_n = n;
00474 if (fear_n == -1)
00475 fear_n = n;
00476
00477 if (model_hope_fear || kbest)
00478 hope_fear = false;
00479 if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
00480 cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
00481 return 1;
00482 }
00483
00484 #ifdef MPI_ENABLE
00485 if (!trainWithMultipleFolds)
00486 mpi::broadcast(world, order, 0);
00487 #endif
00488
00489
00490 vector<size_t> shard;
00491 if (trainWithMultipleFolds) {
00492 size_t shardSize = order.size()/coresPerFold;
00493 size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
00494 size_t shardEnd = shardStart + shardSize;
00495 if (rank % coresPerFold == coresPerFold - 1) {
00496 shardEnd = order.size();
00497 shardSize = shardEnd - shardStart;
00498 }
00499 VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
00500 VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
00501 shard.resize(shardSize);
00502 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00503 batchSize = 1;
00504 } else {
00505 size_t shardSize = order.size() / size;
00506 size_t shardStart = (size_t) (shardSize * rank);
00507 size_t shardEnd = (size_t) (shardSize * (rank + 1));
00508 if (rank == size - 1) {
00509 shardEnd = order.size();
00510 shardSize = shardEnd - shardStart;
00511 }
00512 VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
00513 VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
00514 shard.resize(shardSize);
00515 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00516 if (batchEqualsShard)
00517 batchSize = shardSize;
00518 }
00519
00520
00521 const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
00522 ScoreComponentCollection initialWeights = decoder->getWeights();
00523
00524 if (add2lm != 0) {
00525 const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00526 for (size_t i = 0; i < statefulFFs.size(); ++i) {
00527 const StatefulFeatureFunction *ff = statefulFFs[i];
00528 const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
00529
00530 if (lm) {
00531 float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
00532 initialWeights.Assign(lm, lmWeight);
00533 cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
00534 }
00535 }
00536 }
00537
00538 if (normaliseWeights) {
00539 initialWeights.L1Normalise();
00540 cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
00541 }
00542
00543 decoder->setWeights(initialWeights);
00544
00545
00546 if (bleu_weight_lm) {
00547 float lmSum = 0;
00548 const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00549 for (size_t i = 0; i < statefulFFs.size(); ++i) {
00550 const StatefulFeatureFunction *ff = statefulFFs[i];
00551 const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
00552
00553 if (lm) {
00554 lmSum += abs(initialWeights.GetScoreForProducer(lm));
00555 }
00556 }
00557
00558 bleuWeight = lmSum * bleu_weight_lm_factor;
00559 cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
00560 }
00561
00562 if (bleuWeight_hope == -1) {
00563 bleuWeight_hope = bleuWeight;
00564 }
00565 if (bleuWeight_fear == -1) {
00566 bleuWeight_fear = bleuWeight;
00567 }
00568 bleuWeight_fear *= bleu_weight_fear_factor;
00569 cerr << "Bleu weight: " << bleuWeight << endl;
00570 cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
00571
00572 if (decode_hope || decode_fear || decode_model) {
00573 size_t decode = 1;
00574 if (decode_fear) decode = 2;
00575 if (decode_model) decode = 3;
00576 decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
00577 }
00578
00579
00580 ScoreComponentCollection cumulativeWeights;
00581 ScoreComponentCollection cumulativeWeightsBinary;
00582 size_t numberOfUpdates = 0;
00583 size_t numberOfUpdatesThisEpoch = 0;
00584
00585 time_t now;
00586 time(&now);
00587 cerr << "Rank " << rank << ", " << ctime(&now);
00588
00589 float avgInputLength = 0;
00590 float sumOfInputs = 0;
00591 size_t numberOfInputs = 0;
00592
00593 ScoreComponentCollection mixedWeights;
00594 ScoreComponentCollection mixedWeightsPrevious;
00595 ScoreComponentCollection mixedWeightsBeforePrevious;
00596 ScoreComponentCollection mixedAverageWeights;
00597 ScoreComponentCollection mixedAverageWeightsPrevious;
00598 ScoreComponentCollection mixedAverageWeightsBeforePrevious;
00599
00600 bool stop = false;
00601
00602 float epsilon = 0.0001;
00603
00604
00605 ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
00606 featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
00607 cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
00608
00609 for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
00610 if (shuffle) {
00611 if (trainWithMultipleFolds || rank == 0) {
00612 cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
00613 RandomIndex rindex;
00614 random_shuffle(order.begin(), order.end(), rindex);
00615 }
00616
00617 #ifdef MPI_ENABLE
00618 if (!trainWithMultipleFolds)
00619 mpi::broadcast(world, order, 0);
00620 #endif
00621
00622
00623 if (trainWithMultipleFolds) {
00624 size_t shardSize = order.size()/coresPerFold;
00625 size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
00626 size_t shardEnd = shardStart + shardSize;
00627 if (rank % coresPerFold == coresPerFold - 1) {
00628 shardEnd = order.size();
00629 shardSize = shardEnd - shardStart;
00630 }
00631 VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
00632 VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
00633 shard.resize(shardSize);
00634 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00635 batchSize = 1;
00636 } else {
00637 size_t shardSize = order.size()/size;
00638 size_t shardStart = (size_t) (shardSize * rank);
00639 size_t shardEnd = (size_t) (shardSize * (rank + 1));
00640 if (rank == size - 1) {
00641 shardEnd = order.size();
00642 shardSize = shardEnd - shardStart;
00643 }
00644 VERBOSE(1, "Shard size: " << shardSize << endl);
00645 VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
00646 shard.resize(shardSize);
00647 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
00648 if (batchEqualsShard)
00649 batchSize = shardSize;
00650 }
00651 }
00652
00653
00654
00655
00656 numberOfUpdatesThisEpoch = 0;
00657
00658 if (!accumulateWeights) {
00659 cumulativeWeights.ZeroAll();
00660 cumulativeWeightsBinary.ZeroAll();
00661 }
00662
00663
00664 size_t weightMixingThisEpoch = 0;
00665 size_t weightEpochDump = 0;
00666
00667 size_t shardPosition = 0;
00668 vector<size_t>::const_iterator sid = shard.begin();
while (sid != shard.end()) {

// Per-batch n-best containers: outer index = batch position,
// inner vector = entries of that sentence's n-best list.
vector<vector<ScoreComponentCollection> > featureValues;
vector<vector<float> > bleuScores;
vector<vector<float> > modelScores;

// Separate hope/fear containers for the hope-fear style updates.
vector<vector<ScoreComponentCollection> > featureValuesHope;
vector<vector<ScoreComponentCollection> > featureValuesFear;
vector<vector<float> > bleuScoresHope;
vector<vector<float> > bleuScoresFear;
vector<vector<float> > modelScoresHope;
vector<vector<float> > modelScoresFear;

// Current decoder weights at the start of this batch.
ScoreComponentCollection mosesWeights = decoder->getWeights();
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);

if (historyBleu || simpleHistoryBleu) {
decoder->printBleuFeatureHistory(cerr);
}

// Oracle (hope) statistics and data needed to update the Bleu history.
vector<float> oracleBleuScores;
vector<float> oracleModelScores;
vector<vector<const Word*> > oneBests;
vector<ScoreComponentCollection> oracleFeatureValues;
vector<size_t> inputLengths;
vector<size_t> ref_ids;
size_t actualBatchSize = 0;

vector<size_t>::const_iterator current_sid_start = sid;
size_t examples_in_batch = 0;
bool skip_example = false;
// Collect up to batchSize examples (or until the shard is exhausted).
for (size_t batchPosition = 0; batchPosition < batchSize && sid
!= shard.end(); ++batchPosition) {
string input;
if (trainWithMultipleFolds)
input = inputSentencesFolds[myFold][*sid];
else
input = inputSentences[*sid];

// Parse the raw input line into a Moses Sentence.
// NOTE(review): 'sentence' is allocated with new and no matching delete
// is visible in this scope — possible per-example leak; verify ownership.
Moses::Sentence *sentence = new Sentence();
stringstream in(input + "\n");
const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
sentence->Read(in,inputFactorOrder);
cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
sentence->Print(cerr);
cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
size_t current_input_length = (*sentence).GetSize();

// During the first epoch, maintain a running average input length used
// for Bleu scaling.
if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
sumOfInputs += current_input_length;
++numberOfInputs;
avgInputLength = sumOfInputs/numberOfInputs;
decoder->setAvgInputLength(avgInputLength);
cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
}

// Seed this batch position with empty slots in whichever containers the
// selected training mode uses.
vector<ScoreComponentCollection> newFeatureValues;
vector<float> newScores;
if (model_hope_fear) {
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);
}
if (hope_fear || perceptron_update) {
featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newScores);
bleuScoresFear.push_back(newScores);
modelScoresHope.push_back(newScores);
modelScoresFear.push_back(newScores);
if (historyBleu || simpleHistoryBleu || debug_model) {
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);
}
}
if (kbest) {
// kbest mode uses both the plain and the hope/fear containers.
featureValues.push_back(newFeatureValues);
bleuScores.push_back(newScores);
modelScores.push_back(newScores);


featureValuesHope.push_back(newFeatureValues);
featureValuesFear.push_back(newFeatureValues);
bleuScoresHope.push_back(newScores);
bleuScoresFear.push_back(newScores);
modelScoresHope.push_back(newScores);
modelScoresFear.push_back(newScores);
}

size_t ref_length;
float avg_ref_length;

// Optional weight diagnostics.
if (print_weights)
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
if (print_core_weights) {
cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
mosesWeights.PrintCoreFeatures();
cerr << endl;
}
00773
00774
// Sanity-check every language-model weight: a non-positive LM weight is
// considered invalid and is reset to a small positive value (0.1).
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
const StatefulFeatureFunction *ff = statefulFFs[i];
// Only stateful FFs that are actually language models are inspected.
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);

if (lm) {
float lmWeight = mosesWeights.GetScoreForProducer(lm);
cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
if (lmWeight <= 0) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
mosesWeights.Assign(lm, 0.1);
cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
}
}
}
00790
00791
cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
// --- Hope/fear (and perceptron) decoding for this example ---
if (hope_fear || perceptron_update) {
// HOPE: n-best wrt model score + Bleu (Bleu weight +1.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
"best hope translations" << endl;
vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> oracle = outputHope[0];
decoder->cleanup(chartDecoding);
ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
avg_ref_length = ref_length;
float hope_length_ratio = (float)oracle.size()/ref_length;
int oracleSize = (int)oracle.size();
cerr << endl;

// Count sparse features that fired in the hope translation.
featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();

// Normalise the hope Bleu into a "precision" value, using whichever
// scaling scheme is active, and pass it to the optimiser if requested.
float precision = bleuScoresHope[batchPosition][0];
if (historyBleu || simpleHistoryBleu) {
precision /= decoder->getTargetLengthHistory();
} else {
if (scaleByAvgInputLength) precision /= decoder->getAverageInputLength();
else if (scaleByAvgInverseLength) precision /= (100/decoder->getAverageInputLength());
precision /= scaleByX;
}
if (scale_margin_precision || scale_update_precision) {
if (historyBleu || simpleHistoryBleu || scaleByAvgInputLength || scaleByAvgInverseLength) {
cerr << "Rank " << rank << ", epoch " << epoch << ", set hope precision: " << precision << endl;
((MiraOptimiser*) optimiser)->setPrecision(precision);
}
}

// MODEL: 1-best wrt pure model score, needed for debugging and for the
// Bleu history update.
vector<const Word*> bestModel;
if (debug_model || historyBleu || simpleHistoryBleu) {

cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
bestModel = outputModel[0];
decoder->cleanup(chartDecoding);
cerr << endl;
ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
}

// FEAR: n-best wrt model score - Bleu (Bleu weight -1.0).
float fear_length_ratio = 0;
float bleuRatioHopeFear = 0;
int fearSize = 0;
cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> fear = outputFear[0];
decoder->cleanup(chartDecoding);
ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
avg_ref_length += ref_length;
avg_ref_length /= 2;
fear_length_ratio = (float)fear.size()/ref_length;
fearSize = (int)fear.size();
cerr << endl;
// Fear words are owned here; free them immediately.
// NOTE(review): the hope ('oracle') words are not freed in this scope —
// verify they are released elsewhere.
for (size_t i = 0; i < fear.size(); ++i)
delete fear[i];

// Count sparse features that fired in the fear translation.
featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();

// --- Skip heuristics: drop degenerate hope/fear pairs ---
bool skip = false;
// Hope/fear Bleu ratio bounds.
bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
skip = true;
if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
skip = true;

// With Bleu history active, hope must dominate model, and model must
// dominate fear; violations (beyond epsilon) invalidate the example.
if (historyBleu || simpleHistoryBleu) {
if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
skip = true;
}
}
if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
skip = true;
}
}
}
// Fear must never out-Bleu hope.
if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {

skip = true;
if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
} else {
cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
}
}
}

if (skip) {
// Discard this example's n-best data; its batch slot stays empty.
cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
featureValuesHope[batchPosition].clear();
featureValuesFear[batchPosition].clear();
bleuScoresHope[batchPosition].clear();
bleuScoresFear[batchPosition].clear();
if (historyBleu || simpleHistoryBleu || debug_model) {
featureValues[batchPosition].clear();
bleuScores[batchPosition].clear();
}
} else {
examples_in_batch++;

// Remember what is needed for the Bleu history update after the batch.
if (historyBleu || simpleHistoryBleu) {
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
oneBests.push_back(bestModel);
}
}
}
// --- Combined model/hope/fear decoding: all three n-best lists are
// appended into the same per-batch containers ---
if (model_hope_fear) {
// HOPE n-best (Bleu weight +1.0); its first entry is the oracle.
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
size_t oraclePos = featureValues[batchPosition].size();
decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");


inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
decoder->cleanup(chartDecoding);


cerr << endl;

// Record the oracle entry for the loss computation later on.
oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);

// MODEL n-best (Bleu weight 0.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
if (historyBleu || simpleHistoryBleu) {
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> bestModel = outputModel[0];
oneBests.push_back(bestModel);
// NOTE(review): inputLengths/ref_ids were already pushed above for this
// example; with history active they are pushed a second time here —
// confirm updateHistory() expects that.
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
} else {
decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
}
decoder->cleanup(chartDecoding);


cerr << endl;

// FEAR n-best (Bleu weight -1.0).
cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
decoder->cleanup(chartDecoding);



examples_in_batch++;
}
// --- kbest mode: decode one n-best list wrt model score, then select
// hope/fear pairs from it according to the chosen strategy ---
if (kbest) {

cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
if (historyBleu || simpleHistoryBleu) {
vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
vector<const Word*> bestModel = outputModel[0];
oneBests.push_back(bestModel);
inputLengths.push_back(current_input_length);
ref_ids.push_back(*sid);
} else {
decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
featureValues[batchPosition], bleuScores[batchPosition],
modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
}
decoder->cleanup(chartDecoding);


cerr << endl;

examples_in_batch++;

// Priority queues used by the max-Bleu-diff strategy below.
HypothesisQueue queueHope(hope_n);
HypothesisQueue queueFear(fear_n);
cerr << endl;
// Strategy 1: violated-constraint based pair selection.
if (most_violated || all_violated || one_against_all) {
float bleuHope = -1000;
float bleuFear = 1000;
// NOTE: indices initialised to (size_t)-1 as "not found" sentinels.
size_t indexHope = -1;
size_t indexFear = -1;

vector<float> bleuHopeList;
vector<float> bleuFearList;
vector<float> indexHopeList;
vector<float> indexFearList;

if (most_violated)
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
else if (all_violated)
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
else
cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";

// Find the hope entry: highest Bleu, breaking (near-)ties by higher
// model score.
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) {
if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {

bleuHope = bleuScores[batchPosition][i];
indexHope = i;
}
}
} else if (bleuScores[batchPosition][i] > bleuHope) {
bleuHope = bleuScores[batchPosition][i];
indexHope = i;
}
}

// Scan for fear candidates: entries whose margin (model diff) is
// smaller than their Bleu loss, i.e. violated constraints.
float currentViolation = 0;
float minimum_bleu_diff = 0.01;
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
float bleuDiff = bleuHope - bleuScores[batchPosition][i];
float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
if (bleuDiff > epsilon) {
if (one_against_all && bleuDiff > minimum_bleu_diff) {
// one_against_all: pair hope with every sufficiently-worse entry.
cerr << ".. adding pair";
bleuHopeList.push_back(bleuHope);
bleuFearList.push_back(bleuScores[batchPosition][i]);
indexHopeList.push_back(indexHope);
indexFearList.push_back(i);
} else if (modelDiff < bleuDiff) {
float diff = bleuDiff - modelDiff;
if (diff > epsilon) {
if (all_violated) {
cerr << ".. adding pair";
bleuHopeList.push_back(bleuHope);
bleuFearList.push_back(bleuScores[batchPosition][i]);
indexHopeList.push_back(indexHope);
indexFearList.push_back(i);
} else if (most_violated && diff > currentViolation) {
// Track the single worst violation.
currentViolation = diff;
bleuFear = bleuScores[batchPosition][i];
indexFear = i;
cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
}
}
}
}
}

if (most_violated) {
if (currentViolation > 0) {
// Commit the most-violated hope/fear pair for this example.
cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
bleuScoresHope[batchPosition].push_back(bleuHope);
bleuScoresFear[batchPosition].push_back(bleuFear);
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
float modelScoreHope = modelScores[batchPosition][indexHope];
float modelScoreFear = modelScores[batchPosition][indexFear];
if (most_violated_reg) {
// Regularise: shrink the violation by moving both model scores
// a quarter of the violation towards each other.
float reg = currentViolation/4;
modelScoreHope += abs(reg);
modelScoreFear -= abs(reg);
float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
}
modelScoresHope[batchPosition].push_back(modelScoreHope);
modelScoresFear[batchPosition].push_back(modelScoreFear);

featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
} else {
// No violated constraint: nothing to learn from this example.
cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
skip_example = 1;
}
} else cerr << endl;
}
// Strategy 2: pick hope/fear by (model+)Bleu extremes via priority queues.
if (max_bleu_diff) {
cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
float hopeScore = bleuScores[batchPosition][i];
if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
BleuIndexPair hope(hopeScore, i);
queueHope.Push(hope);

// Fear score is the negated Bleu (optionally plus model score).
float fearScore = -1*(bleuScores[batchPosition][i]);
if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
BleuIndexPair fear(fearScore, i);
queueFear.Push(fear);
}
skip_example = 0;
}
cerr << endl;

// Cross-product of the top hope_n hope and top fear_n fear entries.
vector<BleuIndexPair> hopeList, fearList;
for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
for (size_t i=0; i<hopeList.size(); ++i) {

size_t indexHope = hopeList[i].second;
float bleuHope = bleuScores[batchPosition][indexHope];
for (size_t j=0; j<fearList.size(); ++j) {

size_t indexFear = fearList[j].second;
float bleuFear = bleuScores[batchPosition][indexFear];
cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
bleuScoresHope[batchPosition].push_back(bleuHope);
bleuScoresFear[batchPosition].push_back(bleuFear);
featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
float modelScoreHope = modelScores[batchPosition][indexHope];
float modelScoreFear = modelScores[batchPosition][indexFear];

modelScoresHope[batchPosition].push_back(modelScoreHope);
modelScoresFear[batchPosition].push_back(modelScoreFear);

featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
}
}
if (!makePairs)
cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
}

// Advance to the next example in the shard.
++sid;
++actualBatchSize;
++shardPosition;
}
01144
// --- Run the optimiser on the collected batch (if non-empty) ---
if (examples_in_batch == 0 || (kbest && skip_example)) {
cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
} else {
// Losses = oracle Bleu minus each hypothesis' Bleu (model_hope_fear only).
vector<vector<float> > losses(actualBatchSize);
if (model_hope_fear) {

for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
}
}
}

// The Bleu score feature must not influence the update: zero its weight.
vector<FeatureFunction*>::const_iterator iter;
const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
mosesWeights.Assign(*iter, 0);
break;
}
}

// Optionally rescale the LM feature scores before the update.
if (scale_lm) {
cerr << "scale lm" << endl;
const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
for (size_t i = 0; i < statefulFFs.size(); ++i) {
const StatefulFeatureFunction *ff = statefulFFs[i];
const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);

if (lm) {

if (model_hope_fear) {
scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
} else {
scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
}
}
}
}

// Optionally rescale the word-penalty feature scores.
if (scale_wp) {

WordPenaltyProducer *wp = StaticData::InstanceNonConst().GetWordPenaltyProducer();


if (model_hope_fear) {
scaleFeatureScore(wp, scale_wp_factor, featureValues, rank, epoch);
} else {
scaleFeatureScore(wp, scale_wp_factor, featureValuesHope, rank, epoch);
scaleFeatureScore(wp, scale_wp_factor, featureValuesFear, rank, epoch);
}
}

// Diagnostic dump of the feature vectors entering the optimiser.
if (print_feature_values) {
cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
if (model_hope_fear) printFeatureValues(featureValues);
else {
cerr << "hope: " << endl;
printFeatureValues(featureValuesHope);
cerr << "fear: " << endl;
printFeatureValues(featureValuesFear);
}
}

// Apply either per-feature (confidence-based) or fixed learning rates.
if (feature_confidence) {
cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
if (model_hope_fear) {
applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
} else {
applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
}
} else {

cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
if (core_r0 != 1.0 || sparse_r0 != 1.0) {
if (model_hope_fear) {
applyLearningRates(featureValues, core_r0, sparse_r0);
} else {
applyLearningRates(featureValuesHope, core_r0, sparse_r0);
applyLearningRates(featureValuesFear, core_r0, sparse_r0);
}
}
}

// --- Dispatch to the appropriate optimiser variant ---
// update_status: 0 = weights updated, non-zero = no update performed.
VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
size_t update_status = 1;
ScoreComponentCollection weightUpdate;
if (perceptron_update) {
vector<vector<float> > dummy1;
update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
} else if (hope_fear) {
// Only update when the oracle Bleu clears the configured minimum.
if (bleuScoresHope[0][0] >= min_oracle_bleu) {
// Single hope/fear pair without Hildreth: closed-form update.
if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
} else
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
} else
update_status = 1;
} else if (kbest) {
if (selective)
update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSelective(
weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch);
else if (summed)
update_status = ((MiraOptimiser*)optimiser)->updateWeightsHopeFearSummed(
weightUpdate, featureValuesHope, featureValuesFear, bleuScoresHope, bleuScoresFear,
modelScoresHope, modelScoresFear, learning_rate, rank, epoch, rescaleSlack, makePairs);
else {
if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
modelScoresFear[0][0], learning_rate, rank, epoch);
} else {
cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
modelScoresFear, learning_rate, rank, epoch);
}
}
} else {
// model_hope_fear: full MIRA update with explicit oracle/losses.
update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
}

// --- Apply the computed weight update, if any ---
if (update_status == 0) {

if (debug)
cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;

if (feature_confidence) {
// Track per-feature update counts to adapt the learning rates.
confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);


featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
}

// Apply the update to the current weights.
mosesWeights.PlusEquals(weightUpdate);

if (normaliseWeights)
mosesWeights.L1Normalise();

// Accumulate for weight averaging; the binary vector counts, per
// feature, how often it carried a non-zero weight (sparse averaging).
cumulativeWeights.PlusEquals(mosesWeights);
if (sparseAverage) {
ScoreComponentCollection binary;
binary.SetToBinaryOf(mosesWeights);
cumulativeWeightsBinary.PlusEquals(binary);
}

++numberOfUpdates;
++numberOfUpdatesThisEpoch;
if (averageWeights) {
// Decode with the running average instead of the raw weights.
ScoreComponentCollection averageWeights(cumulativeWeights);
if (accumulateWeights) {
averageWeights.DivideEquals(numberOfUpdates);
} else {
averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
}

mosesWeights = averageWeights;
}

// Hand the (possibly averaged) weights back to the decoder.
decoder->setWeights(mosesWeights);

}

// Update the pseudo-document Bleu history with the model 1-bests.
if (historyBleu || simpleHistoryBleu) {
for (size_t i = 0; i < oneBests.size(); ++i)
cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
deleteTranslations(oneBests);
}
}
01342
01343
// --- Decide whether it is time to mix (and later dump) weights ---
// The schedule is expressed in shard positions: mix/dump every
// (shardSize / frequency) examples.
size_t generalShardSize;
if (trainWithMultipleFolds)
generalShardSize = order.size()/coresPerFold;
else
generalShardSize = order.size()/size;

size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);

// --- Mix (average) weights across MPI processes ---
if (mix) {
#ifdef MPI_ENABLE
cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
// Sum everyone's weights onto rank 0.
mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);

// For sparse averaging, also sum the per-feature non-zero indicators
// so each sparse weight is divided by the number of processes that
// actually had it.
ScoreComponentCollection totalBinary;
if (sparseAverage) {
ScoreComponentCollection binary;
binary.SetToBinaryOf(mosesWeights);
mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
}
if (rank == 0) {
// Turn the sum into an average (core-only, sparse-aware, or plain).
if (sparseNoAverage)
mixedWeights.CoreDivideEquals(size);
else if (sparseAverage)
mixedWeights.DivideEquals(totalBinary);
else
mixedWeights.DivideEquals(size);




if (normaliseWeights) {
mixedWeights.L1Normalise();
}

++weightMixingThisEpoch;

// Optional pruning/regularisation, performed on rank 0 only.
if (pruneZeroWeights) {
size_t pruned = mixedWeights.PruneZeroWeightFeatures();
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " zero-weighted features pruned from mixedWeights." << endl;

pruned = cumulativeWeights.PruneZeroWeightFeatures();
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
}

// Feature cutoff applies only at the last mixing of the epoch.
if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " features pruned from mixedWeights." << endl;

pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< pruned << " features pruned from cumulativeWeights." << endl;
}

// L1/L2 regularisation at the last mixing (or every mixing if asked).
if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
if (l1_regularize) {
size_t pruned;
if (l1_reg_sparse)
pruned = mixedWeights.SparseL1Regularize(l1_lambda);
else
pruned = mixedWeights.L1Regularize(l1_lambda);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
}
if (l2_regularize) {
if (l2_reg_sparse)
mixedWeights.SparseL2Regularize(l2_lambda);
else
mixedWeights.L2Regularize(l2_lambda);
cerr << "Rank " << rank << ", epoch " << epoch << ", "
<< "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
}
}
}

// Distribute the mixed weights back to every process and decode with them.
mpi::broadcast(world, mixedWeights, 0);
decoder->setWeights(mixedWeights);
mosesWeights = mixedWeights;




#endif
#ifndef MPI_ENABLE
// Single-process build: "mixing" is just a copy.
mixedWeights = mosesWeights;
#endif
}
01443
01444
01445 if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
01446
01447
01448 ostringstream filename;
01449 if (epoch < 10)
01450 filename << weightDumpStem << "_mixed_0" << epoch;
01451 else
01452 filename << weightDumpStem << "_mixed_" << epoch;
01453
01454 if (weightDumpFrequency > 1)
01455 filename << "_" << weightEpochDump;
01456
01457 mixedWeights.Save(filename.str());
01458 cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01459 }
01460 if (dumpMixedWeights) {
01461 if (mix && rank == 0 && !weightDumpStem.empty()) {
01462
01463 ostringstream filename;
01464 if (epoch < 10)
01465 filename << weightDumpStem << "_0" << epoch;
01466 else
01467 filename << weightDumpStem << "_" << epoch;
01468
01469 if (weightDumpFrequency > 1)
01470 filename << "_" << weightEpochDump;
01471
01472 cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01473 mixedWeights.Save(filename.str());
01474 ++weightEpochDump;
01475 }
01476 } else {
01477 if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
01478 cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
01479 ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
01480 bool proceed = false;
01481 if (accumulateWeights) {
01482 if (numberOfUpdates > 0) {
01483 tmpAverageWeights.DivideEquals(numberOfUpdates);
01484 proceed = true;
01485 }
01486 } else {
01487 if (numberOfUpdatesThisEpoch > 0) {
01488 if (sparseNoAverage)
01489 tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
01490 else if (sparseAverage)
01491 tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
01492 else
01493 tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
01494 proceed = true;
01495 }
01496 }
01497
01498 if (proceed) {
01499 #ifdef MPI_ENABLE
01500
01501 mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
01502 ScoreComponentCollection totalBinary;
01503 if (sparseAverage) {
01504 ScoreComponentCollection binary;
01505 binary.SetToBinaryOf(mosesWeights);
01506 mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
01507 }
01508 #endif
01509 #ifndef MPI_ENABLE
01510 mixedAverageWeights = tmpAverageWeights;
01511
01512 ScoreComponentCollection totalBinary;
01513 #endif
01514 if (rank == 0 && !weightDumpStem.empty()) {
01515
01516 if (sparseNoAverage)
01517 mixedAverageWeights.CoreDivideEquals(size);
01518 else if (sparseAverage)
01519 mixedAverageWeights.DivideEquals(totalBinary);
01520 else
01521 mixedAverageWeights.DivideEquals(size);
01522
01523
01524 if (normaliseWeights) {
01525 mixedAverageWeights.L1Normalise();
01526 }
01527
01528
01529 ostringstream filename;
01530 if (epoch < 10) {
01531 filename << weightDumpStem << "_0" << epoch;
01532 } else {
01533 filename << weightDumpStem << "_" << epoch;
01534 }
01535
01536 if (weightDumpFrequency > 1) {
01537 filename << "_" << weightEpochDump;
01538 }
01539
01540
01541
01542
01543
01544
01545
01546 cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01547 mixedAverageWeights.Save(filename.str());
01548 ++weightEpochDump;
01549
01550 if (weightEpochDump == weightDumpFrequency) {
01551 if (l1_regularize) {
01552 size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
01553 cerr << "Rank " << rank << ", epoch " << epoch << ", "
01554 << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
01555
01556 }
01557 if (l2_regularize) {
01558 mixedAverageWeights.SparseL2Regularize(l2_lambda);
01559 cerr << "Rank " << rank << ", epoch " << epoch << ", "
01560 << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
01561 }
01562
01563 if (l1_regularize || l2_regularize) {
01564 filename << "_reg";
01565 cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
01566 mixedAverageWeights.Save(filename.str());
01567 }
01568 }
01569
01570 if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
01571
01572 stringstream s1, s2;
01573 s1 << "sparse_feature_hope_counts" << "_" << epoch;
01574 s2 << "sparse_feature_fear_counts" << "_" << epoch;
01575 ofstream sparseFeatureCountsHope(s1.str().c_str());
01576 ofstream sparseFeatureCountsFear(s2.str().c_str());
01577
01578 mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
01579 mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
01580 sparseFeatureCountsHope.close();
01581 sparseFeatureCountsFear.close();
01582 }
01583 }
01584 }
01585 }
01586 }
01587 }
01588 cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;
01589
01590 if (historyBleu || simpleHistoryBleu) {
01591 cerr << "Bleu feature history after epoch " << epoch << endl;
01592 decoder->printBleuFeatureHistory(cerr);
01593 }
01594
01595
01596
01597 size_t sumUpdates;
01598 size_t *sendbuf_uint, *recvbuf_uint;
01599 sendbuf_uint = (size_t *) malloc(sizeof(size_t));
01600 recvbuf_uint = (size_t *) malloc(sizeof(size_t));
01601 #ifdef MPI_ENABLE
01602 sendbuf_uint[0] = numberOfUpdatesThisEpoch;
01603 recvbuf_uint[0] = 0;
01604 MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
01605 sumUpdates = recvbuf_uint[0];
01606 #endif
01607 #ifndef MPI_ENABLE
01608 sumUpdates = numberOfUpdatesThisEpoch;
01609 #endif
01610 if (rank == 0 && sumUpdates == 0) {
01611 cerr << "\nNo weight updates during this epoch.. stopping." << endl;
01612 stop = true;
01613 #ifdef MPI_ENABLE
01614 mpi::broadcast(world, stop, 0);
01615 #endif
01616 }
01617
01618 if (!stop) {
01619
01620 if (weightConvergence) {
01621 bool reached = true;
01622 if (rank == 0 && (epoch >= 2)) {
01623 ScoreComponentCollection firstDiff, secondDiff;
01624 if (dumpMixedWeights) {
01625 firstDiff = mixedWeights;
01626 firstDiff.MinusEquals(mixedWeightsPrevious);
01627 secondDiff = mixedWeights;
01628 secondDiff.MinusEquals(mixedWeightsBeforePrevious);
01629 } else {
01630 firstDiff = mixedAverageWeights;
01631 firstDiff.MinusEquals(mixedAverageWeightsPrevious);
01632 secondDiff = mixedAverageWeights;
01633 secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
01634 }
01635 VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
01636 VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
01637
01638
01639
01640 if (firstDiff.GetLInfNorm() >= min_weight_change)
01641 reached = false;
01642 if (secondDiff.GetLInfNorm() >= min_weight_change)
01643 reached = false;
01644 if (reached) {
01645
01646 stop = true;
01647 cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
01648 ScoreComponentCollection dummy;
01649 ostringstream endfilename;
01650 endfilename << "stopping";
01651 dummy.Save(endfilename.str());
01652 }
01653 }
01654
01655 mixedWeightsBeforePrevious = mixedWeightsPrevious;
01656 mixedWeightsPrevious = mixedWeights;
01657 mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
01658 mixedAverageWeightsPrevious = mixedAverageWeights;
01659 #ifdef MPI_ENABLE
01660 mpi::broadcast(world, stop, 0);
01661 #endif
01662 }
01663 }
01664 }
01665
01666 #ifdef MPI_ENABLE
01667 MPI_Finalize();
01668 #endif
01669
01670 time(&now);
01671 cerr << "Rank " << rank << ", " << ctime(&now);
01672
01673 if (rank == 0) {
01674 ScoreComponentCollection dummy;
01675 ostringstream endfilename;
01676 endfilename << "finished";
01677 dummy.Save(endfilename.str());
01678 }
01679
01680 delete decoder;
01681 exit(0);
01682 }
01683
// Read one sentence per line from 'filename' into 'sentences'.
// Returns false if the file cannot be opened, true otherwise.
bool loadSentences(const string& filename, vector<string>& sentences)
{
  ifstream input(filename.c_str());
  if (!input)
    return false;
  for (string line; getline(input, line); )
    sentences.push_back(line);
  return true;
}
01694
// Decide whether to mix/dump at the current shard position.
// A base of 0 disables the check entirely. With a batch size > 1 the exact
// multiple of the base may fall on any position covered by the batch just
// processed, so every position in
// [shard_position - actual_batch_size + 1, shard_position] is tested.
// (Positions never underflow: position 0 always satisfies the modulo test.)
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0) return false;  // was 'return 0' in a bool function
  if (actual_batch_size > 1) {
    size_t numberSubtracts = actual_batch_size;
    do {
      if (shard_position % mix_or_dump_base == 0)
        return true;
      --shard_position;
      --numberSubtracts;
    } while (numberSubtracts > 0);
    return false;
  }
  return (shard_position % mix_or_dump_base) == 0;
}
01714
01715 void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
01716 {
01717 for (size_t i = 0; i < featureValues.size(); ++i) {
01718 for (size_t j = 0; j < featureValues[i].size(); ++j) {
01719 cerr << featureValues[i][j] << endl;
01720 }
01721 }
01722 cerr << endl;
01723 }
01724
01725 void deleteTranslations(vector<vector<const Word*> > &translations)
01726 {
01727 for (size_t i = 0; i < translations.size(); ++i) {
01728 for (size_t j = 0; j < translations[i].size(); ++j) {
01729 delete translations[i][j];
01730 }
01731 }
01732 }
01733
01734 void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
01735 {
01736 if (decode == 1)
01737 cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
01738 else if (decode == 2)
01739 cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
01740 else
01741 cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;
01742
01743
01744 vector<size_t> order;
01745 for (size_t i = 0; i < inputSentences.size(); ++i)
01746 order.push_back(i);
01747
01748 vector<size_t> shard;
01749 float shardSize = (float) (order.size()) / size;
01750 size_t shardStart = (size_t) (shardSize * rank);
01751 size_t shardEnd = (size_t) (shardSize * (rank + 1));
01752 if (rank == size - 1) {
01753 shardEnd = inputSentences.size();
01754 shardSize = shardEnd - shardStart;
01755 }
01756 VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
01757 VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
01758 shard.resize(shardSize);
01759 copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
01760
01761
01762 stringstream fname;
01763 fname << filename << ".rank" << rank;
01764 filename = fname.str();
01765 ostringstream filename_nbest;
01766 filename_nbest << filename << "." << n << "best";
01767 ofstream out(filename.c_str());
01768 ofstream nbest_out((filename_nbest.str()).c_str());
01769 if (!out) {
01770 ostringstream msg;
01771 msg << "Unable to open " << fname.str();
01772 throw runtime_error(msg.str());
01773 }
01774 if (!nbest_out) {
01775 ostringstream msg;
01776 msg << "Unable to open " << filename_nbest;
01777 throw runtime_error(msg.str());
01778 }
01779
01780 for (size_t i = 0; i < shard.size(); ++i) {
01781 size_t sid = shard[i];
01782 string& input = inputSentences[sid];
01783
01784 vector<vector<ScoreComponentCollection> > dummyFeatureValues;
01785 vector<vector<float> > dummyBleuScores;
01786 vector<vector<float> > dummyModelScores;
01787
01788 vector<ScoreComponentCollection> newFeatureValues;
01789 vector<float> newScores;
01790 dummyFeatureValues.push_back(newFeatureValues);
01791 dummyBleuScores.push_back(newScores);
01792 dummyModelScores.push_back(newScores);
01793
01794 float factor = 0.0;
01795 if (decode == 1) factor = 1.0;
01796 if (decode == 2) factor = -1.0;
01797 cerr << "Rank " << rank << ", translating sentence " << sid << endl;
01798 bool realBleu = false;
01799 vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
01800 dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
01801 cerr << endl;
01802 decoder->cleanup(StaticData::Instance().IsChart());
01803
01804 for (size_t i = 0; i < nbestOutput.size(); ++i) {
01805 vector<const Word*> output = nbestOutput[i];
01806 stringstream translation;
01807 for (size_t k = 0; k < output.size(); ++k) {
01808 Word* w = const_cast<Word*>(output[k]);
01809 translation << w->GetString(0);
01810 translation << " ";
01811 }
01812
01813 if (i == 0)
01814 out << translation.str() << endl;
01815 nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
01816 " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
01817 }
01818 }
01819
01820 out.close();
01821 nbest_out.close();
01822 cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;
01823
01824 #ifdef MPI_ENABLE
01825 MPI_Finalize();
01826 #endif
01827
01828 time_t now;
01829 time(&now);
01830 cerr << "Rank " << rank << ", " << ctime(&now);
01831
01832 delete decoder;
01833 exit(0);
01834 }
01835
01836 void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
01837 {
01838 for (size_t i=0; i<featureValues.size(); ++i)
01839 for (size_t j=0; j<featureValues[i].size(); ++j)
01840 featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
01841 }
01842
01843 void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
01844 {
01845 for (size_t i=0; i<featureValues.size(); ++i)
01846 for (size_t j=0; j<featureValues[i].size(); ++j)
01847 featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
01848 }
01849
01850 void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
01851 {
01852 string name = sp->GetScoreProducerDescription();
01853
01854
01855 float featureScore;
01856 for (size_t i=0; i<featureValues.size(); ++i) {
01857 for (size_t j=0; j<featureValues[i].size(); ++j) {
01858 featureScore = featureValues[i][j].GetScoreForProducer(sp);
01859 featureValues[i][j].Assign(sp, featureScore*scaling_factor);
01860
01861 }
01862 }
01863 }
01864
01865 void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
01866 {
01867 string name = sp->GetScoreProducerDescription();
01868
01869
01870 for (size_t i=0; i<featureValues.size(); ++i) {
01871 for (size_t j=0; j<featureValues[i].size(); ++j) {
01872 vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
01873 for (size_t k=0; k<featureScores.size(); ++k)
01874 featureScores[k] *= scaling_factor;
01875 featureValues[i][j].Assign(sp, featureScores);
01876
01877 }
01878 }
01879 }