Moses: /disk4/html/www/moses/doxygen/mosesdecoder/phrase-extract/extract-ghkm/ExtractGHKM.cpp Source File

00001 /***********************************************************************
00002  Moses - statistical machine translation system
00003  Copyright (C) 2006-2011 University of Edinburgh
00004 
00005  This library is free software; you can redistribute it and/or
00006  modify it under the terms of the GNU Lesser General Public
00007  License as published by the Free Software Foundation; either
00008  version 2.1 of the License, or (at your option) any later version.
00009 
00010  This library is distributed in the hope that it will be useful,
00011  but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013  Lesser General Public License for more details.
00014 
00015  You should have received a copy of the GNU Lesser General Public
00016  License along with this library; if not, write to the Free Software
00017  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00018 ***********************************************************************/
00019 
00020 #include "ExtractGHKM.h"
00021 
00022 #include <cassert>
00023 #include <cstdlib>
00024 #include <fstream>
00025 #include <iostream>
00026 #include <iterator>
00027 #include <string>
00028 #include <sstream>
00029 #include <vector>
00030 
00031 #include <boost/program_options.hpp>
00032 
00033 #include "syntax-common/exception.h"
00034 #include "syntax-common/xml_tree_parser.h"
00035 
00036 #include "InputFileStream.h"
00037 #include "OutputFileStream.h"
00038 #include "SyntaxNode.h"
00039 #include "SyntaxNodeCollection.h"
00040 #include "SyntaxTree.h"
00041 #include "tables-core.h"
00042 #include "XmlException.h"
00043 #include "XmlTree.h"
00044 
00045 #include "Alignment.h"
00046 #include "AlignmentGraph.h"
00047 #include "Node.h"
00048 #include "Options.h"
00049 #include "PhraseOrientation.h"
00050 #include "ScfgRule.h"
00051 #include "ScfgRuleWriter.h"
00052 #include "Span.h"
00053 #include "StsgRule.h"
00054 #include "StsgRuleWriter.h"
00055 
00056 namespace MosesTraining
00057 {
00058 namespace Syntax
00059 {
00060 namespace GHKM
00061 {
00062 
00063 int ExtractGHKM::Main(int argc, char *argv[])
00064 {
00065   using Moses::InputFileStream;
00066   using Moses::OutputFileStream;
00067 
00068   // Process command-line options.
00069   Options options;
00070   ProcessOptions(argc, argv, options);
00071 
00072   // Open input files.
00073   //
00074   // The GHKM algorithm is neutral about whether the model is string-to-tree or
00075   // tree-to-string.  This implementation assumes the model to be
00076   // string-to-tree, but if the -t2s option is given then the source and target
00077   // input files are switched prior to extraction and then the source and
00078   // target of the extracted rules are switched on output.
00079   std::string effectiveTargetFile = options.t2s ? options.sourceFile
00080                                     : options.targetFile;
00081   std::string effectiveSourceFile = options.t2s ? options.targetFile
00082                                     : options.sourceFile;
00083   InputFileStream targetStream(effectiveTargetFile);
00084   InputFileStream sourceStream(effectiveSourceFile);
00085   InputFileStream alignmentStream(options.alignmentFile);
00086 
00087   // Open output files.
00088   OutputFileStream fwdExtractStream;
00089   OutputFileStream invExtractStream;
00090   OutputFileStream glueGrammarStream;
00091   OutputFileStream targetUnknownWordStream;
00092   OutputFileStream sourceUnknownWordStream;
00093   OutputFileStream sourceLabelSetStream;
00094   OutputFileStream unknownWordSoftMatchesStream;
00095 
00096   std::string fwdFileName = options.extractFile;
00097   std::string invFileName = options.extractFile + std::string(".inv");
00098   if (options.gzOutput) {
00099     fwdFileName += ".gz";
00100     invFileName += ".gz";
00101   }
00102   OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
00103   OpenOutputFileOrDie(invFileName, invExtractStream);
00104 
00105   if (!options.glueGrammarFile.empty()) {
00106     OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
00107   }
00108   if (!options.targetUnknownWordFile.empty()) {
00109     OpenOutputFileOrDie(options.targetUnknownWordFile, targetUnknownWordStream);
00110   }
00111   if (!options.sourceUnknownWordFile.empty()) {
00112     OpenOutputFileOrDie(options.sourceUnknownWordFile, sourceUnknownWordStream);
00113   }
00114   if (!options.sourceLabelSetFile.empty()) {
00115     if (!options.sourceLabels) {
00116       Error("SourceLabels should be active if SourceLabelSet is supposed to be written to a file");
00117     }
00118     OpenOutputFileOrDie(options.sourceLabelSetFile, sourceLabelSetStream); // note that this is not a global source label set if extraction is parallelized
00119   }
00120   if (!options.unknownWordSoftMatchesFile.empty()) {
00121     OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
00122   }
00123 
00124   // Word count statistics for producing unknown word labels.
00125   std::map<std::string, int> targetWordCount;
00126   std::map<std::string, std::string> targetWordLabel;
00127 
00128   // Word count statistics for producing unknown word labels: source side.
00129   std::map<std::string, int> sourceWordCount;
00130   std::map<std::string, std::string> sourceWordLabel;
00131 
00132   std::string targetLine;
00133   std::string sourceLine;
00134   std::string alignmentLine;
00135   Alignment alignment;
00136   XmlTreeParser targetXmlTreeParser;
00137   XmlTreeParser sourceXmlTreeParser;
00138   ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
00139   StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
00140   size_t lineNum = options.sentenceOffset;
00141   while (true) {
00142     std::getline(targetStream, targetLine);
00143     std::getline(sourceStream, sourceLine);
00144     std::getline(alignmentStream, alignmentLine);
00145 
00146     if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) {
00147       break;
00148     }
00149 
00150     if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) {
00151       Error("Files must contain same number of lines");
00152     }
00153 
00154     ++lineNum;
00155 
00156     // Parse target tree.
00157     if (targetLine.size() == 0) {
00158       std::cerr << "skipping line " << lineNum << " with empty target tree\n";
00159       continue;
00160     }
00161     std::auto_ptr<SyntaxTree> targetParseTree;
00162     try {
00163       targetParseTree = targetXmlTreeParser.Parse(targetLine);
00164       assert(targetParseTree.get());
00165     } catch (const Exception &e) {
00166       std::ostringstream oss;
00167       oss << "Failed to parse target XML tree at line " << lineNum;
00168       if (!e.msg().empty()) {
00169         oss << ": " << e.msg();
00170       }
00171       Error(oss.str());
00172     }
00173 
00174     // Read source tokens (and parse tree if using source labels).
00175     std::vector<std::string> sourceTokens;
00176     std::auto_ptr<SyntaxTree> sourceParseTree;
00177     if (!options.sourceLabels) {
00178       sourceTokens = ReadTokens(sourceLine);
00179     } else {
00180       try {
00181         sourceParseTree = sourceXmlTreeParser.Parse(sourceLine);
00182         assert(sourceParseTree.get());
00183       } catch (const Exception &e) {
00184         std::ostringstream oss;
00185         oss << "Failed to parse source XML tree at line " << lineNum;
00186         if (!e.msg().empty()) {
00187           oss << ": " << e.msg();
00188         }
00189         Error(oss.str());
00190       }
00191       sourceTokens = sourceXmlTreeParser.words();
00192     }
00193 
00194     // Read word alignments.
00195     try {
00196       ReadAlignment(alignmentLine, alignment);
00197     } catch (const Exception &e) {
00198       std::ostringstream oss;
00199       oss << "Failed to read alignment at line " << lineNum << ": ";
00200       oss << e.msg();
00201       Error(oss.str());
00202     }
00203     if (alignment.size() == 0) {
00204       std::cerr << "skipping line " << lineNum << " without alignment points\n";
00205       continue;
00206     }
00207     if (options.t2s) {
00208       FlipAlignment(alignment);
00209     }
00210 
00211     // Record word counts.
00212     if (!options.targetUnknownWordFile.empty()) {
00213       CollectWordLabelCounts(*targetParseTree, options, targetWordCount,
00214                              targetWordLabel);
00215     }
00216 
00217     // Record word counts: source side.
00218     if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
00219       CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount,
00220                              sourceWordLabel);
00221     }
00222 
00223     // Form an alignment graph from the target tree, source words, and
00224     // alignment.
00225     AlignmentGraph graph(targetParseTree.get(), sourceTokens, alignment);
00226 
00227     // Extract minimal rules, adding each rule to its root node's rule set.
00228     graph.ExtractMinimalRules(options);
00229 
00230     // Extract composed rules.
00231     if (!options.minimal) {
00232       graph.ExtractComposedRules(options);
00233     }
00234 
00235     // Initialize phrase orientation scoring object
00236     PhraseOrientation phraseOrientation(sourceTokens.size(),
00237                                         targetXmlTreeParser.words().size(), alignment);
00238 
00239     // Write the rules, subject to scope pruning.
00240     const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
00241     for (std::vector<Node *>::const_iterator p = targetNodes.begin();
00242          p != targetNodes.end(); ++p) {
00243 
00244       const std::vector<const Subgraph *> &rules = (*p)->GetRules();
00245 
00246       PhraseOrientation::REO_CLASS l2rOrientation=PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=PhraseOrientation::REO_CLASS_UNKNOWN;
00247       if (options.phraseOrientation && !rules.empty()) {
00248         int sourceSpanBegin = *((*p)->GetSpan().begin());
00249         int sourceSpanEnd   = *((*p)->GetSpan().rbegin());
00250         l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_L2R);
00251         r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_R2L);
00252         // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl;
00253         // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl;
00254       }
00255 
00256       for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
00257            q != rules.end(); ++q) {
00258         // STSG output.
00259         if (options.stsg) {
00260           StsgRule rule(**q);
00261           if (rule.Scope() <= options.maxScope) {
00262             stsgWriter.Write(rule);
00263           }
00264           continue;
00265         }
00266         // SCFG output.
00267         ScfgRule *r = 0;
00268         if (options.sourceLabels) {
00269           r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
00270         } else {
00271           r = new ScfgRule(**q);
00272         }
00273         // TODO Can scope pruning be done earlier?
00274         if (r->Scope() <= options.maxScope) {
00275           scfgWriter.Write(*r,lineNum,false);
00276           if (options.treeFragments) {
00277             fwdExtractStream << " {{Tree ";
00278             (*q)->PrintTree(fwdExtractStream);
00279             fwdExtractStream << "}}";
00280           }
00281           if (options.partsOfSpeech) {
00282             fwdExtractStream << " {{POS";
00283             (*q)->PrintPartsOfSpeech(fwdExtractStream);
00284             fwdExtractStream << "}}";
00285           }
00286           if (options.phraseOrientation) {
00287             fwdExtractStream << " {{Orientation ";
00288             phraseOrientation.WriteOrientation(fwdExtractStream,l2rOrientation);
00289             fwdExtractStream << " ";
00290             phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation);
00291             fwdExtractStream << "}}";
00292             phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,l2rOrientation,1);
00293             phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,r2lOrientation,1);
00294           }
00295           fwdExtractStream << std::endl;
00296           invExtractStream << std::endl;
00297         }
00298         delete r;
00299       }
00300     }
00301   }
00302 
00303   if (options.phraseOrientation) {
00304     std::string phraseOrientationPriorsFileName = options.extractFile + std::string(".phraseOrientationPriors");
00305     OutputFileStream phraseOrientationPriorsStream;
00306     OpenOutputFileOrDie(phraseOrientationPriorsFileName, phraseOrientationPriorsStream);
00307     PhraseOrientation::WritePriorCounts(phraseOrientationPriorsStream);
00308   }
00309 
00310   std::map<std::string,size_t> sourceLabels;
00311   if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
00312     std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
00313     extendedLabelSet.insert("XLHS"); // non-matching label (left-hand side)
00314     extendedLabelSet.insert("XRHS"); // non-matching label (right-hand side)
00315     extendedLabelSet.insert("TOPLABEL");  // as used in the glue grammar
00316     extendedLabelSet.insert("SOMELABEL"); // as used in the glue grammar
00317     size_t index = 0;
00318     for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
00319          iter!=extendedLabelSet.end(); ++iter, ++index) {
00320       sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
00321     }
00322     WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
00323   }
00324 
00325   std::set<std::string> strippedTargetLabelSet;
00326   std::map<std::string, int> strippedTargetTopLabelSet;
00327   if (options.stripBitParLabels &&
00328       (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
00329     StripBitParLabels(targetXmlTreeParser.label_set(),
00330                       targetXmlTreeParser.top_label_set(),
00331                       strippedTargetLabelSet, strippedTargetTopLabelSet);
00332   }
00333 
00334   if (!options.glueGrammarFile.empty()) {
00335     if (options.stripBitParLabels) {
00336       WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
00337     } else {
00338       WriteGlueGrammar(targetXmlTreeParser.label_set(),
00339                        targetXmlTreeParser.top_label_set(),
00340                        sourceLabels, options, glueGrammarStream);
00341     }
00342   }
00343 
00344   if (!options.targetUnknownWordFile.empty()) {
00345     WriteUnknownWordLabel(targetWordCount, targetWordLabel, options, targetUnknownWordStream);
00346   }
00347 
00348   if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
00349     WriteUnknownWordLabel(sourceWordCount, sourceWordLabel, options, sourceUnknownWordStream, true);
00350   }
00351 
00352   if (!options.unknownWordSoftMatchesFile.empty()) {
00353     if (options.stripBitParLabels) {
00354       WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
00355     } else {
00356       WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
00357                                   unknownWordSoftMatchesStream);
00358     }
00359   }
00360 
00361   return 0;
00362 }
00363 
00364 void ExtractGHKM::ProcessOptions(int argc, char *argv[],
00365                                  Options &options) const
00366 {
00367   namespace po = boost::program_options;
00368   namespace cls = boost::program_options::command_line_style;
00369 
00370   // Construct the 'top' of the usage message: the bit that comes before the
00371   // options list.
00372   std::ostringstream usageTop;
00373   usageTop << "Usage: " << name()
00374            << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n"
00375            << "SCFG rule extractor based on the GHKM algorithm described in\n"
00376            << "Galley et al. (2004).\n\n"
00377            << "Options";
00378 
00379   // Construct the 'bottom' of the usage message.
00380   std::ostringstream usageBottom;
00381   usageBottom << "\nImplementation Notes:\n"
00382               << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n"
00383               << "\n"
00384               << "For the composed rule constraints: rule depth is the "
00385               "maximum distance from the\nrule's root node to a sink "
00386               "node, not counting preterminal expansions or word\n"
00387               "alignments.  Rule size is the measure defined in DeNeefe "
00388               "et al (2007): the\nnumber of non-part-of-speech, non-leaf "
00389               "constituent labels in the target tree.\nNode count is the "
00390               "number of target tree nodes (excluding target words).\n"
00391               << "\n"
00392               << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n"
00393               << "\n"
00394               << "Unaligned source words are attached to the tree using the "
00395               "following heuristic:\nif there are aligned source words to "
00396               "both the left and the right of an unaligned\nsource word "
00397               "then it is attached to the lowest common ancestor of its "
00398               "nearest\nsuch left and right neighbours.  Otherwise, it is "
00399               "attached to the root of the\nparse tree.\n"
00400               << "\n"
00401               << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. (2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n"
00402               << "\n"
00403               << "References:\n"
00404               << "Galley, M., Hopkins, M., Knight, K., and Marcu, D. (2004)\n"
00405               << "\"What's in a Translation Rule?\", In Proceedings of HLT/NAACL 2004.\n"
00406               << "\n"
00407               << "DeNeefe, S., Knight, K., Wang, W., and Marcu, D. (2007)\n"
00408               << "\"What Can Syntax-Based MT Learn from Phrase-Based MT?\", In Proceedings of\nEMNLP-CoNLL 2007.\n"
00409               << "\n"
00410               << "Hopkins, M. and Langmead, G. (2010)\n"
00411               << "\"SCFG Decoding Without Binarization\", In Proceedings of EMNLP 2010.\n"
00412               << "\n"
00413               << "Chung, T. and Fang, L. and Gildea, D. (2011)\n"
00414               << "\"Issues Concerning Decoding with Synchronous Context-free Grammar\", In\nProceedings of ACL/HLT 2011.";
00415 
00416   // Declare the command line options that are visible to the user.
00417   po::options_description visible(usageTop.str());
00418   visible.add_options()
00419   //("help", "print this help message and exit")
00420   ("AllowUnary",
00421    "allow fully non-lexical unary rules")
00422   ("ConditionOnTargetLHS",
00423    "write target LHS instead of \"X\" as source LHS")
00424   ("GlueGrammar",
00425    po::value(&options.glueGrammarFile),
00426    "write glue grammar to named file")
00427   ("GZOutput",
00428    "write gzipped extract files")
00429   ("IncludeSentenceId",
00430    "include sentence ID")
00431   ("MaxNodes",
00432    po::value(&options.maxNodes)->default_value(options.maxNodes),
00433    "set maximum number of tree nodes for composed rules")
00434   ("MaxRuleDepth",
00435    po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
00436    "set maximum depth for composed rules")
00437   ("MaxRuleSize",
00438    po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
00439    "set maximum size for composed rules")
00440   ("MaxScope",
00441    po::value(&options.maxScope)->default_value(options.maxScope),
00442    "set maximum allowed scope")
00443   ("Minimal",
00444    "extract minimal rules only")
00445   ("PartsOfSpeech",
00446    "output parts-of-speech as property (preterminals from the parse tree)")
00447   ("PartsOfSpeechFactor",
00448    "output parts-of-speech as factor (preterminals from the parse tree)")
00449   ("PCFG",
00450    "include score based on PCFG scores in target corpus")
00451   ("PhraseOrientation",
00452    "output phrase orientation information")
00453   ("StripBitParLabels",
00454    "strip suffix starting with a hyphen symbol (\"-\") from non-terminal labels")
00455   ("STSG",
00456    "output STSG rules (default is SCFG)")
00457   ("T2S",
00458    "enable tree-to-string rule extraction (string-to-tree is assumed by default)")
00459   ("TreeFragments",
00460    "output parse tree information")
00461   ("SourceLabels",
00462    "output source syntax label information")
00463   ("SourceLabelSet",
00464    po::value(&options.sourceLabelSetFile),
00465    "write source syntax label set to named file")
00466   ("SentenceOffset",
00467    po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
00468    "set sentence number offset if processing split corpus")
00469   ("UnknownWordLabel",
00470    po::value(&options.targetUnknownWordFile),
00471    "write unknown word labels to named file")
00472   ("SourceUnknownWordLabel",
00473    po::value(&options.sourceUnknownWordFile),
00474    "write source syntax unknown word labels to named file")
00475   ("UnknownWordMinRelFreq",
00476    po::value(&options.unknownWordMinRelFreq)->default_value(
00477      options.unknownWordMinRelFreq),
00478    "set minimum relative frequency for unknown word labels")
00479   ("UnknownWordSoftMatches",
00480    po::value(&options.unknownWordSoftMatchesFile),
00481    "write dummy value to unknown word label file, and mappings from dummy value to other labels to named file")
00482   ("UnknownWordUniform",
00483    "write uniform weights to unknown word label file")
00484   ("UnpairedExtractFormat",
00485    "do not pair non-terminals in extract files")
00486   ;
00487 
00488   // Declare the command line options that are hidden from the user
00489   // (these are used as positional options).
00490   po::options_description hidden("Hidden options");
00491   hidden.add_options()
00492   ("TargetFile",
00493    po::value(&options.targetFile),
00494    "target file")
00495   ("SourceFile",
00496    po::value(&options.sourceFile),
00497    "source file")
00498   ("AlignmentFile",
00499    po::value(&options.alignmentFile),
00500    "alignment file")
00501   ("ExtractFile",
00502    po::value(&options.extractFile),
00503    "extract file")
00504   ;
00505 
00506   // Compose the full set of command-line options.
00507   po::options_description cmdLineOptions;
00508   cmdLineOptions.add(visible).add(hidden);
00509 
00510   // Register the positional options.
00511   po::positional_options_description p;
00512   p.add("TargetFile", 1);
00513   p.add("SourceFile", 1);
00514   p.add("AlignmentFile", 1);
00515   p.add("ExtractFile", 1);
00516 
00517   // Process the command-line.
00518   po::variables_map vm;
00519   try {
00520     po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
00521               options(cmdLineOptions).positional(p).run(), vm);
00522     po::notify(vm);
00523   } catch (const std::exception &e) {
00524     std::ostringstream msg;
00525     msg << e.what() << "\n\n" << visible << usageBottom.str();
00526     Error(msg.str());
00527   }
00528 
00529   if (vm.count("help")) {
00530     std::cout << visible << usageBottom.str() << std::endl;
00531     std::exit(0);
00532   }
00533 
00534   // Check all positional options were given.
00535   if (!vm.count("TargetFile") ||
00536       !vm.count("SourceFile") ||
00537       !vm.count("AlignmentFile") ||
00538       !vm.count("ExtractFile")) {
00539     std::ostringstream msg;
00540     std::cerr << visible << usageBottom.str() << std::endl;
00541     std::exit(1);
00542   }
00543 
00544   // Process Boolean options.
00545   if (vm.count("AllowUnary")) {
00546     options.allowUnary = true;
00547   }
00548   if (vm.count("ConditionOnTargetLHS")) {
00549     options.conditionOnTargetLhs = true;
00550   }
00551   if (vm.count("GZOutput")) {
00552     options.gzOutput = true;
00553   }
00554   if (vm.count("IncludeSentenceId")) {
00555     options.includeSentenceId = true;
00556   }
00557   if (vm.count("Minimal")) {
00558     options.minimal = true;
00559   }
00560   if (vm.count("PartsOfSpeech")) {
00561     options.partsOfSpeech = true;
00562   }
00563   if (vm.count("PartsOfSpeechFactor")) {
00564     options.partsOfSpeechFactor = true;
00565   }
00566   if (vm.count("PCFG")) {
00567     options.pcfg = true;
00568   }
00569   if (vm.count("PhraseOrientation")) {
00570     options.phraseOrientation = true;
00571   }
00572   if (vm.count("StripBitParLabels")) {
00573     options.stripBitParLabels = true;
00574   }
00575   if (vm.count("STSG")) {
00576     options.stsg = true;
00577   }
00578   if (vm.count("T2S")) {
00579     options.t2s = true;
00580   }
00581   if (vm.count("TreeFragments")) {
00582     options.treeFragments = true;
00583   }
00584   if (vm.count("SourceLabels")) {
00585     options.sourceLabels = true;
00586   }
00587   if (vm.count("UnknownWordUniform")) {
00588     options.unknownWordUniform = true;
00589   }
00590   if (vm.count("UnpairedExtractFormat")) {
00591     options.unpairedExtractFormat = true;
00592   }
00593 
00594   // Workaround for extract-parallel issue.
00595   if (options.sentenceOffset > 0) {
00596     options.targetUnknownWordFile.clear();
00597   }
00598   if (options.sentenceOffset > 0) {
00599     options.sourceUnknownWordFile.clear();
00600     options.unknownWordSoftMatchesFile.clear();
00601   }
00602 }
00603 
00604 std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s) const
00605 {
00606   std::vector<std::string> tokens;
00607 
00608   std::string whitespace = " \t";
00609 
00610   std::string::size_type begin = s.find_first_not_of(whitespace);
00611   assert(begin != std::string::npos);
00612   while (true) {
00613     std::string::size_type end = s.find_first_of(whitespace, begin);
00614     std::string token;
00615     if (end == std::string::npos) {
00616       token = s.substr(begin);
00617     } else {
00618       token = s.substr(begin, end-begin);
00619     }
00620     tokens.push_back(token);
00621     if (end == std::string::npos) {
00622       break;
00623     }
00624     begin = s.find_first_not_of(whitespace, end);
00625     if (begin == std::string::npos) {
00626       break;
00627     }
00628   }
00629 
00630   return tokens;
00631 }
00632 
00633 void ExtractGHKM::WriteGlueGrammar(
00634   const std::set<std::string> &labelSet,
00635   const std::map<std::string, int> &topLabelSet,
00636   const std::map<std::string,size_t> &sourceLabels,
00637   const Options &options,
00638   std::ostream &out) const
00639 {
00640   // choose a top label that is not already a label
00641   std::string topLabel = "QQQQQQ";
00642   for(size_t i = 1; i <= topLabel.length(); i++) {
00643     if (labelSet.find(topLabel.substr(0,i)) == labelSet.end() ) {
00644       topLabel = topLabel.substr(0,i);
00645       break;
00646     }
00647   }
00648 
00649   const size_t sourceLabelGlueTop = 0;
00650   const size_t sourceLabelGlueX = 1;
00651   const size_t sourceLabelSentenceStart = 2;
00652   const size_t sourceLabelSentenceEnd = 3;
00653 //  const size_t partOfSpeechSentenceStart = 0;
00654 //  const size_t partOfSpeechSentenceEnd = 1;
00655 
00656 #ifndef BOS_
00657 #define BOS_ "<s>" //Beginning of sentence symbol
00658 #endif
00659 #ifndef EOS_
00660 #define EOS_ "</s>" //End of sentence symbol
00661 #endif
00662 
00663   std::string sentenceStartSource = BOS_;
00664   std::string sentenceEndSource   = EOS_;
00665   std::string sentenceStartTarget = BOS_;
00666   std::string sentenceEndTarget   = EOS_;
00667   if (options.partsOfSpeech) {
00668     sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
00669     sentenceEndTarget   = sentenceEndTarget   + "|" + EOS_;
00670   }
00671   if (options.partsOfSpeechFactor) {
00672     sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
00673     sentenceEndTarget   = sentenceEndTarget   + "|" + EOS_;
00674   }
00675 
00676   // basic rules
00677   out << sentenceStartSource << " [X] ||| " << sentenceStartTarget << " [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
00678   if (options.treeFragments) {
00679     out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
00680   }
00681 //  if (options.partsOfSpeech) {
00682 //    out << " {{POS " << partOfSpeechSentenceStart << "}}";
00683 //  }
00684   if (options.sourceLabels) {
00685     out << " {{SourceLabels 2 1 " << sourceLabelSentenceStart << " 1 1 " << sourceLabelGlueTop << " 1}}";
00686   }
00687   if (options.phraseOrientation) {
00688     out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00689   }
00690   out << std::endl;
00691 
00692   out << "[X][" << topLabel << "] " << sentenceEndSource << " [X] ||| [X][" << topLabel << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 ||| ||| |||";
00693   if (options.treeFragments) {
00694     out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
00695   }
00696 //  if (options.partsOfSpeech) {
00697 //    out << " {{POS " << partOfSpeechSentenceEnd << "}}";
00698 //  }
00699   if (options.sourceLabels) {
00700     out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueTop << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
00701   }
00702   if (options.phraseOrientation) {
00703     out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00704   }
00705   out << std::endl;
00706 
00707   // top rules
00708   for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
00709        i != topLabelSet.end(); ++i) {
00710     out << sentenceStartSource << " [X][" << i->first << "] " << sentenceEndSource << " [X] ||| " << sentenceStartTarget << " [X][" << i->first << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2 ||| ||| |||";
00711     if (options.treeFragments) {
00712       out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
00713     }
00714 //    if (options.partsOfSpeech) {
00715 //      out << " {{POS " << partOfSpeechSentenceStart << " " << partOfSpeechSentenceEnd << "}}";
00716 //    }
00717     if (options.sourceLabels) {
00718       out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueX << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
00719     }
00720     if (options.phraseOrientation) {
00721       out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00722     }
00723     out << std::endl;
00724   }
00725 
00726   // glue rules
00727   for(std::set<std::string>::const_iterator i = labelSet.begin();
00728       i != labelSet.end(); i++ ) {
00729     out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
00730     if (options.treeFragments) {
00731       out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
00732     }
00733     if (options.sourceLabels) {
00734       out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
00735     }
00736     if (options.phraseOrientation) {
00737       out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00738     }
00739     out << std::endl;
00740   }
00741 
00742   // glue rule for unknown word...
00743   out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
00744   if (options.treeFragments) {
00745     out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
00746   }
00747   if (options.sourceLabels) {
00748     out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
00749   }
00750   if (options.phraseOrientation) {
00751     out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00752   }
00753   out << std::endl;
00754 }
00755 
00756 void ExtractGHKM::WriteSourceLabelSet(
00757   const std::map<std::string,size_t> &sourceLabels,
00758   std::ostream &out) const
00759 {
00760   out << sourceLabels.size() << std::endl;
00761   for (std::map<std::string,size_t>::const_iterator iter=sourceLabels.begin();
00762        iter!=sourceLabels.end(); ++iter) {
00763     out << iter->first << " " << iter->second << std::endl;
00764   }
00765 }
00766 
00767 void ExtractGHKM::CollectWordLabelCounts(
00768   SyntaxTree &root,
00769   const Options &options,
00770   std::map<std::string, int> &wordCount,
00771   std::map<std::string, std::string> &wordLabel)
00772 {
00773   for (SyntaxTree::ConstLeafIterator p(root);
00774        p != SyntaxTree::ConstLeafIterator(); ++p) {
00775     const SyntaxTree &leaf = *p;
00776     const std::string &word = leaf.value().label;
00777     const SyntaxTree *ancestor = leaf.parent();
00778     // If unary rule elimination is enabled and this word is at the end of a
00779     // chain of unary rewrites, e.g.
00780     //    PN-SB -> NE -> word
00781     // then record the constituent label at the top of the chain instead of
00782     // the part-of-speech label.
00783     while (!options.allowUnary &&
00784            ancestor->parent() &&
00785            ancestor->parent()->children().size() == 1) {
00786       ancestor = ancestor->parent();
00787     }
00788     const std::string &label = ancestor->value().label;
00789     ++wordCount[word];
00790     wordLabel[word] = label;
00791   }
00792 }
00793 
00794 std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
00795 {
00796   std::vector<std::string> tokens;
00797   for (SyntaxTree::ConstLeafIterator p(root);
00798        p != SyntaxTree::ConstLeafIterator(); ++p) {
00799     const SyntaxTree &leaf = *p;
00800     const std::string &word = leaf.value().label;
00801     tokens.push_back(word);
00802   }
00803   return tokens;
00804 }
00805 
00806 void ExtractGHKM::WriteUnknownWordLabel(
00807   const std::map<std::string, int> &wordCount,
00808   const std::map<std::string, std::string> &wordLabel,
00809   const Options &options,
00810   std::ostream &out,
00811   bool writeCounts) const
00812 {
00813   if (!options.unknownWordSoftMatchesFile.empty()) {
00814     out << "UNK 1" << std::endl;
00815     return;
00816   }
00817 
00818   std::map<std::string, int> labelCount;
00819   int total = 0;
00820   for (std::map<std::string, int>::const_iterator p = wordCount.begin();
00821        p != wordCount.end(); ++p) {
00822     // Only consider singletons.
00823     if (p->second == 1) {
00824       std::map<std::string, std::string>::const_iterator q =
00825         wordLabel.find(p->first);
00826       assert(q != wordLabel.end());
00827       if (options.stripBitParLabels) {
00828         size_t pos = q->second.find('-');
00829         if (pos == std::string::npos) {
00830           ++labelCount[q->second];
00831         } else {
00832           ++labelCount[q->second.substr(0,pos)];
00833         }
00834       } else {
00835         ++labelCount[q->second];
00836       }
00837       ++total;
00838     }
00839   }
00840   if ( writeCounts ) {
00841     for (std::map<std::string, int>::const_iterator p = labelCount.begin();
00842          p != labelCount.end(); ++p) {
00843       out << p->first << " " << p->second << std::endl;
00844     }
00845   } else {
00846     for (std::map<std::string, int>::const_iterator p = labelCount.begin();
00847          p != labelCount.end(); ++p) {
00848       double ratio = static_cast<double>(p->second) / static_cast<double>(total);
00849       if (ratio >= options.unknownWordMinRelFreq) {
00850         float weight = options.unknownWordUniform ? 1.0f : ratio;
00851         out << p->first << " " << weight << std::endl;
00852       }
00853     }
00854   }
00855 }
00856 
00857 void ExtractGHKM::WriteUnknownWordSoftMatches(
00858   const std::set<std::string> &labelSet,
00859   std::ostream &out) const
00860 {
00861   for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) {
00862     std::string label = *p;
00863     out << "UNK " << label << std::endl;
00864   }
00865 }
00866 
00867 void ExtractGHKM::StripBitParLabels(
00868   const std::set<std::string> &labelSet,
00869   const std::map<std::string, int> &topLabelSet,
00870   std::set<std::string> &outLabelSet,
00871   std::map<std::string, int> &outTopLabelSet) const
00872 {
00873   for (std::set<std::string>::const_iterator it=labelSet.begin();
00874        it!=labelSet.end(); ++it) {
00875     size_t pos = it->find('-');
00876     if (pos == std::string::npos) {
00877       outLabelSet.insert(*it);
00878     } else {
00879       outLabelSet.insert(it->substr(0,pos));
00880     }
00881   }
00882   for (std::map<std::string,int>::const_iterator it=topLabelSet.begin();
00883        it!=topLabelSet.end(); ++it) {
00884     size_t pos = it->first.find('-');
00885     std::string stripped;
00886     if (pos == std::string::npos) {
00887       stripped = it->first;
00888     } else {
00889       stripped = it->first.substr(0,pos);
00890     }
00891     std::map<std::string, int>::iterator found=outTopLabelSet.find(stripped);
00892     if (found != outTopLabelSet.end()) {
00893       found->second += it->second;
00894     } else {
00895       outTopLabelSet.insert(std::pair<std::string,int>(stripped,it->second));
00896     }
00897   }
00898 }
00899 
00900 }  // namespace GHKM
00901 }  // namespace Syntax
00902 }  // namespace MosesTraining