00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "ExtractGHKM.h"
00021
00022 #include <cassert>
00023 #include <cstdlib>
00024 #include <fstream>
00025 #include <iostream>
00026 #include <iterator>
00027 #include <string>
00028 #include <sstream>
00029 #include <vector>
00030
00031 #include <boost/program_options.hpp>
00032
00033 #include "syntax-common/exception.h"
00034 #include "syntax-common/xml_tree_parser.h"
00035
00036 #include "InputFileStream.h"
00037 #include "OutputFileStream.h"
00038 #include "SyntaxNode.h"
00039 #include "SyntaxNodeCollection.h"
00040 #include "SyntaxTree.h"
00041 #include "tables-core.h"
00042 #include "XmlException.h"
00043 #include "XmlTree.h"
00044
00045 #include "Alignment.h"
00046 #include "AlignmentGraph.h"
00047 #include "Node.h"
00048 #include "Options.h"
00049 #include "PhraseOrientation.h"
00050 #include "ScfgRule.h"
00051 #include "ScfgRuleWriter.h"
00052 #include "Span.h"
00053 #include "StsgRule.h"
00054 #include "StsgRuleWriter.h"
00055
00056 namespace MosesTraining
00057 {
00058 namespace Syntax
00059 {
00060 namespace GHKM
00061 {
00062
00063 int ExtractGHKM::Main(int argc, char *argv[])
00064 {
00065 using Moses::InputFileStream;
00066 using Moses::OutputFileStream;
00067
00068
00069 Options options;
00070 ProcessOptions(argc, argv, options);
00071
00072
00073
00074
00075
00076
00077
00078
00079 std::string effectiveTargetFile = options.t2s ? options.sourceFile
00080 : options.targetFile;
00081 std::string effectiveSourceFile = options.t2s ? options.targetFile
00082 : options.sourceFile;
00083 InputFileStream targetStream(effectiveTargetFile);
00084 InputFileStream sourceStream(effectiveSourceFile);
00085 InputFileStream alignmentStream(options.alignmentFile);
00086
00087
00088 OutputFileStream fwdExtractStream;
00089 OutputFileStream invExtractStream;
00090 OutputFileStream glueGrammarStream;
00091 OutputFileStream targetUnknownWordStream;
00092 OutputFileStream sourceUnknownWordStream;
00093 OutputFileStream sourceLabelSetStream;
00094 OutputFileStream unknownWordSoftMatchesStream;
00095
00096 std::string fwdFileName = options.extractFile;
00097 std::string invFileName = options.extractFile + std::string(".inv");
00098 if (options.gzOutput) {
00099 fwdFileName += ".gz";
00100 invFileName += ".gz";
00101 }
00102 OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
00103 OpenOutputFileOrDie(invFileName, invExtractStream);
00104
00105 if (!options.glueGrammarFile.empty()) {
00106 OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
00107 }
00108 if (!options.targetUnknownWordFile.empty()) {
00109 OpenOutputFileOrDie(options.targetUnknownWordFile, targetUnknownWordStream);
00110 }
00111 if (!options.sourceUnknownWordFile.empty()) {
00112 OpenOutputFileOrDie(options.sourceUnknownWordFile, sourceUnknownWordStream);
00113 }
00114 if (!options.sourceLabelSetFile.empty()) {
00115 if (!options.sourceLabels) {
00116 Error("SourceLabels should be active if SourceLabelSet is supposed to be written to a file");
00117 }
00118 OpenOutputFileOrDie(options.sourceLabelSetFile, sourceLabelSetStream);
00119 }
00120 if (!options.unknownWordSoftMatchesFile.empty()) {
00121 OpenOutputFileOrDie(options.unknownWordSoftMatchesFile, unknownWordSoftMatchesStream);
00122 }
00123
00124
00125 std::map<std::string, int> targetWordCount;
00126 std::map<std::string, std::string> targetWordLabel;
00127
00128
00129 std::map<std::string, int> sourceWordCount;
00130 std::map<std::string, std::string> sourceWordLabel;
00131
00132 std::string targetLine;
00133 std::string sourceLine;
00134 std::string alignmentLine;
00135 Alignment alignment;
00136 XmlTreeParser targetXmlTreeParser;
00137 XmlTreeParser sourceXmlTreeParser;
00138 ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options);
00139 StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options);
00140 size_t lineNum = options.sentenceOffset;
00141 while (true) {
00142 std::getline(targetStream, targetLine);
00143 std::getline(sourceStream, sourceLine);
00144 std::getline(alignmentStream, alignmentLine);
00145
00146 if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) {
00147 break;
00148 }
00149
00150 if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) {
00151 Error("Files must contain same number of lines");
00152 }
00153
00154 ++lineNum;
00155
00156
00157 if (targetLine.size() == 0) {
00158 std::cerr << "skipping line " << lineNum << " with empty target tree\n";
00159 continue;
00160 }
00161 std::auto_ptr<SyntaxTree> targetParseTree;
00162 try {
00163 targetParseTree = targetXmlTreeParser.Parse(targetLine);
00164 assert(targetParseTree.get());
00165 } catch (const Exception &e) {
00166 std::ostringstream oss;
00167 oss << "Failed to parse target XML tree at line " << lineNum;
00168 if (!e.msg().empty()) {
00169 oss << ": " << e.msg();
00170 }
00171 Error(oss.str());
00172 }
00173
00174
00175 std::vector<std::string> sourceTokens;
00176 std::auto_ptr<SyntaxTree> sourceParseTree;
00177 if (!options.sourceLabels) {
00178 sourceTokens = ReadTokens(sourceLine);
00179 } else {
00180 try {
00181 sourceParseTree = sourceXmlTreeParser.Parse(sourceLine);
00182 assert(sourceParseTree.get());
00183 } catch (const Exception &e) {
00184 std::ostringstream oss;
00185 oss << "Failed to parse source XML tree at line " << lineNum;
00186 if (!e.msg().empty()) {
00187 oss << ": " << e.msg();
00188 }
00189 Error(oss.str());
00190 }
00191 sourceTokens = sourceXmlTreeParser.words();
00192 }
00193
00194
00195 try {
00196 ReadAlignment(alignmentLine, alignment);
00197 } catch (const Exception &e) {
00198 std::ostringstream oss;
00199 oss << "Failed to read alignment at line " << lineNum << ": ";
00200 oss << e.msg();
00201 Error(oss.str());
00202 }
00203 if (alignment.size() == 0) {
00204 std::cerr << "skipping line " << lineNum << " without alignment points\n";
00205 continue;
00206 }
00207 if (options.t2s) {
00208 FlipAlignment(alignment);
00209 }
00210
00211
00212 if (!options.targetUnknownWordFile.empty()) {
00213 CollectWordLabelCounts(*targetParseTree, options, targetWordCount,
00214 targetWordLabel);
00215 }
00216
00217
00218 if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
00219 CollectWordLabelCounts(*sourceParseTree, options, sourceWordCount,
00220 sourceWordLabel);
00221 }
00222
00223
00224
00225 AlignmentGraph graph(targetParseTree.get(), sourceTokens, alignment);
00226
00227
00228 graph.ExtractMinimalRules(options);
00229
00230
00231 if (!options.minimal) {
00232 graph.ExtractComposedRules(options);
00233 }
00234
00235
00236 PhraseOrientation phraseOrientation(sourceTokens.size(),
00237 targetXmlTreeParser.words().size(), alignment);
00238
00239
00240 const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
00241 for (std::vector<Node *>::const_iterator p = targetNodes.begin();
00242 p != targetNodes.end(); ++p) {
00243
00244 const std::vector<const Subgraph *> &rules = (*p)->GetRules();
00245
00246 PhraseOrientation::REO_CLASS l2rOrientation=PhraseOrientation::REO_CLASS_UNKNOWN, r2lOrientation=PhraseOrientation::REO_CLASS_UNKNOWN;
00247 if (options.phraseOrientation && !rules.empty()) {
00248 int sourceSpanBegin = *((*p)->GetSpan().begin());
00249 int sourceSpanEnd = *((*p)->GetSpan().rbegin());
00250 l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_L2R);
00251 r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,PhraseOrientation::REO_DIR_R2L);
00252
00253
00254 }
00255
00256 for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
00257 q != rules.end(); ++q) {
00258
00259 if (options.stsg) {
00260 StsgRule rule(**q);
00261 if (rule.Scope() <= options.maxScope) {
00262 stsgWriter.Write(rule);
00263 }
00264 continue;
00265 }
00266
00267 ScfgRule *r = 0;
00268 if (options.sourceLabels) {
00269 r = new ScfgRule(**q, &sourceXmlTreeParser.node_collection());
00270 } else {
00271 r = new ScfgRule(**q);
00272 }
00273
00274 if (r->Scope() <= options.maxScope) {
00275 scfgWriter.Write(*r,lineNum,false);
00276 if (options.treeFragments) {
00277 fwdExtractStream << " {{Tree ";
00278 (*q)->PrintTree(fwdExtractStream);
00279 fwdExtractStream << "}}";
00280 }
00281 if (options.partsOfSpeech) {
00282 fwdExtractStream << " {{POS";
00283 (*q)->PrintPartsOfSpeech(fwdExtractStream);
00284 fwdExtractStream << "}}";
00285 }
00286 if (options.phraseOrientation) {
00287 fwdExtractStream << " {{Orientation ";
00288 phraseOrientation.WriteOrientation(fwdExtractStream,l2rOrientation);
00289 fwdExtractStream << " ";
00290 phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation);
00291 fwdExtractStream << "}}";
00292 phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_L2R,l2rOrientation,1);
00293 phraseOrientation.IncrementPriorCount(PhraseOrientation::REO_DIR_R2L,r2lOrientation,1);
00294 }
00295 fwdExtractStream << std::endl;
00296 invExtractStream << std::endl;
00297 }
00298 delete r;
00299 }
00300 }
00301 }
00302
00303 if (options.phraseOrientation) {
00304 std::string phraseOrientationPriorsFileName = options.extractFile + std::string(".phraseOrientationPriors");
00305 OutputFileStream phraseOrientationPriorsStream;
00306 OpenOutputFileOrDie(phraseOrientationPriorsFileName, phraseOrientationPriorsStream);
00307 PhraseOrientation::WritePriorCounts(phraseOrientationPriorsStream);
00308 }
00309
00310 std::map<std::string,size_t> sourceLabels;
00311 if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
00312 std::set<std::string> extendedLabelSet = sourceXmlTreeParser.label_set();
00313 extendedLabelSet.insert("XLHS");
00314 extendedLabelSet.insert("XRHS");
00315 extendedLabelSet.insert("TOPLABEL");
00316 extendedLabelSet.insert("SOMELABEL");
00317 size_t index = 0;
00318 for (std::set<std::string>::const_iterator iter=extendedLabelSet.begin();
00319 iter!=extendedLabelSet.end(); ++iter, ++index) {
00320 sourceLabels.insert(std::pair<std::string,size_t>(*iter,index));
00321 }
00322 WriteSourceLabelSet(sourceLabels, sourceLabelSetStream);
00323 }
00324
00325 std::set<std::string> strippedTargetLabelSet;
00326 std::map<std::string, int> strippedTargetTopLabelSet;
00327 if (options.stripBitParLabels &&
00328 (!options.glueGrammarFile.empty() || !options.unknownWordSoftMatchesFile.empty())) {
00329 StripBitParLabels(targetXmlTreeParser.label_set(),
00330 targetXmlTreeParser.top_label_set(),
00331 strippedTargetLabelSet, strippedTargetTopLabelSet);
00332 }
00333
00334 if (!options.glueGrammarFile.empty()) {
00335 if (options.stripBitParLabels) {
00336 WriteGlueGrammar(strippedTargetLabelSet, strippedTargetTopLabelSet, sourceLabels, options, glueGrammarStream);
00337 } else {
00338 WriteGlueGrammar(targetXmlTreeParser.label_set(),
00339 targetXmlTreeParser.top_label_set(),
00340 sourceLabels, options, glueGrammarStream);
00341 }
00342 }
00343
00344 if (!options.targetUnknownWordFile.empty()) {
00345 WriteUnknownWordLabel(targetWordCount, targetWordLabel, options, targetUnknownWordStream);
00346 }
00347
00348 if (options.sourceLabels && !options.sourceUnknownWordFile.empty()) {
00349 WriteUnknownWordLabel(sourceWordCount, sourceWordLabel, options, sourceUnknownWordStream, true);
00350 }
00351
00352 if (!options.unknownWordSoftMatchesFile.empty()) {
00353 if (options.stripBitParLabels) {
00354 WriteUnknownWordSoftMatches(strippedTargetLabelSet, unknownWordSoftMatchesStream);
00355 } else {
00356 WriteUnknownWordSoftMatches(targetXmlTreeParser.label_set(),
00357 unknownWordSoftMatchesStream);
00358 }
00359 }
00360
00361 return 0;
00362 }
00363
00364 void ExtractGHKM::ProcessOptions(int argc, char *argv[],
00365 Options &options) const
00366 {
00367 namespace po = boost::program_options;
00368 namespace cls = boost::program_options::command_line_style;
00369
00370
00371
00372 std::ostringstream usageTop;
00373 usageTop << "Usage: " << name()
00374 << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n"
00375 << "SCFG rule extractor based on the GHKM algorithm described in\n"
00376 << "Galley et al. (2004).\n\n"
00377 << "Options";
00378
00379
00380 std::ostringstream usageBottom;
00381 usageBottom << "\nImplementation Notes:\n"
00382 << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n"
00383 << "\n"
00384 << "For the composed rule constraints: rule depth is the "
00385 "maximum distance from the\nrule's root node to a sink "
00386 "node, not counting preterminal expansions or word\n"
00387 "alignments. Rule size is the measure defined in DeNeefe "
00388 "et al (2007): the\nnumber of non-part-of-speech, non-leaf "
00389 "constituent labels in the target tree.\nNode count is the "
00390 "number of target tree nodes (excluding target words).\n"
00391 << "\n"
00392 << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n"
00393 << "\n"
00394 << "Unaligned source words are attached to the tree using the "
00395 "following heuristic:\nif there are aligned source words to "
00396 "both the left and the right of an unaligned\nsource word "
00397 "then it is attached to the lowest common ancestor of its "
00398 "nearest\nsuch left and right neighbours. Otherwise, it is "
00399 "attached to the root of the\nparse tree.\n"
00400 << "\n"
00401 << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. (2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n"
00402 << "\n"
00403 << "References:\n"
00404 << "Galley, M., Hopkins, M., Knight, K., and Marcu, D. (2004)\n"
00405 << "\"What's in a Translation Rule?\", In Proceedings of HLT/NAACL 2004.\n"
00406 << "\n"
00407 << "DeNeefe, S., Knight, K., Wang, W., and Marcu, D. (2007)\n"
00408 << "\"What Can Syntax-Based MT Learn from Phrase-Based MT?\", In Proceedings of\nEMNLP-CoNLL 2007.\n"
00409 << "\n"
00410 << "Hopkins, M. and Langmead, G. (2010)\n"
00411 << "\"SCFG Decoding Without Binarization\", In Proceedings of EMNLP 2010.\n"
00412 << "\n"
00413 << "Chung, T. and Fang, L. and Gildea, D. (2011)\n"
00414 << "\"Issues Concerning Decoding with Synchronous Context-free Grammar\", In\nProceedings of ACL/HLT 2011.";
00415
00416
00417 po::options_description visible(usageTop.str());
00418 visible.add_options()
00419
00420 ("AllowUnary",
00421 "allow fully non-lexical unary rules")
00422 ("ConditionOnTargetLHS",
00423 "write target LHS instead of \"X\" as source LHS")
00424 ("GlueGrammar",
00425 po::value(&options.glueGrammarFile),
00426 "write glue grammar to named file")
00427 ("GZOutput",
00428 "write gzipped extract files")
00429 ("IncludeSentenceId",
00430 "include sentence ID")
00431 ("MaxNodes",
00432 po::value(&options.maxNodes)->default_value(options.maxNodes),
00433 "set maximum number of tree nodes for composed rules")
00434 ("MaxRuleDepth",
00435 po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
00436 "set maximum depth for composed rules")
00437 ("MaxRuleSize",
00438 po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
00439 "set maximum size for composed rules")
00440 ("MaxScope",
00441 po::value(&options.maxScope)->default_value(options.maxScope),
00442 "set maximum allowed scope")
00443 ("Minimal",
00444 "extract minimal rules only")
00445 ("PartsOfSpeech",
00446 "output parts-of-speech as property (preterminals from the parse tree)")
00447 ("PartsOfSpeechFactor",
00448 "output parts-of-speech as factor (preterminals from the parse tree)")
00449 ("PCFG",
00450 "include score based on PCFG scores in target corpus")
00451 ("PhraseOrientation",
00452 "output phrase orientation information")
00453 ("StripBitParLabels",
00454 "strip suffix starting with a hyphen symbol (\"-\") from non-terminal labels")
00455 ("STSG",
00456 "output STSG rules (default is SCFG)")
00457 ("T2S",
00458 "enable tree-to-string rule extraction (string-to-tree is assumed by default)")
00459 ("TreeFragments",
00460 "output parse tree information")
00461 ("SourceLabels",
00462 "output source syntax label information")
00463 ("SourceLabelSet",
00464 po::value(&options.sourceLabelSetFile),
00465 "write source syntax label set to named file")
00466 ("SentenceOffset",
00467 po::value(&options.sentenceOffset)->default_value(options.sentenceOffset),
00468 "set sentence number offset if processing split corpus")
00469 ("UnknownWordLabel",
00470 po::value(&options.targetUnknownWordFile),
00471 "write unknown word labels to named file")
00472 ("SourceUnknownWordLabel",
00473 po::value(&options.sourceUnknownWordFile),
00474 "write source syntax unknown word labels to named file")
00475 ("UnknownWordMinRelFreq",
00476 po::value(&options.unknownWordMinRelFreq)->default_value(
00477 options.unknownWordMinRelFreq),
00478 "set minimum relative frequency for unknown word labels")
00479 ("UnknownWordSoftMatches",
00480 po::value(&options.unknownWordSoftMatchesFile),
00481 "write dummy value to unknown word label file, and mappings from dummy value to other labels to named file")
00482 ("UnknownWordUniform",
00483 "write uniform weights to unknown word label file")
00484 ("UnpairedExtractFormat",
00485 "do not pair non-terminals in extract files")
00486 ;
00487
00488
00489
00490 po::options_description hidden("Hidden options");
00491 hidden.add_options()
00492 ("TargetFile",
00493 po::value(&options.targetFile),
00494 "target file")
00495 ("SourceFile",
00496 po::value(&options.sourceFile),
00497 "source file")
00498 ("AlignmentFile",
00499 po::value(&options.alignmentFile),
00500 "alignment file")
00501 ("ExtractFile",
00502 po::value(&options.extractFile),
00503 "extract file")
00504 ;
00505
00506
00507 po::options_description cmdLineOptions;
00508 cmdLineOptions.add(visible).add(hidden);
00509
00510
00511 po::positional_options_description p;
00512 p.add("TargetFile", 1);
00513 p.add("SourceFile", 1);
00514 p.add("AlignmentFile", 1);
00515 p.add("ExtractFile", 1);
00516
00517
00518 po::variables_map vm;
00519 try {
00520 po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
00521 options(cmdLineOptions).positional(p).run(), vm);
00522 po::notify(vm);
00523 } catch (const std::exception &e) {
00524 std::ostringstream msg;
00525 msg << e.what() << "\n\n" << visible << usageBottom.str();
00526 Error(msg.str());
00527 }
00528
00529 if (vm.count("help")) {
00530 std::cout << visible << usageBottom.str() << std::endl;
00531 std::exit(0);
00532 }
00533
00534
00535 if (!vm.count("TargetFile") ||
00536 !vm.count("SourceFile") ||
00537 !vm.count("AlignmentFile") ||
00538 !vm.count("ExtractFile")) {
00539 std::ostringstream msg;
00540 std::cerr << visible << usageBottom.str() << std::endl;
00541 std::exit(1);
00542 }
00543
00544
00545 if (vm.count("AllowUnary")) {
00546 options.allowUnary = true;
00547 }
00548 if (vm.count("ConditionOnTargetLHS")) {
00549 options.conditionOnTargetLhs = true;
00550 }
00551 if (vm.count("GZOutput")) {
00552 options.gzOutput = true;
00553 }
00554 if (vm.count("IncludeSentenceId")) {
00555 options.includeSentenceId = true;
00556 }
00557 if (vm.count("Minimal")) {
00558 options.minimal = true;
00559 }
00560 if (vm.count("PartsOfSpeech")) {
00561 options.partsOfSpeech = true;
00562 }
00563 if (vm.count("PartsOfSpeechFactor")) {
00564 options.partsOfSpeechFactor = true;
00565 }
00566 if (vm.count("PCFG")) {
00567 options.pcfg = true;
00568 }
00569 if (vm.count("PhraseOrientation")) {
00570 options.phraseOrientation = true;
00571 }
00572 if (vm.count("StripBitParLabels")) {
00573 options.stripBitParLabels = true;
00574 }
00575 if (vm.count("STSG")) {
00576 options.stsg = true;
00577 }
00578 if (vm.count("T2S")) {
00579 options.t2s = true;
00580 }
00581 if (vm.count("TreeFragments")) {
00582 options.treeFragments = true;
00583 }
00584 if (vm.count("SourceLabels")) {
00585 options.sourceLabels = true;
00586 }
00587 if (vm.count("UnknownWordUniform")) {
00588 options.unknownWordUniform = true;
00589 }
00590 if (vm.count("UnpairedExtractFormat")) {
00591 options.unpairedExtractFormat = true;
00592 }
00593
00594
00595 if (options.sentenceOffset > 0) {
00596 options.targetUnknownWordFile.clear();
00597 }
00598 if (options.sentenceOffset > 0) {
00599 options.sourceUnknownWordFile.clear();
00600 options.unknownWordSoftMatchesFile.clear();
00601 }
00602 }
00603
00604 std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s) const
00605 {
00606 std::vector<std::string> tokens;
00607
00608 std::string whitespace = " \t";
00609
00610 std::string::size_type begin = s.find_first_not_of(whitespace);
00611 assert(begin != std::string::npos);
00612 while (true) {
00613 std::string::size_type end = s.find_first_of(whitespace, begin);
00614 std::string token;
00615 if (end == std::string::npos) {
00616 token = s.substr(begin);
00617 } else {
00618 token = s.substr(begin, end-begin);
00619 }
00620 tokens.push_back(token);
00621 if (end == std::string::npos) {
00622 break;
00623 }
00624 begin = s.find_first_not_of(whitespace, end);
00625 if (begin == std::string::npos) {
00626 break;
00627 }
00628 }
00629
00630 return tokens;
00631 }
00632
00633 void ExtractGHKM::WriteGlueGrammar(
00634 const std::set<std::string> &labelSet,
00635 const std::map<std::string, int> &topLabelSet,
00636 const std::map<std::string,size_t> &sourceLabels,
00637 const Options &options,
00638 std::ostream &out) const
00639 {
00640
00641 std::string topLabel = "QQQQQQ";
00642 for(size_t i = 1; i <= topLabel.length(); i++) {
00643 if (labelSet.find(topLabel.substr(0,i)) == labelSet.end() ) {
00644 topLabel = topLabel.substr(0,i);
00645 break;
00646 }
00647 }
00648
00649 const size_t sourceLabelGlueTop = 0;
00650 const size_t sourceLabelGlueX = 1;
00651 const size_t sourceLabelSentenceStart = 2;
00652 const size_t sourceLabelSentenceEnd = 3;
00653
00654
00655
00656 #ifndef BOS_
00657 #define BOS_ "<s>" //Beginning of sentence symbol
00658 #endif
00659 #ifndef EOS_
00660 #define EOS_ "</s>" //End of sentence symbol
00661 #endif
00662
00663 std::string sentenceStartSource = BOS_;
00664 std::string sentenceEndSource = EOS_;
00665 std::string sentenceStartTarget = BOS_;
00666 std::string sentenceEndTarget = EOS_;
00667 if (options.partsOfSpeech) {
00668 sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
00669 sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
00670 }
00671 if (options.partsOfSpeechFactor) {
00672 sentenceStartTarget = sentenceStartTarget + "|" + BOS_;
00673 sentenceEndTarget = sentenceEndTarget + "|" + EOS_;
00674 }
00675
00676
00677 out << sentenceStartSource << " [X] ||| " << sentenceStartTarget << " [" << topLabel << "] ||| 1 ||| 0-0 ||| ||| |||";
00678 if (options.treeFragments) {
00679 out << " {{Tree [" << topLabel << " [SSTART <s>]]}}";
00680 }
00681
00682
00683
00684 if (options.sourceLabels) {
00685 out << " {{SourceLabels 2 1 " << sourceLabelSentenceStart << " 1 1 " << sourceLabelGlueTop << " 1}}";
00686 }
00687 if (options.phraseOrientation) {
00688 out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00689 }
00690 out << std::endl;
00691
00692 out << "[X][" << topLabel << "] " << sentenceEndSource << " [X] ||| [X][" << topLabel << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 ||| ||| |||";
00693 if (options.treeFragments) {
00694 out << " {{Tree [" << topLabel << " [" << topLabel << "] [SEND </s>]]}}";
00695 }
00696
00697
00698
00699 if (options.sourceLabels) {
00700 out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueTop << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
00701 }
00702 if (options.phraseOrientation) {
00703 out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00704 }
00705 out << std::endl;
00706
00707
00708 for (std::map<std::string, int>::const_iterator i = topLabelSet.begin();
00709 i != topLabelSet.end(); ++i) {
00710 out << sentenceStartSource << " [X][" << i->first << "] " << sentenceEndSource << " [X] ||| " << sentenceStartTarget << " [X][" << i->first << "] " << sentenceEndTarget << " [" << topLabel << "] ||| 1 ||| 0-0 1-1 2-2 ||| ||| |||";
00711 if (options.treeFragments) {
00712 out << " {{Tree [" << topLabel << " [SSTART <s>] [" << i->first << "] [SEND </s>]]}}";
00713 }
00714
00715
00716
00717 if (options.sourceLabels) {
00718 out << " {{SourceLabels 4 1 " << sourceLabelSentenceStart << " " << sourceLabelGlueX << " " << sourceLabelSentenceEnd << " 1 1 " << sourceLabelGlueTop << " 1}}";
00719 }
00720 if (options.phraseOrientation) {
00721 out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00722 }
00723 out << std::endl;
00724 }
00725
00726
00727 for(std::set<std::string>::const_iterator i = labelSet.begin();
00728 i != labelSet.end(); i++ ) {
00729 out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
00730 if (options.treeFragments) {
00731 out << " {{Tree [" << topLabel << " ["<< topLabel << "] [" << *i << "]]}}";
00732 }
00733 if (options.sourceLabels) {
00734 out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
00735 }
00736 if (options.phraseOrientation) {
00737 out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00738 }
00739 out << std::endl;
00740 }
00741
00742
00743 out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 ||| ||| |||";
00744 if (options.treeFragments) {
00745 out << " {{Tree [" << topLabel << " [" << topLabel << "] [X]]}}";
00746 }
00747 if (options.sourceLabels) {
00748 out << " {{SourceLabels 3 1 " << sourceLabelGlueTop << " " << sourceLabelGlueX << " 1 1 " << sourceLabelGlueTop << " 1}}";
00749 }
00750 if (options.phraseOrientation) {
00751 out << " {{Orientation 1 1 0.5 0.5 1 1 0.5 0.5}}";
00752 }
00753 out << std::endl;
00754 }
00755
00756 void ExtractGHKM::WriteSourceLabelSet(
00757 const std::map<std::string,size_t> &sourceLabels,
00758 std::ostream &out) const
00759 {
00760 out << sourceLabels.size() << std::endl;
00761 for (std::map<std::string,size_t>::const_iterator iter=sourceLabels.begin();
00762 iter!=sourceLabels.end(); ++iter) {
00763 out << iter->first << " " << iter->second << std::endl;
00764 }
00765 }
00766
00767 void ExtractGHKM::CollectWordLabelCounts(
00768 SyntaxTree &root,
00769 const Options &options,
00770 std::map<std::string, int> &wordCount,
00771 std::map<std::string, std::string> &wordLabel)
00772 {
00773 for (SyntaxTree::ConstLeafIterator p(root);
00774 p != SyntaxTree::ConstLeafIterator(); ++p) {
00775 const SyntaxTree &leaf = *p;
00776 const std::string &word = leaf.value().label;
00777 const SyntaxTree *ancestor = leaf.parent();
00778
00779
00780
00781
00782
00783 while (!options.allowUnary &&
00784 ancestor->parent() &&
00785 ancestor->parent()->children().size() == 1) {
00786 ancestor = ancestor->parent();
00787 }
00788 const std::string &label = ancestor->value().label;
00789 ++wordCount[word];
00790 wordLabel[word] = label;
00791 }
00792 }
00793
00794 std::vector<std::string> ExtractGHKM::ReadTokens(const SyntaxTree &root) const
00795 {
00796 std::vector<std::string> tokens;
00797 for (SyntaxTree::ConstLeafIterator p(root);
00798 p != SyntaxTree::ConstLeafIterator(); ++p) {
00799 const SyntaxTree &leaf = *p;
00800 const std::string &word = leaf.value().label;
00801 tokens.push_back(word);
00802 }
00803 return tokens;
00804 }
00805
00806 void ExtractGHKM::WriteUnknownWordLabel(
00807 const std::map<std::string, int> &wordCount,
00808 const std::map<std::string, std::string> &wordLabel,
00809 const Options &options,
00810 std::ostream &out,
00811 bool writeCounts) const
00812 {
00813 if (!options.unknownWordSoftMatchesFile.empty()) {
00814 out << "UNK 1" << std::endl;
00815 return;
00816 }
00817
00818 std::map<std::string, int> labelCount;
00819 int total = 0;
00820 for (std::map<std::string, int>::const_iterator p = wordCount.begin();
00821 p != wordCount.end(); ++p) {
00822
00823 if (p->second == 1) {
00824 std::map<std::string, std::string>::const_iterator q =
00825 wordLabel.find(p->first);
00826 assert(q != wordLabel.end());
00827 if (options.stripBitParLabels) {
00828 size_t pos = q->second.find('-');
00829 if (pos == std::string::npos) {
00830 ++labelCount[q->second];
00831 } else {
00832 ++labelCount[q->second.substr(0,pos)];
00833 }
00834 } else {
00835 ++labelCount[q->second];
00836 }
00837 ++total;
00838 }
00839 }
00840 if ( writeCounts ) {
00841 for (std::map<std::string, int>::const_iterator p = labelCount.begin();
00842 p != labelCount.end(); ++p) {
00843 out << p->first << " " << p->second << std::endl;
00844 }
00845 } else {
00846 for (std::map<std::string, int>::const_iterator p = labelCount.begin();
00847 p != labelCount.end(); ++p) {
00848 double ratio = static_cast<double>(p->second) / static_cast<double>(total);
00849 if (ratio >= options.unknownWordMinRelFreq) {
00850 float weight = options.unknownWordUniform ? 1.0f : ratio;
00851 out << p->first << " " << weight << std::endl;
00852 }
00853 }
00854 }
00855 }
00856
00857 void ExtractGHKM::WriteUnknownWordSoftMatches(
00858 const std::set<std::string> &labelSet,
00859 std::ostream &out) const
00860 {
00861 for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) {
00862 std::string label = *p;
00863 out << "UNK " << label << std::endl;
00864 }
00865 }
00866
00867 void ExtractGHKM::StripBitParLabels(
00868 const std::set<std::string> &labelSet,
00869 const std::map<std::string, int> &topLabelSet,
00870 std::set<std::string> &outLabelSet,
00871 std::map<std::string, int> &outTopLabelSet) const
00872 {
00873 for (std::set<std::string>::const_iterator it=labelSet.begin();
00874 it!=labelSet.end(); ++it) {
00875 size_t pos = it->find('-');
00876 if (pos == std::string::npos) {
00877 outLabelSet.insert(*it);
00878 } else {
00879 outLabelSet.insert(it->substr(0,pos));
00880 }
00881 }
00882 for (std::map<std::string,int>::const_iterator it=topLabelSet.begin();
00883 it!=topLabelSet.end(); ++it) {
00884 size_t pos = it->first.find('-');
00885 std::string stripped;
00886 if (pos == std::string::npos) {
00887 stripped = it->first;
00888 } else {
00889 stripped = it->first.substr(0,pos);
00890 }
00891 std::map<std::string, int>::iterator found=outTopLabelSet.find(stripped);
00892 if (found != outTopLabelSet.end()) {
00893 found->second += it->second;
00894 } else {
00895 outTopLabelSet.insert(std::pair<std::string,int>(stripped,it->second));
00896 }
00897 }
00898 }
00899
00900 }
00901 }
00902 }