00001 #include "FilterRuleTable.h"
00002
00003 #include <cassert>
00004 #include <cstdlib>
00005 #include <fstream>
00006 #include <iostream>
00007 #include <iterator>
00008 #include <string>
00009 #include <sstream>
00010 #include <vector>
00011
00012 #include <boost/make_shared.hpp>
00013 #include <boost/program_options.hpp>
00014
00015 #include "syntax-common/exception.h"
00016 #include "syntax-common/xml_tree_parser.h"
00017
00018 #include "InputFileStream.h"
00019
00020 #include "ForestTsgFilter.h"
00021 #include "Options.h"
00022 #include "StringCfgFilter.h"
00023 #include "StringForest.h"
00024 #include "StringForestParser.h"
00025 #include "TreeCfgFilter.h"
00026 #include "TreeTsgFilter.h"
00027
00028 namespace MosesTraining
00029 {
00030 namespace Syntax
00031 {
00032 namespace FilterRuleTable
00033 {
00034
00035 int FilterRuleTable::Main(int argc, char *argv[])
00036 {
00037 enum TestSentenceFormat {
00038 kUnknownTestSentenceFormat,
00039 kString,
00040 kTree,
00041 kForest
00042 };
00043
00044 enum SourceSideRuleFormat {
00045 kUnknownSourceSideRuleFormat,
00046 kCfg,
00047 kTsg
00048 };
00049
00050
00051 Options options;
00052 ProcessOptions(argc, argv, options);
00053
00054
00055 Moses::InputFileStream testStream(options.testSetFile);
00056
00057
00058
00059 TestSentenceFormat testSentenceFormat = kUnknownTestSentenceFormat;
00060 SourceSideRuleFormat sourceSideRuleFormat = kUnknownSourceSideRuleFormat;
00061 if (options.model == "hierarchical" || options.model == "s2t") {
00062 testSentenceFormat = kString;
00063 sourceSideRuleFormat = kCfg;
00064 } else if (options.model == "t2s") {
00065 testSentenceFormat = kTree;
00066 sourceSideRuleFormat = kTsg;
00067 } else if (options.model == "t2s-scfg") {
00068 testSentenceFormat = kTree;
00069 sourceSideRuleFormat = kCfg;
00070 } else if (options.model == "f2s") {
00071 testSentenceFormat = kForest;
00072 sourceSideRuleFormat = kTsg;
00073 } else {
00074 Error(std::string("unsupported model type: ") + options.model);
00075 }
00076
00077
00078 if (testSentenceFormat == kString) {
00079 assert(sourceSideRuleFormat == kCfg);
00080 std::vector<boost::shared_ptr<std::string> > testStrings;
00081 ReadTestSet(testStream, testStrings);
00082 StringCfgFilter filter(testStrings);
00083 filter.Filter(std::cin, std::cout);
00084 } else if (testSentenceFormat == kTree) {
00085 std::vector<boost::shared_ptr<SyntaxTree> > testTrees;
00086 ReadTestSet(testStream, testTrees);
00087 if (sourceSideRuleFormat == kCfg) {
00088
00089 Warn("tree/cfg filtering algorithm not implemented: input will be copied unchanged to output");
00090 TreeCfgFilter filter(testTrees);
00091 filter.Filter(std::cin, std::cout);
00092 } else if (sourceSideRuleFormat == kTsg) {
00093 TreeTsgFilter filter(testTrees);
00094 filter.Filter(std::cin, std::cout);
00095 } else {
00096 assert(false);
00097 }
00098 } else if (testSentenceFormat == kForest) {
00099 std::vector<boost::shared_ptr<StringForest> > testForests;
00100 ReadTestSet(testStream, testForests);
00101 assert(sourceSideRuleFormat == kTsg);
00102 ForestTsgFilter filter(testForests);
00103 filter.Filter(std::cin, std::cout);
00104 }
00105
00106 return 0;
00107 }
00108
00109 void FilterRuleTable::ReadTestSet(
00110 std::istream &input,
00111 std::vector<boost::shared_ptr<std::string> > &sentences)
00112 {
00113 int lineNum = 0;
00114 std::string line;
00115 while (std::getline(input, line)) {
00116 ++lineNum;
00117 if (line.empty()) {
00118 std::cerr << "skipping blank test sentence at line " << lineNum
00119 << std::endl;
00120 continue;
00121 }
00122 sentences.push_back(boost::make_shared<std::string>(line));
00123 }
00124 }
00125
00126 void FilterRuleTable::ReadTestSet(
00127 std::istream &input, std::vector<boost::shared_ptr<SyntaxTree> > &sentences)
00128 {
00129 XmlTreeParser parser;
00130 int lineNum = 0;
00131 std::string line;
00132 while (std::getline(input, line)) {
00133 ++lineNum;
00134 if (line.empty()) {
00135 std::cerr << "skipping blank test sentence at line " << lineNum
00136 << std::endl;
00137 continue;
00138 }
00139 sentences.push_back(
00140 boost::shared_ptr<SyntaxTree>(parser.Parse(line).release()));
00141 }
00142 }
00143
00144 void FilterRuleTable::ReadTestSet(
00145 std::istream &input,
00146 std::vector<boost::shared_ptr<StringForest> > &sentences)
00147 {
00148 StringForestParser end;
00149 int sentNum = 0;
00150 for (StringForestParser p(input); p != end; ++p) {
00151 ++sentNum;
00152 if (p->forest->vertices.empty()) {
00153 std::cerr << "skipping sentence " << sentNum << ": forest is empty"
00154 << std::endl;
00155 continue;
00156 }
00157 sentences.push_back(p->forest);
00158 }
00159 }
00160
00161 void FilterRuleTable::ProcessOptions(int argc, char *argv[],
00162 Options &options) const
00163 {
00164 namespace po = boost::program_options;
00165 namespace cls = boost::program_options::command_line_style;
00166
00167
00168
00169 std::ostringstream usageTop;
00170 usageTop << "Usage: " << name()
00171 << " [OPTION]... MODEL TEST\n\n"
00172 << "Filter for SCFG/STSG rule tables.\n\n"
00173 << "Options";
00174
00175
00176 std::ostringstream usageBottom;
00177 usageBottom << "\nGiven a rule table on standard input and a set of test sentences, filters out\nthe rules that cannot be applied to any of the test sentences and writes the\nfiltered table to standard output. MODEL specifies the type of syntax model.\nThe following values are supported:\n\n"
00178 << " hierarchical, s2t, t2s, t2s-scfg, f2s\n";
00179
00180
00181 po::options_description visible(usageTop.str());
00182
00183
00184
00185 po::options_description hidden("Hidden options");
00186 hidden.add_options()
00187 ("Model",
00188 po::value(&options.model),
00189 "one of: hierarchical, s2t, t2s, t2s-scfg, f2s")
00190 ("TestSetFile",
00191 po::value(&options.testSetFile),
00192 "test set file")
00193 ;
00194
00195
00196 po::options_description cmdLineOptions;
00197 cmdLineOptions.add(visible).add(hidden);
00198
00199
00200 po::positional_options_description p;
00201 p.add("Model", 1);
00202 p.add("TestSetFile", 1);
00203
00204
00205 po::variables_map vm;
00206 try {
00207 po::store(po::command_line_parser(argc, argv).style(MosesOptionStyle()).
00208 options(cmdLineOptions).positional(p).run(), vm);
00209 po::notify(vm);
00210 } catch (const std::exception &e) {
00211 std::ostringstream msg;
00212 msg << e.what() << "\n\n" << visible << usageBottom.str();
00213 Error(msg.str());
00214 }
00215
00216 if (vm.count("help")) {
00217 std::cout << visible << usageBottom.str() << std::endl;
00218 std::exit(0);
00219 }
00220
00221
00222 if (!vm.count("TestSetFile")) {
00223 std::ostringstream msg;
00224 std::cerr << visible << usageBottom.str() << std::endl;
00225 std::exit(1);
00226 }
00227 }
00228
00229 }
00230 }
00231 }