00001
00002
00003
00004
00005
00006
00007
00008 #include <string>
00009 #include <vector>
00010 #include <map>
00011 #include <iostream>
00012 #include <fstream>
00013 #include <sstream>
00014 #include <cstdlib>
00015 #include <cstring>
00016
00017 #include "util/exception.hh"
00018 #include "util/file_piece.hh"
00019 #include "util/string_piece.hh"
00020 #include "util/tokenize_piece.hh"
00021
00022 #include "InputFileStream.h"
00023 #include "reordering_classes.h"
00024
00025 using namespace std;
00026
00027 void split_line(const StringPiece& line, StringPiece& foreign, StringPiece& english, StringPiece& wbe, StringPiece& phrase, StringPiece& hier, float& weight);
00028 void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next);
00029
00030 class FileFormatException : public util::Exception
00031 {
00032 public:
00033 FileFormatException() throw() {
00034 *this << "Invalid extract file format: ";
00035 }
00036 ~FileFormatException() throw() {}
00037 };
00038
00039 int main(int argc, char* argv[])
00040 {
00041
00042 cerr << "Lexical Reordering Scorer\n"
00043 << "scores lexical reordering models of several types (hierarchical, phrase-based and word-based-extraction\n";
00044
00045 if (argc < 3) {
00046 cerr << "syntax: score_reordering extractFile smoothingValue filepath (--model \"type max-orientation (specification-strings)\" )+\n";
00047 exit(1);
00048 }
00049
00050 char* extractFileName = argv[1];
00051 double smoothingValue = atof(argv[2]);
00052 string filepath = argv[3];
00053
00054 util::FilePiece eFile(extractFileName);
00055
00056 bool smoothWithCounts = false;
00057 map<string,ModelScore*> modelScores;
00058 vector<Model*> models;
00059 bool hier = false;
00060 bool phrase = false;
00061 bool wbe = false;
00062
00063 StringPiece e,f,w,p,h;
00064 StringPiece prev, next;
00065
00066 int i = 4;
00067 while (i<argc) {
00068 if (strcmp(argv[i],"--SmoothWithCounts") == 0) {
00069 smoothWithCounts = true;
00070 } else if (strcmp(argv[i],"--model") == 0) {
00071 if (i+1 >= argc) {
00072 cerr << "score: syntax error, no model information provided to the option" << argv[i] << endl;
00073 exit(1);
00074 }
00075 istringstream is(argv[++i]);
00076 string m,t;
00077 is >> m >> t;
00078 modelScores[m] = ModelScore::createModelScore(t);
00079 if (m.compare("hier") == 0) {
00080 hier = true;
00081 } else if (m.compare("phrase") == 0) {
00082 phrase = true;
00083 }
00084 if (m.compare("wbe") == 0) {
00085 wbe = true;
00086 }
00087
00088 if (!hier && !phrase && !wbe) {
00089 cerr << "WARNING: No models specified for lexical reordering. No lexical reordering table will be trained.\n";
00090 return 0;
00091 }
00092
00093 string config;
00094
00095 while (is >> config) {
00096 models.push_back(Model::createModel(modelScores[m],config,filepath));
00097 }
00098 } else {
00099 cerr << "illegal option given to lexical reordering model score\n";
00100 exit(1);
00101 }
00102 i++;
00103 }
00104
00106
00107 if (smoothWithCounts) {
00108 util::FilePiece eFileForCounts(extractFileName);
00109 while (true) {
00110 StringPiece line;
00111 try {
00112 line = eFileForCounts.ReadLine();
00113 } catch (util::EndOfFileException &e) {
00114 break;
00115 }
00116 float weight = 1;
00117 split_line(line,e,f,w,p,h,weight);
00118 if (hier) {
00119 get_orientations(h, prev, next);
00120 modelScores["hier"]->add_example(prev,next,weight);
00121 }
00122 if (phrase) {
00123 get_orientations(p, prev, next);
00124 modelScores["phrase"]->add_example(prev,next,weight);
00125 }
00126 if (wbe) {
00127 get_orientations(w, prev, next);
00128 modelScores["wbe"]->add_example(prev,next,weight);
00129 }
00130 }
00131
00132
00133 for (size_t i=0; i<models.size(); ++i) {
00134 models[i]->createSmoothing(smoothingValue);
00135 }
00136
00137 } else {
00138
00139 for (size_t i=0; i<models.size(); ++i) {
00140 models[i]->createConstSmoothing(smoothingValue);
00141 }
00142 }
00143
00145
00146 string f_current,e_current;
00147 bool first = true;
00148 while (true) {
00149 StringPiece line;
00150 try {
00151 line = eFile.ReadLine();
00152 } catch (util::EndOfFileException &e) {
00153 break;
00154 }
00155 float weight = 1;
00156 split_line(line,f,e,w,p,h,weight);
00157
00158 if (first) {
00159 f_current = f.as_string();
00160 e_current = e.as_string();
00161 first = false;
00162 } else if (f.compare(f_current) != 0 || e.compare(e_current) != 0) {
00163
00164 for (size_t i=0; i<models.size(); ++i) {
00165 models[i]->score_fe(f_current,e_current);
00166 }
00167
00168 for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
00169 it->second->reset_fe();
00170 }
00171
00172 if (f.compare(f_current) != 0) {
00173
00174 for (size_t i=0; i<models.size(); ++i) {
00175 models[i]->score_f(f_current);
00176 }
00177
00178 for(map<string,ModelScore*>::const_iterator it = modelScores.begin(); it != modelScores.end(); ++it) {
00179 it->second->reset_f();
00180 }
00181 }
00182 f_current = f.as_string();
00183 e_current = e.as_string();
00184 }
00185
00186
00187 if (hier) {
00188 get_orientations(h, prev, next);
00189 modelScores["hier"]->add_example(prev,next,weight);
00190 }
00191 if (phrase) {
00192 get_orientations(p, prev, next);
00193 modelScores["phrase"]->add_example(prev,next,weight);
00194 }
00195 if (wbe) {
00196 get_orientations(w, prev, next);
00197 modelScores["wbe"]->add_example(prev,next,weight);
00198 }
00199 }
00200
00201 for (size_t i=0; i<models.size(); ++i) {
00202 models[i]->score_fe(f_current,e_current);
00203 }
00204 for (size_t i=0; i<models.size(); ++i) {
00205 models[i]->score_f(f_current);
00206 }
00207
00208
00209 for (size_t i=0; i<models.size(); ++i) {
00210 delete models[i];
00211 }
00212 return 0;
00213 }
00214
00215 template <class It> StringPiece
00216 GrabOrDie(It &it, const StringPiece& line)
00217 {
00218 UTIL_THROW_IF(!it, FileFormatException, line.as_string());
00219 return *it++;
00220 }
00221
00222
00223 void split_line(
00224 const StringPiece& line,
00225 StringPiece& foreign,
00226 StringPiece& english,
00227 StringPiece& wbe,
00228 StringPiece& phrase,
00229 StringPiece& hier,
00230 float& weight)
00231 {
00232
00233
00234
00235
00236
00237
00238
00239
00240 util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
00241 foreign = GrabOrDie(pipes,line);
00242 english = GrabOrDie(pipes,line);
00243 StringPiece next = GrabOrDie(pipes,line);
00244
00245 util::TokenIter<util::MultiCharacter> singlePipe(next, util::MultiCharacter(" | "));
00246 wbe = GrabOrDie(singlePipe,line);
00247 if (singlePipe) {
00248 phrase = GrabOrDie(singlePipe, line);
00249 hier = GrabOrDie(singlePipe, line);
00250 } else {
00251 phrase.clear();
00252 hier.clear();
00253 }
00254
00255 if (pipes) {
00256
00257 char* errIndex;
00258 next = *pipes++;
00259 weight = static_cast<float>(strtod(next.data(), &errIndex));
00260 UTIL_THROW_IF(errIndex == next.data(), FileFormatException, line.as_string());
00261 }
00262 }
00263
00264 void get_orientations(const StringPiece& pair, StringPiece& previous, StringPiece& next)
00265 {
00266 util::TokenIter<util::SingleCharacter> tok(pair, util::SingleCharacter(' '));
00267 previous = GrabOrDie(tok,pair);
00268 next = GrabOrDie(tok,pair);
00269 }