00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include "ug_mm_ttrack.h"
00011 #include "ug_deptree.h"
00012 #include "tpt_tokenindex.h"
00013 #include "tpt_pickler.h"
00014 #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
00015 #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
00016
00017 #include <iostream>
00018 #include <string>
00019 #include <sstream>
00020
00021 #include <boost/program_options.hpp>
00022 #include <boost/scoped_ptr.hpp>
00023
00024 #include "util/exception.hh"
00025
00026
00027
00028
00029
00030
00031
00032 using namespace std;
00033 using namespace ugdiss;
00034 using namespace sapt;
00035
00036 ofstream t1out,t2out,mam;
00037 int len1=0,len2=0;
00038 size_t lineCtr=0,sid=0;
00039 bool conll=false;
00040 bool skip=false;
00041 bool debug=false;
00042 TokenIndex V1;
00043
00044 string mtt1name,mtt2name,o1name,o2name,mamname,cfgFile;
00045 string dataFormat,A3filename;
00046 void
00047 interpret_args(int ac, char* av[])
00048 {
00049 namespace po=boost::program_options;
00050 po::variables_map vm;
00051 po::options_description o("Options");
00052 po::options_description h("Hidden Options");
00053 po::positional_options_description a;
00054
00055 o.add_options()
00056 ("help,h", "print this message")
00057 ("cfg,f", po::value<string>(&cfgFile),"config file")
00058 ("a3", po::value<string>(&A3filename), "name of A3 file (for sanity checks)")
00059 ("o1", po::value<string>(&o1name), "name of output file for track 1")
00060 ("o2", po::value<string>(&o2name), "name of output file for track 2")
00061 ("skip", "skip sentence pairs without word alignment (requires --o1 and --o2)")
00062 ("debug,d", "debug mode")
00063 ("t1", po::value<string>(&mtt1name), "file name of L1 mapped token track")
00064 ("t2", po::value<string>(&mtt2name), "file name of L2 mapped token track")
00065 ("format,F", po::value<string>(&dataFormat)->default_value("plain"), "data format (plain or conll)")
00066 ;
00067
00068 h.add_options()
00069 ("mamname", po::value<string>(&mamname), "name of output file for mam")
00070 ;
00071 a.add("mamname",1);
00072
00073 get_options(ac,av,h.add(o),a,vm,"cfg");
00074
00075 skip = vm.count("skip");
00076 debug = vm.count("debug");
00077 if (vm.count("help") || mamname.empty())
00078 {
00079 cout << "usage:\n"
00080 << "\t\n"
00081 << "\t ... | " << av[0]
00082 << " <.mam file> \n" << endl;
00083 cout << o << endl;
00084 cout << "If an A3 file is given (as produced by (m)giza), symal2mam performs\n"
00085 << "a sanity check to make sure that sentence lengths match." << endl;
00086 exit(0);
00087 }
00088 conll = dataFormat == "conll";
00089 if (!conll and dataFormat != "plain")
00090 {
00091 cerr << "format must be 'conll' or 'plain'" << endl;
00092 exit(1);
00093 }
00094 if (skip && (o1name.empty() || o2name.empty()))
00095 {
00096 cerr << "--skip requires --o1 and --o2" << endl;
00097 exit(1);
00098 }
00099 }
00100
// Write the raw bytes of sentence `sid` of track `T` to `dest`.
// The byte range is [T.sntStart(sid), T.sntEnd(sid)); nothing is
// written when the two pointers coincide.
template<typename track_t>
void
copySentence(track_t const& T, size_t sid, ostream& dest)
{
  char const* const firstByte = reinterpret_cast<char const*>(T.sntStart(sid));
  char const* const endByte   = reinterpret_cast<char const*>(T.sntEnd(sid));
  streamsize const numBytes = endByte - firstByte;
  dest.write(firstByte, numBytes);
}
00109
00110 size_t
00111 procSymalLine(string const& line, ostream& out)
00112 {
00113 ushort a,b; char dash;
00114 istringstream buf(line);
00115 while (buf>>a>>dash>>b)
00116 {
00117 if (debug && ((len1 && a >= len1) || (len2 && b >= len2)))
00118 {
00119 cerr << a << "-" << b << " " << len1 << "/" << len2 << endl;
00120 }
00121 assert(len1 == 0 || a<len1);
00122 assert(len2 == 0 || b<len2);
00123 tpt::binwrite(out,a);
00124 tpt::binwrite(out,b);
00125 }
00126 return out.tellp();
00127 }
00128
00129 void finiMAM(ofstream& out, vector<id_type>& idx, id_type numTok)
00130 {
00131 id_type offset = sizeof(filepos_type)+2*sizeof(id_type);
00132 filepos_type idxStart = out.tellp();
00133 for (vector<id_type>::iterator i = idx.begin(); i != idx.end(); ++i)
00134 tpt::numwrite(out,*i-offset);
00135 out.seekp(0);
00136 tpt::numwrite(out,idxStart);
00137 tpt::numwrite(out,id_type(idx.size()-1));
00138 tpt::numwrite(out,numTok);
00139 out.close();
00140 }
00141
00142 void
00143 finalize(ofstream& out, vector<id_type> const& idx, id_type tokenCount)
00144 {
00145 id_type idxSize = idx.size();
00146 filepos_type idxStart = out.tellp();
00147 for (size_t i = 0; i < idx.size(); ++i)
00148 tpt::numwrite(out,idx[i]);
00149 out.seekp(0);
00150 tpt::numwrite(out,idxStart);
00151 tpt::numwrite(out,idxSize-1);
00152 tpt::numwrite(out,tokenCount);
00153 out.close();
00154 }
00155
00156 bool getCheckValues(istream& in, int& check1, int& check2)
00157 {
00158 if (A3filename.empty()) return true;
00159 string line; string w;
00160 getline(in,line);
00161 size_t p1 = line.find("source length ") + 14;
00162 if (p1 >= line.size()) return false;
00163 size_t p2 = line.find("target length ",p1);
00164 if (p2 >= line.size()) return false;
00165
00166
00167 check1 = atoi(line.substr(p1,p2-p1).c_str());
00168 p1 = p2+14;
00169 p2 = line.find("alignment ",p1);
00170 if (p2 >= line.size()) return false;
00171 check2 = atoi(line.substr(p1,p2-p1).c_str());
00172 getline(in,line);
00173 getline(in,line);
00174 return true;
00175 }
00176
00177 void
00178 go()
00179 {
00180 size_t ctr=0;
00181 vector<id_type> idxm;
00182 idxm.reserve(10000000);
00183 idxm.push_back(mam.tellp());
00184 string line;
00185 while(getline(cin,line))
00186 {
00187 idxm.push_back(procSymalLine(line,mam));
00188 if (debug && ++ctr%100000==0)
00189 cerr << ctr/1000 << "K lines processed" << endl;
00190 }
00191 finiMAM(mam,idxm,0);
00192 cout << idxm.size() << endl;
00193 }
00194
00195 template<typename TKN>
00196 void
00197 go(string t1name, string t2name, string A3filename)
00198 {
00199 typedef mmTtrack<TKN> track_t;
00200 track_t T1(t1name),T2(t2name);
00201 boost::iostreams::filtering_istream A3file;
00202 open_input_stream(A3filename, A3file);
00203
00204 string line; int check1=-1,check2=-1;
00205 vector<id_type> idx1(1,0),idx2(1,0),idxm(1, mam.tellp());
00206 size_t tokenCount1=0,tokenCount2=0;
00207 size_t skipCtr=0,lineCtr=0;
00208 if (!getCheckValues(A3file, check1, check2))
00209 UTIL_THROW(util::Exception, "Mismatch in input files!");
00210
00211 for (sid = 0; sid < T1.size(); ++sid)
00212 {
00213 len1 = T1.sntLen(sid);
00214 len2 = T2.sntLen(sid);
00215 if (debug)
00216 cerr << "[" << lineCtr << "] "
00217 << len1 << " (" << check1 << ") / "
00218 << len2 << " (" << check2 << ")" << endl;
00219 if ((check1 >=0 && check1!=len1) ||
00220 (check2 >=0 && check2!=len2))
00221 {
00222 if (skip)
00223 {
00224 cerr << "[" << ++skipCtr << "] skipping "
00225 << check1 << "/" << check2 << " vs. "
00226 << len1 << "/" << len2
00227 << " at line " << lineCtr << endl;
00228 }
00229 else
00230 {
00231 idxm.push_back(mam.tellp());
00232 }
00233 if (len1 > 100 || len2 > 100)
00234 {
00235 getline(cin,line);
00236 getCheckValues(A3file,check1,check2);
00237 lineCtr++;
00238 }
00239 continue;
00240 }
00241 if (skip)
00242 {
00243 idx1.push_back(tokenCount1 += len1);
00244 copySentence(T1,sid,t1out);
00245 idx2.push_back(tokenCount2 += len2);
00246 copySentence(T2,sid,t2out);
00247 }
00248
00249 if (!getline(cin,line))
00250 UTIL_THROW(util::Exception, "Too few lines in symal input!");
00251
00252 lineCtr++;
00253 idxm.push_back(procSymalLine(line,mam));
00254 if (debug) cerr << "[" << lineCtr << "] "
00255 << check1 << " (" << len1 <<") "
00256 << check2 << " (" << len2 <<") "
00257 << line << endl;
00258 getCheckValues(A3file,check1,check2);
00259 }
00260 if (skip)
00261 {
00262 finalize(t1out,idx1,tokenCount1);
00263 finalize(t2out,idx2,tokenCount2);
00264 }
00265 finiMAM(mam,idxm,0);
00266 cout << idxm.size() << endl;
00267 }
00268
00269 void
00270 initialize(ofstream& out, string const& fname)
00271 {
00272 out.open(fname.c_str());
00273 tpt::numwrite(out,filepos_type(0));
00274 tpt::numwrite(out,id_type(0));
00275 tpt::numwrite(out,id_type(0));
00276 }
00277
00278 int main(int argc, char* argv[])
00279 {
00280 interpret_args(argc,argv);
00281 if (skip)
00282 {
00283 initialize(t1out,o1name);
00284 initialize(t2out,o2name);
00285 }
00286 initialize(mam,mamname);
00287 if (A3filename.size() == 0)
00288 go();
00289 else if (conll)
00290 go<Conll_Record>(mtt1name,mtt2name,A3filename);
00291 else
00292 go<id_type>(mtt1name,mtt2name,A3filename);
00293 }