00001 #include <iostream>
00002
00003 #include <sstream>
00004 #include <vector>
00005 #include <string>
00006 #include <iterator>
00007 #include <functional>
00008 #include <sys/stat.h>
00009 #include "moses/TypeDef.h"
00010 #include "moses/ConfusionNet.h"
00011 #include "moses/FactorCollection.h"
00012 #include "moses/Phrase.h"
00013 #include "moses/InputFileStream.h"
00014 #include "moses/Timer.h"
00015 #include "moses/TranslationModel/PhraseDictionaryTree.h"
00016
00017 using namespace std;
00018 using namespace Moses;
00019
// File-scope Timer object. Nothing in the visible portion of this file
// references it. NOTE(review): presumably kept for ad-hoc profiling -- confirm
// before removing.
00020 Timer timer;
00021
/// Streams a vector as "<size> <elem0> <elem1> ... ": the element count first,
/// then every element, each followed by a single space (so the output always
/// ends with a trailing space).
///
/// @param out  destination stream
/// @param x    vector whose elements are themselves streamable with operator<<
/// @return     the stream passed in, to allow chaining
template<typename T>
std::ostream& operator<<(std::ostream& out,const std::vector<T>& x)
{
  const typename std::vector<T>::size_type count = x.size();
  out << count << " ";
  for (typename std::vector<T>::size_type pos = 0; pos < count; ++pos) {
    out << x[pos] << ' ';
  }
  return out;
}
00031
/// Reports whether stat() succeeds on the given path.
///
/// @param filename  NUL-terminated path to probe
/// @return          true when stat() returns 0, false for any failure
inline bool existsFile(const char* filename)
{
  struct stat info;
  const int rc = stat(filename, &info);
  if (rc != 0) {
    return false;
  }
  return true;
}
/// std::string convenience overload: forwards the path's C string to the
/// const char* overload of existsFile and returns its answer unchanged.
inline bool existsFile(const std::string& filename)
{
  const char* const rawPath = filename.c_str();
  const bool found = existsFile(rawPath);
  return found;
}
00041
00042 int main(int argc,char **argv)
00043 {
00044 std::string fto;
00045 size_t noScoreComponent=5;
00046 int cn=0;
00047 bool aligninfo=true;
00048 std::vector<std::pair<std::string,std::pair<char*,char*> > > ftts;
00049 int verb=0;
00050 for(int i=1; i<argc; ++i) {
00051 std::string s(argv[i]);
00052 if(s=="-ttable") {
00053 std::pair<char*,char*> p;
00054 p.first=argv[++i];
00055 p.second=argv[++i];
00056 ftts.push_back(std::make_pair(std::string(argv[++i]),p));
00057 } else if(s=="-nscores") noScoreComponent=atoi(argv[++i]);
00058 else if(s=="-out") fto=std::string(argv[++i]);
00059 else if(s=="-cn") cn=1;
00060 else if(s=="-irst") cn=2;
00061 else if(s=="-no-alignment-info") aligninfo=false;
00062 else if(s=="-v") verb=atoi(argv[++i]);
00063 else if(s=="-h") {
00064 std::cerr<<"usage "<<argv[0]<<" :\n\n"
00065 "options:\n"
00066 "\t-ttable int int string -- translation table file, use '-' for stdin\n"
00067 "\t-out string -- output file name prefix for binary ttable\n"
00068 "\t-nscores int -- number of scores in ttable\n"
00069 "\t-no-alignment-info -- omit alignment info from the binary ttable \n"
00070 "\nfunctions:\n"
00071 "\t - convert ascii ttable in binary format\n"
00072 "\t - if ttable is not read from stdin:\n"
00073 "\t treat each line as source phrase an print tgt candidates\n"
00074 "\n";
00075 return 1;
00076 } else {
00077 std::cerr<<"ERROR: unknown option '"<<s<<"'\n";
00078 return 1;
00079 }
00080 }
00081
00082
00083 if(ftts.size()) {
00084
00085 if(ftts.size()==1) {
00086 std::cerr<<"processing ptree for ";
00087 PhraseDictionaryTree pdt;
00088
00089 pdt.PrintWordAlignment(aligninfo);
00090
00091 if (ftts[0].first=="-") {
00092 std::cerr<< "stdin\n";
00093 pdt.Create(std::cin,fto);
00094 } else {
00095 std::cerr<< ftts[0].first << "\n";
00096 InputFileStream in(ftts[0].first);
00097 pdt.Create(in,fto);
00098 }
00099 } else {
00100 #if 0
00101 std::vector<PhraseDictionaryTree const*> pdicts;
00102 std::vector<FactorType> factorOrder;
00103 for(size_t i=0; i<ftts.size(); ++i) {
00104
00105 PhraseDictionaryTree *pdtptr=new PhraseDictionaryTree(noScoreComponent,
00106 &factorCollection,
00107 getFactorType(atoi(ftts[i].second.first)),
00108 getFactorType(atoi(ftts[i].second.second))
00109 );
00110 factorOrder.push_back(pdtptr->GetInputFactorType());
00111 PhraseDictionaryTree &pdt=*pdtptr;
00112 pdicts.push_back(pdtptr);
00113
00114 std::string facStr="."+std::string(ftts[i].second.first)+"-"+std::string(ftts[i].second.second);
00115 std::string prefix=ftts[i].first+facStr;
00116 if(!existsFile(prefix+".binphr.idx")) {
00117 std::cerr<<"bin ttable does not exist -> create it\n";
00118 InputFileStream in(prefix);
00119 pdt.Create(in,prefix);
00120 }
00121 std::cerr<<"reading bin ttable\n";
00122 pdt.Read(prefix);
00123
00124 }
00125
00126 std::cerr<<"processing stdin\n";
00127 if(!cn) {
00128 std::string line;
00129 while(getline(std::cin,line)) {
00130 std::istringstream is(line);
00131 #if 0
00132 std::vector<std::string> f;
00133 std::copy(std::istream_iterator<std::string>(is),
00134 std::istream_iterator<std::string>(),
00135 std::back_inserter(f));
00136 #endif
00137 std::cerr<<"got source phrase '"<<line<<"'\n";
00138
00139 Phrase F(Input);
00140 F.CreateFromString(factorOrder,line,factorCollection);
00141
00142 for(size_t k=0; k<pdicts.size(); ++k) {
00143 PhraseDictionaryTree const& pdt=*pdicts[k];
00144
00145 std::vector<std::string> f(F.GetSize());
00146 for(size_t i=0; i<F.GetSize(); ++i)
00147 f[i]=F.GetFactor(i,pdt.GetInputFactorType())->ToString();
00148
00149 std::stringstream iostA,iostB;
00150 std::cerr<<"full phrase processing "<<f<<"\n";
00151 pdt.PrintTargetCandidates(f,iostA);
00152
00153 std::cerr<<"processing with prefix ptr\n";
00154 PhraseDictionaryTree::PrefixPtr p(pdt.GetRoot());
00155
00156 for(size_t i=0; i<f.size() && p; ++i) {
00157 std::cerr<<"pre "<<i<<" "<<(p?"1":"0")<<"\n";
00158 p=pdt.Extend(p,f[i]);
00159 std::cerr<<"post "<<i<<" "<<(p?"1":"0")<<"\n";
00160 }
00161 if(p) {
00162 std::cerr<<"retrieving candidates from prefix ptr\n";
00163 pdt.PrintTargetCandidates(p,iostB);
00164 } else {
00165 std::cerr<<"final ptr is invalid\n";
00166 iostB<<"there are 0 target candidates\n";
00167 }
00168 if(iostA.str() != iostB.str())
00169 std::cerr<<"ERROR: translation candidates mismatch '"<<iostA.str()<<"' and for prefix pointer: '"<<iostB.str()<<"'\n";
00170
00171 std::cerr<<"translation candidates:\n"<<iostA.str()<<"\n";
00172 pdt.FreeMemory();
00173
00174 }
00175
00176 }
00177 } else {
00178
00179 ConfusionNet net(&factorCollection);
00180 std::vector<std::vector<float> > weights;
00181 for(size_t i=0; i<pdicts.size(); ++i)
00182 weights.push_back(std::vector<float>(noScoreComponent,1/(1.0*noScoreComponent)));
00183
00184 while(net.ReadF(std::cin,factorOrder,cn-1)) {
00185 net.Print(std::cerr);
00186 GenerateCandidates(net,pdicts,weights,verb);
00187 }
00188
00189 }
00190 #else
00191 std::cerr<<"ERROR: these functions are currently broken...\n";
00192 exit(1);
00193 #endif
00194 }
00195 }
00196
00197 }