Moses: /disk4/html/www/moses/doxygen/mosesdecoder/moses/TranslationModel/UG/check-coverage5.cc Source File

00001 // -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
00002 
00003 // read text from the input files named on the command line and report, per file, the percentage of n-grams found in the corpus
00004 
00005 #include <boost/foreach.hpp>
00006 #include <boost/format.hpp>
00007 #include <boost/tokenizer.hpp>
00008 #include <boost/shared_ptr.hpp>
00009 #include <algorithm>
00010 #include <iostream>
00011 #include "mm/ug_bitext.h"
00012 #include "generic/file_io/ug_stream.h"
00013 #include <string>
00014 #include <sstream>
00015 #include "mm/ug_bitext_sampler.h"
00016 
00017 #include <boost/program_options.hpp>
00018 #include <boost/math/distributions/binomial.hpp>
00019 
00020 // #include "LSA.h"
00021 
00022 namespace po=boost::program_options;
00023 using namespace Moses;
00024 using namespace sapt;
00025 using namespace std;
00026 using namespace boost;
00027 
00028 typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
00029 typedef mmTtrack<Token> ttrack_t;
00030 
00031 size_t ngram_size;
00032 size_t verbosity;
00033 string bname;
00034 vector<string> ifiles;
00035 
00036 void interpret_args(int ac, char* av[]);
00037 
00038 
00039 void
00040 dump(mmTSA<Token>::tree_iterator& m, TokenIndex& V)
00041 {
00042   if (m.size()) cout << m.str(NULL) << endl;
00043   if (m.size()) cout << m.str(&V) << endl;
00044   if (m.down())
00045     {
00046       do { dump(m, V); } while (m.over());
00047       m.up();
00048     }
00049 }
00050 
00051 int
00052 main(int argc, char* argv[])
00053 {
00054   interpret_args(argc,argv);
00055   TokenIndex V;
00056   V.open(bname+".tdx"); V.setDynamic(true); V.iniReverseIndex();
00057   boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>);
00058   T->open(bname+".mct");
00059   mmTSA<Token> I; I.open(bname+".sfa", T);
00060 
00061   string line;
00062   BOOST_FOREACH(string const& file, ifiles)
00063     {
00064       size_t total_ngrams=0;
00065       float matched_ngrams=0;
00066       ifstream in(file.c_str());
00067       while(getline(in,line))
00068         {
00069           // cout << line << endl;
00070           vector<id_type> snt;
00071           V.fillIdSeq(line,snt);
00072           if (snt.size() < ngram_size) continue;
00073           total_ngrams += snt.size() - ngram_size + 1;
00074           for (size_t i = 0; i + ngram_size <= snt.size(); ++i)
00075             // for (size_t i = 0; i < snt.size(); ++i)
00076             {
00077               mmTSA<Token>::tree_iterator m(&I);
00078               size_t stop = min(snt.size(), i+ngram_size);
00079               size_t k = i; 
00080               while (k < stop && m.extend(snt[k])) ++k;
00081               if (verbosity) cout << i << " " << k-i << " " << m.str(&V) << endl;
00082               if (k - i == ngram_size)
00083                 ++matched_ngrams;
00084             }
00085         }
00086       printf ("%5.1f%% matched %zu-grams (%.0f/%zu): %s\n",
00087               (100 * matched_ngrams / total_ngrams), ngram_size,
00088               matched_ngrams, total_ngrams, file.c_str());
00089     }
00090 }
00091 
00092 void
00093 interpret_args(int ac, char* av[])
00094 {
00095   po::variables_map vm;
00096   po::options_description o("Options");
00097   o.add_options()
00098 
00099     ("help,h",  "print this message")
00100     ("ngram-size,n", po::value<size_t>(&ngram_size)->default_value(5),
00101      "sample size")
00102     ("verbose,v", po::value<size_t>(&verbosity)->default_value(0),
00103      "verbosity")
00104     ;
00105 
00106   po::options_description h("Hidden Options");
00107   h.add_options()
00108     ("bname", po::value<string>(&bname), "base name of corpus")
00109     ("ifiles", po::value<vector<string> >(&ifiles), "input files")
00110     ;
00111 
00112   h.add(o);
00113   po::positional_options_description a;
00114   a.add("bname",1);
00115   a.add("ifiles",-1);
00116 
00117   po::store(po::command_line_parser(ac,av)
00118             .options(h)
00119             .positional(a)
00120             .run(),vm);
00121   po::notify(vm);
00122   if (vm.count("help"))
00123     {
00124       std::cout << "\nusage:\n\t" << av[0]
00125                 << " [options] <model file stem>" << std::endl;
00126       std::cout << o << std::endl;
00127       exit(0);
00128     }
00129 }