00001
00002
00003
00004
00005 #include <boost/program_options.hpp>
00006 #include <boost/algorithm/string/predicate.hpp>
00007 #include <boost/format.hpp>
00008 #include <boost/math/distributions/binomial.hpp>
00009
00010 #include "mm/ug_bitext.h"
00011 #include "mm/tpt_typedefs.h"
00012 #include "mm/ug_prime_sampling1.h"
00013 #include "mm/ug_bitext_sampler.h"
00014 #include "mm/ug_phrasepair.h"
00015 #include "mm/ug_lru_cache.h"
00016 #include "generic/sorting/VectorIndexSorter.h"
00017 #include "generic/sorting/NBestList.h"
00018 #include <string>
00019 #include <boost/unordered_map.hpp>
00020 #include "moses/thread_safe_container.h"
00021 #include "mm/ug_prep_phrases.h"
00022
00023 using namespace std;
00024 using namespace Moses;
00025 using namespace Moses::bitext;
00026 namespace po=boost::program_options;
00027 using namespace boost::algorithm;
00028 typedef L2R_Token<SimpleWordId> Token;
00029 typedef mmBitext<Token> mmbitext;
00030 typedef Bitext<Token>::tsa tsa;
00031 typedef Bitext<Token>::iter iter;
00032 typedef imTtrack<Token> imttrack;
00033 typedef imTSA<Token> imtsa;
00034 typedef vector<PhrasePair<Token> > pplist_t;
00035
00036 string bname, bname1, bname2, ifile, L1, L2, Q1, Q2;
00037 size_t maxhits;
00038 size_t cache_size;
00039 void interpret_args(int ac, char* av[]);
00040
00041 typedef PhrasePair<Token>::SortDescendingByJointCount sorter_t;
00042 sorter_t sorter;
00043
00044 void
00045 show(Bitext<Token> const& B, iter const& m, pstats& stats)
00046 {
00047 pplist_t pplist;
00048 expand(m, B, stats, pplist, NULL);
00049 if (pplist.empty()) return;
00050 cout << "\n" << m.str(B.V1.get()) << " [" << m.ca() << "]" << endl;
00051 VectorIndexSorter<PhrasePair<Token>, sorter_t> viso(pplist, sorter);
00052 sptr<vector<size_t> > ranked = viso.GetOrder();
00053 size_t ctr=0;
00054 size_t cumul=0;
00055 BOOST_FOREACH(size_t const i, *ranked)
00056 {
00057 typedef map<uint32_t, uint32_t>::value_type entry_t;
00058
00059 PhrasePair<Token> const& pp = pplist[i];
00060 if (pp.joint < pp.good1 * .01) break;
00061 size_t remarkable = 0;
00062 float p = float(pp.joint)/pp.good1;
00063 BOOST_FOREACH(entry_t const& e, pp.indoc)
00064 {
00065 boost::math::binomial binomi(stats.indoc[e.first], p);
00066 float x = boost::math::cdf(binomi, e.second);
00067 float y = boost::math::cdf(boost::math::complement(binomi, e.second-1));
00068 if ((x > .01 && y > .01) || e.second < 5) continue;
00069 remarkable += e.second;
00070
00071
00072
00073
00074
00075
00076 }
00077 if (remarkable*20 > pp.good1)
00078 {
00079 cout << boost::format(" %6d | ") % pp.joint
00080 << toString(*B.V2, pp.start2, pp.len2)
00081 << boost::format(" (%d: %.2f)") % cumul % (float(cumul)/pp.good1)
00082 << endl;
00083 BOOST_FOREACH(entry_t const& e, pp.indoc)
00084 {
00085 boost::math::binomial binomi(stats.indoc[e.first], p);
00086 float x = boost::math::cdf(binomi, e.second);
00087 float y = boost::math::cdf(boost::math::complement(binomi, e.second-1));
00088 if ((x > .001 && y > .001) || e.second < 20) continue;
00089 cout << p * stats.indoc[e.first]
00090 << "/" << e.second << "/" << stats.indoc[e.first]
00091 << " " << boost::math::cdf(binomi, e.second)
00092 << " " << boost::math::cdf(boost::math::complement
00093 (binomi, e.second-1))
00094 << " " << toString(*B.V2, pp.start2, pp.len2)
00095 << endl;
00096 }
00097 }
00098 }
00099 }
00100
00101
00102 void
00103 process(SPTR<Bitext<Token> const> const& bitext, TSA<Token>::tree_iterator& m)
00104 {
00105 static boost::shared_ptr<SamplingBias> nil(new SamplingBiasAlways(bitext->sid2did()));
00106 static Moses::bitext::sampling_method random = Moses::bitext::random_sampling;
00107
00108 if (m.extend((*bitext->V1)["job"]))
00109 {
00110 do
00111 {
00112 if (m.ca() >= 5000)
00113 {
00114
00115 Moses::bitext::BitextSampler<Token> s(bitext, m, nil, 10000, random);
00116 s();
00117 show(*bitext, m, *s.stats());
00118 process(bitext, m);
00119 }
00120 }
00121 while (m.over());
00122 m.up();
00123 }
00124 }
00125
00126 int main(int argc, char* argv[])
00127 {
00128 interpret_args(argc, argv);
00129 SPTR<mmbitext> B(new mmbitext);
00130 B->open(bname, L1, L2);
00131 TSA<Token>::tree_iterator m(B->I1.get());
00132
00133 process(B.get(), m);
00134 }
00135
00136 void
00137 interpret_args(int ac, char* av[])
00138 {
00139 po::variables_map vm;
00140 po::options_description o("Options");
00141 o.add_options()
00142 ("help,h", "print this message")
00143 ;
00144
00145 po::options_description h("Hidden Options");
00146 h.add_options()
00147 ("bname", po::value<string>(&bname), "base name of corpus")
00148 ("L1", po::value<string>(&L1), "L1 tag")
00149 ("L2", po::value<string>(&L2), "L2 tag")
00150 ;
00151
00152 h.add(o);
00153 po::positional_options_description a;
00154 a.add("bname",1);
00155 a.add("L1",1);
00156 a.add("L2",1);
00157
00158 po::store(po::command_line_parser(ac,av)
00159 .options(h)
00160 .positional(a)
00161 .run(),vm);
00162 po::notify(vm);
00163 if (vm.count("help"))
00164 {
00165 cout << "\nusage:\n\t" << av[0]
00166 << " <bname> <L1> <L2>" << endl;
00167 cout << o << endl;
00168 exit(0);
00169 }
00170 }