00001
00002
00003 #include <boost/program_options.hpp>
00004 #include <iomanip>
00005
00006 #include "tpt_typedefs.h"
00007 #include "ug_mm_ttrack.h"
00008 #include "tpt_tokenindex.h"
00009 #include "ug_deptree.h"
00010 #include "ug_corpus_token.h"
00011 #include "tpt_pickler.h"
00012
00013 using namespace std;
00014 using namespace tpt;
00015 using namespace sapt;
00016 namespace po = boost::program_options;
00017
00018 typedef L2R_Token<Conll_Sform> Token;
00019
00020 string bname,L1,L2;
00021 mmTtrack<char> MAM;
00022 mmTtrack<Token> T1,T2;
00023 bool inv;
00024 vector<string> range;
00025 void
00026 interpret_args(int ac, char* av[])
00027 {
00028 po::variables_map vm;
00029 po::options_description o("Options");
00030 o.add_options()
00031 ("help,h", "print this message")
00032 ("inv,i", po::bool_switch(&inv), "inverse")
00033 ;
00034
00035 po::options_description h("Hidden Options");
00036 h.add_options()
00037 ("bname", po::value<string>(&bname), "base name")
00038 ("L1", po::value<string>(&L1), "L1")
00039 ("L2", po::value<string>(&L2), "L2")
00040 ("range", po::value<vector<string> >(&range), "range")
00041 ;
00042 po::positional_options_description a;
00043 a.add("bname",1);
00044 a.add("L1",1);
00045 a.add("L2",1);
00046 a.add("range",-1);
00047
00048 po::store(po::command_line_parser(ac,av)
00049 .options(h.add(o))
00050 .positional(a)
00051 .run(),vm);
00052 po::notify(vm);
00053 if (vm.count("help") || L2.empty())
00054 {
00055 cout << "usage:\n\t"
00056 << av[0] << " <base name> <L1> <L2> \n"
00057 << endl;
00058 cout << o << endl;
00059 exit(0);
00060 }
00061 }
00062
00063 size_t
00064 check_range(size_t start, size_t stop)
00065 {
00066 size_t noAln = 0;
00067 for (size_t sid = start; sid < stop; ++sid)
00068 {
00069 char const* p = MAM.sntStart(sid);
00070 char const* q = MAM.sntEnd(sid);
00071 size_t slen = T1.sntLen(sid);
00072 size_t tlen = T2.sntLen(sid);
00073 if (p == q) ++noAln;
00074 ushort s,t;
00075 while (p < q)
00076 {
00077 p = binread(p,s);
00078 p = binread(p,t);
00079 if (s >= slen || t >= tlen)
00080 {
00081 cout << "alignment out of bounds in sentence " << sid << ": "
00082 << s << "-" << t << " in " << slen << ":" << tlen << "."
00083 << endl;
00084 break;
00085 }
00086 }
00087 }
00088 return noAln;
00089 }
00090
00091 int
00092 main(int argc, char*argv[])
00093 {
00094 interpret_args(argc,argv);
00095 MAM.open(bname+L1+"-"+L2+".mam");
00096 T1.open(bname+L1+".mct");
00097 T2.open(bname+L2+".mct");
00098 if (T1.size() != T2.size() || T1.size() != MAM.size())
00099 {
00100 cout << "Track sizes don't match!" << endl;
00101 exit(1);
00102 }
00103 size_t noAln;
00104 if (!range.size())
00105 noAln = check_range(0, MAM.size());
00106 else
00107 {
00108 noAln = 0;
00109 for (size_t i = 0; i < range.size(); i++)
00110 {
00111 istringstream buf(range[i]);
00112 size_t first,last; uchar c;
00113 buf>>first;
00114 if (buf.peek() == '-') buf>>c>>last;
00115 else last = first;
00116 if (last < MAM.size())
00117 noAln += check_range(first,last+1);
00118 }
00119 }
00120 cout << noAln << " sentence pairs without alignment" << endl;
00121 }