Moses: /disk4/html/www/moses/doxygen/mosesdecoder/phrase-extract/extract-lex-main.cpp Source File

00001 #include <iostream>
00002 #include <fstream>
00003 #include <cassert>
00004 #include <vector>
00005 #include "extract-lex.h"
00006 #include "InputFileStream.h"
00007 #include "moses/Util.h"
00008 
00009 using namespace std;
00010 using namespace MosesTraining;
00011 
// Increment passed to WordCount::AddCount for every counted word pair.
float COUNT_INCR = 1.0f;
00013 
/**
 * Configure a stream to print floating-point values in fixed notation
 * with seven digits after the decimal point.
 *
 * @param stream  stream whose formatting state is modified in place.
 */
void fix(std::ostream& stream)
{
  // Pass the floatfield mask so that any previously selected notation
  // (e.g. scientific) is cleared; OR-ing the fixed bit on top of it would
  // leave both bits set and select hexadecimal float output instead.
  stream.setf(std::ios::fixed, std::ios::floatfield);
  stream.precision(7);
}
00019 
00020 int main(int argc, char* argv[])
00021 {
00022   cerr << "Starting...\n";
00023 
00024   assert(argc == 6);
00025   char* &filePathTarget = argv[1];
00026   char* &filePathSource = argv[2];
00027   char* &filePathAlign  = argv[3];
00028   char* &filePathLexS2T = argv[4];
00029   char* &filePathLexT2S = argv[5];
00030 
00031   Moses::InputFileStream streamTarget(filePathTarget);
00032   Moses::InputFileStream streamSource(filePathSource);
00033   Moses::InputFileStream streamAlign(filePathAlign);
00034 
00035   ofstream streamLexS2T;
00036   ofstream streamLexT2S;
00037   streamLexS2T.open(filePathLexS2T);
00038   streamLexT2S.open(filePathLexT2S);
00039 
00040   fix(streamLexS2T);
00041   fix(streamLexT2S);
00042 
00043   ExtractLex extractSingleton;
00044 
00045   size_t lineCount = 0;
00046   string lineTarget, lineSource, lineAlign;
00047   while (getline(streamTarget, lineTarget)) {
00048     if (lineCount % 10000 == 0)
00049       cerr << lineCount << " ";
00050 
00051     istream &isSource = getline(streamSource, lineSource);
00052     assert(isSource);
00053     istream &isAlign = getline(streamAlign, lineAlign);
00054     assert(isAlign);
00055 
00056     vector<string> toksTarget, toksSource, toksAlign;
00057     Moses::Tokenize(toksTarget, lineTarget);
00058     Moses::Tokenize(toksSource, lineSource);
00059     Moses::Tokenize(toksAlign, lineAlign);
00060 
00061     /*
00062     cerr  << endl
00063           << toksTarget.size() << " " << lineTarget << endl
00064           << toksSource.size() << " " << lineSource << endl
00065           << toksAlign.size() << " " << lineAlign << endl;
00066     */
00067 
00068     extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount);
00069 
00070     ++lineCount;
00071   }
00072 
00073   extractSingleton.Output(streamLexS2T, streamLexT2S);
00074 
00075   streamTarget.Close();
00076   streamSource.Close();
00077   streamAlign.Close();
00078   streamLexS2T.close();
00079   streamLexT2S.close();
00080 
00081   cerr << "\nFinished\n";
00082 }
00083 
00084 namespace MosesTraining
00085 {
00086 
00087 const std::string *Vocab::GetOrAdd(const std::string &word)
00088 {
00089   const string *ret = &(*m_coll.insert(word).first);
00090   return ret;
00091 }
00092 
00093 void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign, size_t lineCount)
00094 {
00095   std::vector<bool> m_sourceAligned(toksSource.size(), false)
00096   , m_targetAligned(toksTarget.size(), false);
00097 
00098   vector<string>::const_iterator iterAlign;
00099   for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) {
00100     const string &alignTok = *iterAlign;
00101 
00102     vector<size_t> alignPos;
00103     Moses::Tokenize(alignPos, alignTok, "-");
00104     assert(alignPos.size() == 2);
00105 
00106     if (alignPos[0] >= toksSource.size()) {
00107       cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
00108       continue;
00109     }
00110     if (alignPos[1] >= toksTarget.size()) {
00111       cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
00112       continue;
00113     }
00114 
00115     assert(alignPos[0] < toksSource.size());
00116     assert(alignPos[1] < toksTarget.size());
00117 
00118     m_sourceAligned[ alignPos[0] ] = true;
00119     m_targetAligned[ alignPos[1] ] = true;
00120 
00121     const string &tmpSource = toksSource[ alignPos[0] ];
00122     const string &tmpTarget = toksTarget[ alignPos[1] ];
00123 
00124     const string *source = m_vocab.GetOrAdd(tmpSource);
00125     const string *target = m_vocab.GetOrAdd(tmpTarget);
00126 
00127     Process(target, source);
00128 
00129   }
00130 
00131   ProcessUnaligned(toksTarget, toksSource, m_sourceAligned, m_targetAligned);
00132 }
00133 
00134 void ExtractLex::Process(const std::string *target, const std::string *source)
00135 {
00136   WordCount &wcS2T = m_collS2T[source];
00137   WordCount &wcT2S = m_collT2S[target];
00138 
00139   wcS2T.AddCount(COUNT_INCR);
00140   wcT2S.AddCount(COUNT_INCR);
00141 
00142   Process(wcS2T, target);
00143   Process(wcT2S, source);
00144 }
00145 
00146 void ExtractLex::Process(WordCount &wcIn, const std::string *out)
00147 {
00148   std::map<const std::string*, WordCount> &collOut = wcIn.GetColl();
00149   WordCount &wcOut = collOut[out];
00150   wcOut.AddCount(COUNT_INCR);
00151 }
00152 
00153 void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &toksSource
00154                                   , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
00155 {
00156   const string *nullWord = m_vocab.GetOrAdd("NULL");
00157 
00158   for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos) {
00159     bool isAlignedCurr = m_sourceAligned[pos];
00160     if (!isAlignedCurr) {
00161       const string &tmpWord = toksSource[pos];
00162       const string *sourceWord = m_vocab.GetOrAdd(tmpWord);
00163 
00164       Process(nullWord, sourceWord);
00165     }
00166   }
00167 
00168   for (size_t pos = 0; pos < m_targetAligned.size(); ++pos) {
00169     bool isAlignedCurr = m_targetAligned[pos];
00170     if (!isAlignedCurr) {
00171       const string &tmpWord = toksTarget[pos];
00172       const string *targetWord = m_vocab.GetOrAdd(tmpWord);
00173 
00174       Process(targetWord, nullWord);
00175     }
00176   }
00177 
00178 }
00179 
00180 void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
00181 {
00182   Output(m_collS2T, streamLexS2T);
00183   Output(m_collT2S, streamLexT2S);
00184 }
00185 
00186 void ExtractLex::Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream)
00187 {
00188   std::map<const std::string*, WordCount>::const_iterator iterOuter;
00189   for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) {
00190     const string &inStr = *iterOuter->first;
00191     const WordCount &inWC = iterOuter->second;
00192 
00193     const std::map<const std::string*, WordCount> &outColl = inWC.GetColl();
00194 
00195     std::map<const std::string*, WordCount>::const_iterator iterInner;
00196     for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner) {
00197       const string &outStr = *iterInner->first;
00198       const WordCount &outWC = iterInner->second;
00199 
00200       float prob = outWC.GetCount() / inWC.GetCount();
00201       outStream << outStr << " "  << inStr << " " << prob << endl;
00202     }
00203   }
00204 }
00205 
00206 std::ostream& operator<<(std::ostream &out, const WordCount &obj)
00207 {
00208   out << "(" << obj.GetCount() << ")";
00209   return out;
00210 }
00211 
00212 void WordCount::AddCount(float incr)
00213 {
00214   m_count += incr;
00215 }
00216 
00217 } // namespace
00218