00001 #include <iostream>
00002 #include <fstream>
00003 #include <cassert>
00004 #include <vector>
00005 #include "extract-lex.h"
00006 #include "InputFileStream.h"
00007 #include "moses/Util.h"
00008
00009 using namespace std;
00010 using namespace MosesTraining;
00011
// Amount added to a count each time a word or word pair is observed
// (passed to WordCount::AddCount by the ExtractLex::Process overloads).
float COUNT_INCR = 1.0f;
00013
/**
 * Puts a stream into fixed-point floating-point notation with 7 digits
 * after the decimal point.
 *
 * @param stream  Stream whose formatting state is modified in place.
 */
void fix(std::ostream& stream)
{
  // Pass the floatfield mask so that a previously set 'scientific' flag is
  // cleared; OR-ing 'fixed' on top of it would leave both bits set, which
  // selects hexadecimal float output instead of fixed notation.
  stream.setf(std::ios::fixed, std::ios::floatfield);
  stream.precision(7);
}
00019
00020 int main(int argc, char* argv[])
00021 {
00022 cerr << "Starting...\n";
00023
00024 assert(argc == 6);
00025 char* &filePathTarget = argv[1];
00026 char* &filePathSource = argv[2];
00027 char* &filePathAlign = argv[3];
00028 char* &filePathLexS2T = argv[4];
00029 char* &filePathLexT2S = argv[5];
00030
00031 Moses::InputFileStream streamTarget(filePathTarget);
00032 Moses::InputFileStream streamSource(filePathSource);
00033 Moses::InputFileStream streamAlign(filePathAlign);
00034
00035 ofstream streamLexS2T;
00036 ofstream streamLexT2S;
00037 streamLexS2T.open(filePathLexS2T);
00038 streamLexT2S.open(filePathLexT2S);
00039
00040 fix(streamLexS2T);
00041 fix(streamLexT2S);
00042
00043 ExtractLex extractSingleton;
00044
00045 size_t lineCount = 0;
00046 string lineTarget, lineSource, lineAlign;
00047 while (getline(streamTarget, lineTarget)) {
00048 if (lineCount % 10000 == 0)
00049 cerr << lineCount << " ";
00050
00051 istream &isSource = getline(streamSource, lineSource);
00052 assert(isSource);
00053 istream &isAlign = getline(streamAlign, lineAlign);
00054 assert(isAlign);
00055
00056 vector<string> toksTarget, toksSource, toksAlign;
00057 Moses::Tokenize(toksTarget, lineTarget);
00058 Moses::Tokenize(toksSource, lineSource);
00059 Moses::Tokenize(toksAlign, lineAlign);
00060
00061
00062
00063
00064
00065
00066
00067
00068 extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount);
00069
00070 ++lineCount;
00071 }
00072
00073 extractSingleton.Output(streamLexS2T, streamLexT2S);
00074
00075 streamTarget.Close();
00076 streamSource.Close();
00077 streamAlign.Close();
00078 streamLexS2T.close();
00079 streamLexT2S.close();
00080
00081 cerr << "\nFinished\n";
00082 }
00083
00084 namespace MosesTraining
00085 {
00086
00087 const std::string *Vocab::GetOrAdd(const std::string &word)
00088 {
00089 const string *ret = &(*m_coll.insert(word).first);
00090 return ret;
00091 }
00092
00093 void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign, size_t lineCount)
00094 {
00095 std::vector<bool> m_sourceAligned(toksSource.size(), false)
00096 , m_targetAligned(toksTarget.size(), false);
00097
00098 vector<string>::const_iterator iterAlign;
00099 for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign) {
00100 const string &alignTok = *iterAlign;
00101
00102 vector<size_t> alignPos;
00103 Moses::Tokenize(alignPos, alignTok, "-");
00104 assert(alignPos.size() == 2);
00105
00106 if (alignPos[0] >= toksSource.size()) {
00107 cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
00108 continue;
00109 }
00110 if (alignPos[1] >= toksTarget.size()) {
00111 cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
00112 continue;
00113 }
00114
00115 assert(alignPos[0] < toksSource.size());
00116 assert(alignPos[1] < toksTarget.size());
00117
00118 m_sourceAligned[ alignPos[0] ] = true;
00119 m_targetAligned[ alignPos[1] ] = true;
00120
00121 const string &tmpSource = toksSource[ alignPos[0] ];
00122 const string &tmpTarget = toksTarget[ alignPos[1] ];
00123
00124 const string *source = m_vocab.GetOrAdd(tmpSource);
00125 const string *target = m_vocab.GetOrAdd(tmpTarget);
00126
00127 Process(target, source);
00128
00129 }
00130
00131 ProcessUnaligned(toksTarget, toksSource, m_sourceAligned, m_targetAligned);
00132 }
00133
00134 void ExtractLex::Process(const std::string *target, const std::string *source)
00135 {
00136 WordCount &wcS2T = m_collS2T[source];
00137 WordCount &wcT2S = m_collT2S[target];
00138
00139 wcS2T.AddCount(COUNT_INCR);
00140 wcT2S.AddCount(COUNT_INCR);
00141
00142 Process(wcS2T, target);
00143 Process(wcT2S, source);
00144 }
00145
00146 void ExtractLex::Process(WordCount &wcIn, const std::string *out)
00147 {
00148 std::map<const std::string*, WordCount> &collOut = wcIn.GetColl();
00149 WordCount &wcOut = collOut[out];
00150 wcOut.AddCount(COUNT_INCR);
00151 }
00152
00153 void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &toksSource
00154 , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
00155 {
00156 const string *nullWord = m_vocab.GetOrAdd("NULL");
00157
00158 for (size_t pos = 0; pos < m_sourceAligned.size(); ++pos) {
00159 bool isAlignedCurr = m_sourceAligned[pos];
00160 if (!isAlignedCurr) {
00161 const string &tmpWord = toksSource[pos];
00162 const string *sourceWord = m_vocab.GetOrAdd(tmpWord);
00163
00164 Process(nullWord, sourceWord);
00165 }
00166 }
00167
00168 for (size_t pos = 0; pos < m_targetAligned.size(); ++pos) {
00169 bool isAlignedCurr = m_targetAligned[pos];
00170 if (!isAlignedCurr) {
00171 const string &tmpWord = toksTarget[pos];
00172 const string *targetWord = m_vocab.GetOrAdd(tmpWord);
00173
00174 Process(targetWord, nullWord);
00175 }
00176 }
00177
00178 }
00179
00180 void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
00181 {
00182 Output(m_collS2T, streamLexS2T);
00183 Output(m_collT2S, streamLexT2S);
00184 }
00185
00186 void ExtractLex::Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream)
00187 {
00188 std::map<const std::string*, WordCount>::const_iterator iterOuter;
00189 for (iterOuter = coll.begin(); iterOuter != coll.end(); ++iterOuter) {
00190 const string &inStr = *iterOuter->first;
00191 const WordCount &inWC = iterOuter->second;
00192
00193 const std::map<const std::string*, WordCount> &outColl = inWC.GetColl();
00194
00195 std::map<const std::string*, WordCount>::const_iterator iterInner;
00196 for (iterInner = outColl.begin(); iterInner != outColl.end(); ++iterInner) {
00197 const string &outStr = *iterInner->first;
00198 const WordCount &outWC = iterInner->second;
00199
00200 float prob = outWC.GetCount() / inWC.GetCount();
00201 outStream << outStr << " " << inStr << " " << prob << endl;
00202 }
00203 }
00204 }
00205
00206 std::ostream& operator<<(std::ostream &out, const WordCount &obj)
00207 {
00208 out << "(" << obj.GetCount() << ")";
00209 return out;
00210 }
00211
00212 void WordCount::AddCount(float incr)
00213 {
00214 m_count += incr;
00215 }
00216
00217 }
00218