00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #include <sstream>
00011 #include "PhraseAlignment.h"
00012 #include "SafeGetline.h"
00013 #include "tables-core.h"
00014 #include "score.h"
00015
00016 #include <cstdlib>
00017
00018 using namespace std;
00019
00020 namespace MosesTraining
00021 {
00022
00023 extern Vocabulary vcbT;
00024 extern Vocabulary vcbS;
00025
00026 extern bool hierarchicalFlag;
00027
//! Parse a single value of type T from the beginning of a string.
//!
//! The value is extracted with operator>> from a std::stringstream built on
//! \p input, so leading whitespace is skipped and trailing characters are
//! ignored.
//!
//! \param input text to parse
//! \return the parsed value, or a value-initialised T (0 for arithmetic
//!         types, empty for std::string) when nothing could be extracted
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  // Value-initialise: when the input is empty or all whitespace the stream
  // sentry fails and operator>> never writes to its argument, so a plain
  // "T ret;" would be returned with an indeterminate value.
  T ret = T();
  stream >> ret;
  return ret;
}
00037
00038
//! Convert every string in \p input to a T and store the results in \p output.
//!
//! \p output is resized to input.size(); element k of the result is
//! Scan<T>(input[k]). Any previous contents of \p output are overwritten.
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  output.resize(input.size());

  // Walk both containers in lock-step: one parsed value per input string.
  typename std::vector<T>::iterator dst = output.begin();
  for (std::vector< std::string >::const_iterator src = input.begin();
       src != input.end(); ++src, ++dst) {
    *dst = Scan<T>( *src );
  }
}
00048
00049
//! Split \p str at any of the characters in \p delimiters and append the
//! resulting non-empty tokens to \p output.
//!
//! Runs of consecutive delimiters are treated as a single separator, and
//! leading/trailing delimiters produce no empty tokens. \p output is not
//! cleared first; new tokens are added after any existing elements.
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // Start of the first token (npos if the string holds no token at all).
  std::string::size_type tokenBegin = str.find_first_not_of(delimiters, 0);

  while (tokenBegin != std::string::npos) {
    // First delimiter after the token, or npos when the token runs to the end.
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenBegin);

    if (tokenEnd == std::string::npos) {
      // Last token: take everything that is left and stop.
      output.push_back(str.substr(tokenBegin));
      break;
    }

    output.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));

    // Skip the delimiter run to reach the next token, if any.
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }
}
00068
00069
00070 template<typename T>
00071 inline void Tokenize( std::vector<T> &output
00072 , const std::string &input
00073 , const std::string& delimiters = " \t")
00074 {
00075 std::vector<std::string> stringVector;
00076 Tokenize(stringVector, input, delimiters);
00077 return Scan<T>(output, stringVector );
00078 }
00079
00080
00081 void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
00082 {
00083 assert(phraseS.empty());
00084 assert(phraseT.empty());
00085
00086 vector< string > token = tokenize( line );
00087 int item = 1;
00088 for (size_t j=0; j<token.size(); j++) {
00089 if (token[j] == "|||") item++;
00090 else if (item == 1) {
00091 phraseS.push_back( vcbS.storeIfNew( token[j] ) );
00092 }
00093
00094 else if (item == 2) {
00095 phraseT.push_back( vcbT.storeIfNew( token[j] ) );
00096 } else if (item == 3) {
00097 int s,t;
00098 sscanf(token[j].c_str(), "%d-%d", &s, &t);
00099 if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
00100 cerr << "WARNING: phrase pair " << lineID
00101 << " has alignment point (" << s << ", " << t
00102 << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
00103 } else {
00104
00105 createAlignVec(phraseS.size(), phraseT.size());
00106
00107
00108 alignedToT[t].insert( s );
00109 alignedToS[s].insert( t );
00110 }
00111 } else if (includeSentenceIdFlag && item == 4) {
00112 sscanf(token[j].c_str(), "%d", &sentenceId);
00113 } else if (item + (includeSentenceIdFlag?-1:0) == 4) {
00114 sscanf(token[j].c_str(), "%f", &count);
00115 } else if (item + (includeSentenceIdFlag?-1:0) == 5) {
00116 addNTLength(token[j]);
00117 } else if (item + (includeSentenceIdFlag?-1:0) == 6) {
00118 float pcfgScore = std::atof(token[j].c_str());
00119 pcfgSum = pcfgScore * count;
00120 }
00121 }
00122
00123 createAlignVec(phraseS.size(), phraseT.size());
00124
00125 if (item + (includeSentenceIdFlag?-1:0) == 3) {
00126 count = 1.0;
00127 }
00128 if (item < 3 || item > 6) {
00129 cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
00130 }
00131 }
00132
00133 void PhraseAlignment::addNTLength(const std::string &tok)
00134 {
00135 vector< string > tokens;
00136
00137 Tokenize(tokens, tok, "=");
00138 assert(tokens.size() == 2);
00139
00140 size_t sourcePos = Scan<size_t>(tokens[0]);
00141 assert(sourcePos < phraseS.size());
00142
00143 vector< size_t > ntLengths;
00144 Tokenize<size_t>(ntLengths, tokens[1], ",");
00145 assert(ntLengths.size() == 2);
00146
00147 m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
00148 }
00149
00150 void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
00151 {
00152
00153 if (alignedToT.size() == 0) {
00154 size_t numTgtSymbols = (hierarchicalFlag ? targetSize-1 : targetSize);
00155 alignedToT.resize(numTgtSymbols);
00156 }
00157
00158 if (alignedToS.size() == 0) {
00159 size_t numSrcSymbols = (hierarchicalFlag ? sourceSize-1 : sourceSize);
00160 alignedToS.resize(numSrcSymbols);
00161 }
00162 }
00163
00164 void PhraseAlignment::clear()
00165 {
00166 phraseS.clear();
00167 phraseT.clear();
00168 alignedToT.clear();
00169 alignedToS.clear();
00170 }
00171
00172
00173 bool PhraseAlignment::equals( const PhraseAlignment& other )
00174 {
00175 if (this == &other) return true;
00176 if (other.GetTarget() != GetTarget()) return false;
00177 if (other.GetSource() != GetSource()) return false;
00178 if (other.alignedToT != alignedToT) return false;
00179 if (other.alignedToS != alignedToS) return false;
00180 return true;
00181 }
00182
00183
00184
00185 bool PhraseAlignment::match( const PhraseAlignment& other )
00186 {
00187 if (this == &other) return true;
00188 if (other.GetTarget() != GetTarget()) return false;
00189 if (other.GetSource() != GetSource()) return false;
00190 if (!hierarchicalFlag) return true;
00191
00192 assert(phraseT.size() == alignedToT.size() + 1);
00193 assert(alignedToT.size() == other.alignedToT.size());
00194
00195
00196 for(size_t i=0; i<phraseT.size()-1; i++) {
00197 if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
00198 if (alignedToT[i].size() != 1 ||
00199 other.alignedToT[i].size() != 1 ||
00200 *(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
00201 return false;
00202 }
00203 }
00204 return true;
00205 }
00206
00207 int PhraseAlignment::Compare(const PhraseAlignment &other) const
00208 {
00209 if (this == &other)
00210 return 0;
00211
00212 if (GetTarget() != other.GetTarget())
00213 return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
00214
00215 if (GetSource() != other.GetSource())
00216 return ( GetSource() < other.GetSource() ) ? -1 : +1;
00217
00218 if (!hierarchicalFlag)
00219 return 0;
00220
00221
00222 for(size_t i=0; i<phraseT.size()-1; i++) {
00223 if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
00224 size_t thisAlign = *(alignedToT[i].begin());
00225 size_t otherAlign = *(other.alignedToT[i].begin());
00226
00227 if (alignedToT[i].size() != 1 ||
00228 other.alignedToT[i].size() != 1 ||
00229 thisAlign != otherAlign) {
00230 int ret = (thisAlign < otherAlign) ? -1 : +1;
00231 return ret;
00232 }
00233 }
00234 }
00235 return 0;
00236
00237 }
00238
00239 }
00240