00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "XmlOption.h"
00024 #include <vector>
00025 #include <string>
00026 #include <iostream>
00027 #include "Util.h"
00028 #include "StaticData.h"
00029 #include "WordsRange.h"
00030 #include "TargetPhrase.h"
00031 #include "DummyScoreProducers.h"
00032
00033 namespace Moses
00034 {
00035 using namespace std;
00036
/**
 * Extract the value of a double-quoted attribute from the text of an XML tag.
 *
 * Finds the first occurrence of `attributeName="` in \p tag and returns
 * everything up to the matching closing quote. A quote that is directly
 * preceded by a backslash is treated as escaped and does not terminate the
 * value; the backslash itself is kept in the returned text.
 *
 * \param tag            tag text to search, e.g. `translation="the house" prob="0.8"`
 * \param attributeName  name of the attribute to look up
 * \return the raw attribute value; the empty string if the attribute is
 *         absent, has an empty value, or has no closing quote (the last case
 *         is also reported through TRACE_ERR)
 */
std::string ParseXmlTagAttribute(const std::string& tag, const std::string& attributeName)
{
  // the value starts right after  name="
  const std::string tagOpen = attributeName + "=\"";
  std::string::size_type contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();

  // Look for the closing quote starting AT the first value character.
  // Starting one character later would step over the closing quote of an
  // empty value (name="") and either report a bogus error or run on into
  // the next attribute of the tag.
  std::string::size_type contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    TRACE_ERR("Malformed XML attribute: "<< tag);
    return "";
  }

  // A quote preceded by a backslash is escaped: keep extending the value to
  // the next quote, if there is one. The contentsEnd > contentsStart guard
  // keeps the look-behind inside the value.
  std::string::size_type possibleEnd;
  while (contentsEnd > contentsStart &&
         tag[contentsEnd-1] == '\\' &&
         (possibleEnd = tag.find('"', contentsEnd+1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd-contentsStart);
}
00055
/**
 * Remove the enclosing brackets from an XML tag.
 *
 * \param str        candidate tag, e.g. `<b>`
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string
 * \return \p str without its leading \p lbrackStr and trailing \p rbrackStr
 *         when it carries both; otherwise \p str unchanged
 */
std::string TrimXml(const std::string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const std::string::size_type openLen = lbrackStr.length();
  const std::string::size_type closeLen = rbrackStr.length();
  const std::string::size_type total = str.size();

  // too short to hold both brackets: nothing to strip
  if (total < openLen + closeLen) return str;

  const bool startsWithOpen = str.compare(0, openLen, lbrackStr) == 0;
  const bool endsWithClose = str.compare(total - closeLen, closeLen, rbrackStr) == 0;
  if (!(startsWithOpen && endsWithClose)) return str;

  // keep only what sits between the two brackets
  return str.substr(openLen, total - openLen - closeLen);
}
00077
/**
 * Decide whether a token looks like an XML tag.
 *
 * A token counts as a tag when it begins with \p lbrackStr and the character
 * right after the bracket is either '/' or an ASCII letter (a-z, A-Z).
 *
 * \param tag        token to test
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string (not consulted by this check)
 * \return true if \p tag has the shape described above
 */
bool isXmlTag(const std::string& tag, const std::string& lbrackStr, const std::string& rbrackStr)
{
  const std::string::size_type nameStart = lbrackStr.length();

  // must open with the left bracket
  if (tag.compare(0, nameStart, lbrackStr) != 0) return false;

  // first character after the bracket; reading at tag.size() on a const
  // string yields '\0', which fails every test below
  const char lead = tag[nameStart];
  if (lead == '/') return true;
  if (lead >= 'a' && lead <= 'z') return true;
  return lead >= 'A' && lead <= 'Z';
}
00092
/**
 * Split a line into alternating pieces of plain text and bracketed tags.
 *
 * Example with "<" / ">": `this <b> is a </b> test .` becomes
 * (`this `), (`<b>`), (` is a `), (`</b>`), (` test .`).
 *
 * \param str        input line
 * \param lbrackStr  opening bracket string
 * \param rbrackStr  closing bracket string
 * \return the pieces in input order, whitespace preserved. If an opening
 *         bracket has no closing bracket, an error is reported through
 *         TRACE_ERR and only the pieces collected before that point are
 *         returned.
 */
std::vector<std::string> TokenizeXml(const std::string& str, const std::string& lbrackStr, const std::string& rbrackStr)
{
  std::vector<std::string> pieces;
  const std::string::size_type openLen = lbrackStr.length();
  const std::string::size_type closeLen = rbrackStr.length();

  std::string::size_type cursor = 0; // first character not yet emitted
  while (cursor != str.size()) {
    const std::string::size_type tagBegin = str.find(lbrackStr, cursor);

    // no further tag: the remainder is plain text
    if (tagBegin == std::string::npos) {
      pieces.push_back(str.substr(cursor));
      break;
    }

    // the search for the closing bracket starts on the last character of
    // the opening bracket
    const std::string::size_type tagClose = str.find(rbrackStr, tagBegin + openLen - 1);
    if (tagClose == std::string::npos) {
      TRACE_ERR("ERROR: malformed XML: " << str << std::endl);
      return pieces;
    }

    // plain text sitting in front of the tag, if any
    if (tagBegin != cursor) {
      pieces.push_back(str.substr(cursor, tagBegin - cursor));
    }

    // the tag itself, brackets included
    pieces.push_back(str.substr(tagBegin, tagClose - tagBegin + closeLen));
    cursor = tagClose + closeLen;
  }
  return pieces;
}
00140
00153 bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
00154 const std::string& lbrackStr, const std::string& rbrackStr)
00155 {
00156
00157
00158
00159
00160 if (line.find(lbrackStr) == string::npos) {
00161 return true;
00162 }
00163
00164
00165
00166 vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);
00167
00168
00169
00170 typedef pair< string, pair< size_t, string > > OpenedTag;
00171 vector< OpenedTag > tagStack;
00172
00173 string cleanLine;
00174 size_t wordPos = 0;
00175
00176 const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
00177 const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();
00178
00179
00180 for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00181
00182 if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
00183
00184 if (cleanLine.size()>0 &&
00185 cleanLine[cleanLine.size() - 1] != ' ' &&
00186 xmlTokens[xmlTokenPos][0] != ' ') {
00187 cleanLine += " ";
00188 }
00189 cleanLine += xmlTokens[xmlTokenPos];
00190 wordPos = Tokenize(cleanLine).size();
00191 }
00192
00193
00194 else {
00195
00196
00197
00198 string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
00199 VERBOSE(3,"XML TAG IS: " << tag << std::endl);
00200
00201 if (tag.size() == 0) {
00202 TRACE_ERR("ERROR: empty tag name: " << line << endl);
00203 return false;
00204 }
00205
00206
00207 bool isUnary = ( tag[tag.size() - 1] == '/' );
00208
00209
00210 bool isClosed = ( tag[0] == '/' );
00211 bool isOpen = !isClosed;
00212
00213 if (isClosed && isUnary) {
00214 TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
00215 return false;
00216 }
00217
00218 if (isClosed)
00219 tag = tag.substr(1);
00220 if (isUnary)
00221 tag = tag.substr(0,tag.size()-1);
00222
00223
00224 string::size_type endOfName = tag.find_first_of(' ');
00225 string tagName = tag;
00226 string tagContent = "";
00227 if (endOfName != string::npos) {
00228 tagName = tag.substr(0,endOfName);
00229 tagContent = tag.substr(endOfName+1);
00230 }
00231
00232
00233
00234 if (isOpen || isUnary) {
00235
00236 OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00237 tagStack.push_back( openedTag );
00238 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
00239 }
00240
00241
00242
00243 if (isClosed || isUnary) {
00244
00245 if (tagStack.size() == 0) {
00246 TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
00247 return false;
00248 }
00249 OpenedTag openedTag = tagStack.back();
00250 tagStack.pop_back();
00251
00252
00253 if (openedTag.first != tagName) {
00254 TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
00255 return false;
00256 }
00257
00258
00259 size_t startPos = openedTag.second.first;
00260 string tagContent = openedTag.second.second;
00261 size_t endPos = wordPos;
00262
00263
00264 string span = ParseXmlTagAttribute(tagContent,"span");
00265 if (! span.empty()) {
00266 vector<string> ij = Tokenize(span, "-");
00267 if (ij.size() != 1 && ij.size() != 2) {
00268 TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
00269 return false;
00270 }
00271 startPos = atoi(ij[0].c_str());
00272 if (ij.size() == 1) endPos = startPos + 1;
00273 else endPos = atoi(ij[1].c_str()) + 1;
00274 }
00275
00276 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
00277
00278
00279 if (tagName == "wall") {
00280 size_t start = (startPos == 0) ? 0 : startPos-1;
00281 for(size_t pos = start; pos < endPos; pos++)
00282 walls.push_back( pos );
00283 }
00284
00285
00286 else if (tagName == "zone") {
00287 if (startPos >= endPos) {
00288 TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
00289 return false;
00290 }
00291 reorderingConstraint.SetZone( startPos, endPos-1 );
00292 }
00293
00294
00295 else {
00296 if (startPos >= endPos) {
00297 TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
00298 return false;
00299 }
00300
00301
00302
00303 vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
00304 if( altTexts.size() == 1 && altTexts[0] == "" )
00305 altTexts.pop_back();
00306
00307 vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
00308 if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
00309 for(vector<string>::iterator translation=moreAltTexts.begin();
00310 translation != moreAltTexts.end();
00311 translation++) {
00312 string t = *translation;
00313 altTexts.push_back( t );
00314 }
00315 }
00316
00317
00318 vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
00319 if( altProbs.size() == 1 && altProbs[0] == "" )
00320 altProbs.pop_back();
00321
00322
00323 VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
00324 VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
00325 VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
00326 VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
00327 if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
00328 TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
00329 return false;
00330 }
00331
00332
00333 if (StaticData::Instance().GetXmlInputType() != XmlIgnore) {
00334
00335 for (size_t i=0; i<altTexts.size(); ++i) {
00336
00337 float probValue = 1;
00338 if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
00339
00340 float scoreValue = FloorScore(TransformScore(probValue));
00341
00342 WordsRange range(startPos,endPos-1);
00343 TargetPhrase targetPhrase;
00344 targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter);
00345
00346 targetPhrase.SetXMLScore(scoreValue);
00347 targetPhrase.Evaluate();
00348
00349 XmlOption *option = new XmlOption(range,targetPhrase);
00350 CHECK(option);
00351
00352 res.push_back(option);
00353 }
00354 altTexts.clear();
00355 altProbs.clear();
00356 }
00357 }
00358 }
00359 }
00360 }
00361
00362 if (tagStack.size() > 0) {
00363 TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
00364 return false;
00365 }
00366
00367
00368 line = cleanLine;
00369 return true;
00370 }
00371
00372 }