00001
00002
00003 #include "TreeInput.h"
00004 #include "StaticData.h"
00005 #include "Util.h"
00006 #include "XmlOption.h"
00007
00008 using namespace std;
00009
00010 namespace Moses
00011 {
00012
00023 bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels)
00024 {
00025
00026
00027
00028 if (line.find_first_of('<') == string::npos) {
00029 return true;
00030 }
00031
00032
00033
00034 vector<string> xmlTokens = TokenizeXml(line);
00035
00036
00037
00038 typedef pair< string, pair< size_t, string > > OpenedTag;
00039 vector< OpenedTag > tagStack;
00040
00041 string cleanLine;
00042 size_t wordPos = 0;
00043
00044
00045 for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
00046
00047 if(!isXmlTag(xmlTokens[xmlTokenPos])) {
00048
00049 if (cleanLine.size()>0 &&
00050 cleanLine[cleanLine.size() - 1] != ' ' &&
00051 xmlTokens[xmlTokenPos][0] != ' ') {
00052 cleanLine += " ";
00053 }
00054 cleanLine += xmlTokens[xmlTokenPos];
00055 wordPos = Tokenize(cleanLine).size();
00056 }
00057
00058
00059 else {
00060
00061
00062
00063 string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
00064 VERBOSE(3,"XML TAG IS: " << tag << std::endl);
00065
00066 if (tag.size() == 0) {
00067 TRACE_ERR("ERROR: empty tag name: " << line << endl);
00068 return false;
00069 }
00070
00071
00072 bool isUnary = ( tag[tag.size() - 1] == '/' );
00073
00074
00075 bool isClosed = ( tag[0] == '/' );
00076 bool isOpen = !isClosed;
00077
00078 if (isClosed && isUnary) {
00079 TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
00080 return false;
00081 }
00082
00083 if (isClosed)
00084 tag = tag.substr(1);
00085 if (isUnary)
00086 tag = tag.substr(0,tag.size()-1);
00087
00088
00089 string::size_type endOfName = tag.find_first_of(' ');
00090 string tagName = tag;
00091 string tagContent = "";
00092 if (endOfName != string::npos) {
00093 tagName = tag.substr(0,endOfName);
00094 tagContent = tag.substr(endOfName+1);
00095 }
00096
00097
00098
00099 if (isOpen || isUnary) {
00100
00101 OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
00102 tagStack.push_back( openedTag );
00103 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
00104 }
00105
00106
00107
00108 if (isClosed || isUnary) {
00109
00110 if (tagStack.size() == 0) {
00111 TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
00112 return false;
00113 }
00114 OpenedTag openedTag = tagStack.back();
00115 tagStack.pop_back();
00116
00117
00118 if (openedTag.first != tagName) {
00119 TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
00120 return false;
00121 }
00122
00123
00124 size_t startPos = openedTag.second.first;
00125 string tagContent = openedTag.second.second;
00126 size_t endPos = wordPos;
00127
00128
00129 string span = ParseXmlTagAttribute(tagContent,"span");
00130 if (! span.empty()) {
00131 vector<string> ij = Tokenize(span, "-");
00132 if (ij.size() != 1 && ij.size() != 2) {
00133 TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
00134 return false;
00135 }
00136 startPos = atoi(ij[0].c_str());
00137 if (ij.size() == 1) endPos = startPos + 1;
00138 else endPos = atoi(ij[1].c_str()) + 1;
00139 }
00140
00141 VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);
00142
00143 if (startPos >= endPos) {
00144 TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
00145 return false;
00146 }
00147
00148 WordsRange range(startPos,endPos-1);
00149
00150
00151 vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"label"), "||");
00152 CHECK(altTexts.size() == 1);
00153
00154 XMLParseOutput item(altTexts[0], range);
00155 sourceLabels.push_back(item);
00156 }
00157 }
00158 }
00159
00160 if (tagStack.size() > 0) {
00161 TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
00162 return false;
00163 }
00164
00165
00166 line = cleanLine;
00167 return true;
00168 }
00169
00171 int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
00172 {
00173 const StaticData &staticData = StaticData::Instance();
00174
00175 string line;
00176 if (getline(in, line, '\n').eof())
00177 return 0;
00178
00179
00180
00181 std::vector<XMLParseOutput> sourceLabels;
00182 ProcessAndStripXMLTags(line, sourceLabels);
00183
00184
00185 stringstream strme;
00186 strme << line << endl;
00187
00188 Sentence::Read(strme, factorOrder);
00189
00190
00191 size_t sourceSize = GetSize();
00192 m_sourceChart.resize(sourceSize);
00193
00194 for (size_t pos = 0; pos < sourceSize; ++pos) {
00195 m_sourceChart[pos].resize(sourceSize - pos);
00196 }
00197
00198
00199 vector<XMLParseOutput>::const_iterator iterLabel;
00200 for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) {
00201 const XMLParseOutput &labelItem = *iterLabel;
00202 const WordsRange &range = labelItem.m_range;
00203 const string &label = labelItem.m_label;
00204 AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
00205 }
00206
00207
00208 for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
00209 for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
00210 AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
00211 }
00212 }
00213
00214 return 1;
00215 }
00216
00218 void TreeInput::Print(std::ostream &out) const
00219 {
00220 out << *this << "\n";
00221 }
00222
00224 TranslationOptionCollection* TreeInput::CreateTranslationOptionCollection() const
00225 {
00226
00227 return NULL;
00228 }
00229
00230 void TreeInput::AddChartLabel(size_t startPos, size_t endPos, const Word &label
00231 , const std::vector<FactorType>& )
00232 {
00233 CHECK(label.IsNonTerminal());
00234
00235 SourceLabelOverlap overlapType = StaticData::Instance().GetSourceLabelOverlap();
00236 NonTerminalSet &list = GetLabelSet(startPos, endPos);
00237 switch (overlapType) {
00238 case SourceLabelOverlapAdd:
00239 list.insert(label);
00240 break;
00241 case SourceLabelOverlapReplace:
00242 if (list.size() > 0)
00243 list.clear();
00244 list.insert(label);
00245 break;
00246 case SourceLabelOverlapDiscard:
00247 if (list.size() == 0)
00248 list.insert(label);
00249 break;
00250 }
00251 }
00252
00253 void TreeInput::AddChartLabel(size_t startPos, size_t endPos, const string &label
00254 , const std::vector<FactorType>& factorOrder)
00255 {
00256 Word word(true);
00257 const Factor *factor = FactorCollection::Instance().AddFactor(Input, factorOrder[0], label);
00258 word.SetFactor(0, factor);
00259
00260 AddChartLabel(startPos, endPos, word, factorOrder);
00261 }
00262
00263 std::ostream& operator<<(std::ostream &out, const TreeInput &input)
00264 {
00265 out<< static_cast<Phrase const&>(input) << " ||| ";
00266
00267 size_t size = input.GetSize();
00268 for (size_t startPos = 0; startPos < size; ++startPos) {
00269 for (size_t endPos = startPos; endPos < size; ++endPos) {
00270 const NonTerminalSet &labelSet = input.GetLabelSet(startPos, endPos);
00271 NonTerminalSet::const_iterator iter;
00272 for (iter = labelSet.begin(); iter != labelSet.end(); ++iter) {
00273 const Word &word = *iter;
00274 out << "[" << startPos <<"," << endPos << "]="
00275 << word << "(" << word.IsNonTerminal() << ") ";
00276 CHECK(word.IsNonTerminal());
00277 }
00278 }
00279 }
00280
00281 return out;
00282 }
00283
00284
00285 }
00286