00001 #include "ForestParser.h"
00002
00003 #include <istream>
00004 #include <string>
00005
00006 #include <boost/make_shared.hpp>
00007
00008 #include "util/tokenize_piece.hh"
00009
00010 #include "syntax-common/exception.h"
00011
00012 namespace MosesTraining
00013 {
00014 namespace Syntax
00015 {
00016 namespace PostprocessEgretForests
00017 {
00018
00019 ForestParser::ForestParser()
00020 : m_input(0)
00021 {
00022 }
00023
00024 ForestParser::ForestParser(std::istream &input)
00025 : m_input(&input)
00026 {
00027 ++(*this);
00028 }
00029
00030 ForestParser &ForestParser::operator++()
00031 {
00032 if (!m_input) {
00033 return *this;
00034 }
00035 m_vertexSet.clear();
00036 m_entry.forest.vertices.clear();
00037 if (!std::getline(*m_input, m_tmpLine)) {
00038 m_input = 0;
00039 return *this;
00040 }
00041
00042 ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
00043
00044 std::getline(*m_input, m_entry.sentence);
00045
00046
00047 std::getline(*m_input, m_tmpLine);
00048 if (m_tmpLine == "") {
00049 std::getline(*m_input, m_tmpLine);
00050 assert(m_tmpLine == "");
00051 return *this;
00052 }
00053 while (m_tmpLine != "") {
00054 ParseHyperedgeLine(m_tmpLine, m_entry.forest);
00055 std::getline(*m_input, m_tmpLine);
00056 }
00057 return *this;
00058 }
00059
00060 boost::shared_ptr<Forest::Vertex> ForestParser::AddVertex(const VertexSP &v)
00061 {
00062 std::pair<VertexSet::iterator, bool> ret = m_vertexSet.insert(v);
00063 if (ret.second) {
00064 m_entry.forest.vertices.push_back(*ret.first);
00065 }
00066 return *ret.first;
00067 }
00068
00069 void ForestParser::ParseSentenceNumLine(const std::string &line,
00070 std::size_t &sentNum)
00071 {
00072 const util::AnyCharacter delimiter(" \t");
00073 util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
00074 if (*p != "sentence") {
00075
00076 throw Exception("");
00077 }
00078 ++p;
00079 std::string tmp;
00080 p->CopyToString(&tmp);
00081 sentNum = std::atoi(tmp.c_str());
00082 }
00083
00084 void ForestParser::ParseHyperedgeLine(const std::string &line, Forest &forest)
00085 {
00086 const util::AnyCharacter delimiter(" \t");
00087 util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
00088 VertexSP v = AddVertex(ParseVertex(*p));
00089 HyperedgeSP e = boost::make_shared<Forest::Hyperedge>();
00090 e->head = v.get();
00091 ++p;
00092 if (*p != "=>") {
00093
00094 throw Exception("");
00095 }
00096 for (++p; *p != "|||"; ++p) {
00097 v = ParseVertex(*p);
00098 if (v->start == -1) {
00099
00100 v->start = v->end = e->head->start;
00101 }
00102 e->tail.push_back(AddVertex(v).get());
00103 }
00104 ++p;
00105 std::string tmp;
00106 p->CopyToString(&tmp);
00107 e->weight = std::atof(tmp.c_str());
00108 e->head->incoming.push_back(e);
00109 }
00110
00111 boost::shared_ptr<Forest::Vertex> ForestParser::ParseVertex(
00112 const StringPiece &s)
00113 {
00114 VertexSP v = boost::make_shared<Forest::Vertex>();
00115 std::size_t pos = s.rfind('[');
00116 if (pos == std::string::npos) {
00117 s.CopyToString(&v->symbol.value);
00118 v->symbol.isNonTerminal = false;
00119 v->start = v->end = -1;
00120 return v;
00121 }
00122 if (pos > 2 && s[pos-2] == '^' && s[pos-1] == 'g') {
00123 s.substr(0, pos-2).CopyToString(&v->symbol.value);
00124 } else {
00125 s.substr(0, pos).CopyToString(&v->symbol.value);
00126 }
00127 v->symbol.isNonTerminal = true;
00128 std::size_t begin = pos + 1;
00129 pos = s.find(',', begin+1);
00130 std::string tmp;
00131 s.substr(begin, pos-begin).CopyToString(&tmp);
00132 v->start = std::atoi(tmp.c_str());
00133 s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
00134 v->end = std::atoi(tmp.c_str());
00135 return v;
00136 }
00137
00138 bool operator==(const ForestParser &lhs, const ForestParser &rhs)
00139 {
00140
00141 return lhs.m_input == rhs.m_input;
00142 }
00143
00144 bool operator!=(const ForestParser &lhs, const ForestParser &rhs)
00145 {
00146 return !(lhs == rhs);
00147 }
00148
00149 }
00150 }
00151 }