00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #include <iostream>
00020 #include <set>
00021
00022 #include <boost/lexical_cast.hpp>
00023
00024 #include "util/double-conversion/double-conversion.h"
00025 #include "util/string_piece.hh"
00026 #include "util/tokenize_piece.hh"
00027
00028 #include "Hypergraph.h"
00029
00030 using namespace std;
00031 static const string kBOS = "<s>";
00032 static const string kEOS = "</s>";
00033
00034 namespace MosesTuning
00035 {
00036
00037 StringPiece NextLine(util::FilePiece& from)
00038 {
00039 StringPiece line;
00040 while ((line = from.ReadLine()).starts_with("#"));
00041 return line;
00042 }
00043
00044 Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS))
00045 {
00046 }
00047
00048 const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str)
00049 {
00050 #if BOOST_VERSION >= 104200
00051 Map::const_iterator i= map_.find(str, Hash(), Equals());
00052 #else
00053 std::string copied_str(str.data(), str.size());
00054 Map::const_iterator i = map_.find(copied_str.c_str());
00055 #endif
00056 if (i != map_.end()) return *i;
00057 char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1));
00058 memcpy(copied, str.data(), str.size());
00059 copied[str.size()] = 0;
00060 return *map_.insert(Entry(copied, map_.size())).first;
00061 }
00062
00063 double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
00064
00065
00069 static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph)
00070 {
00071 Edge* edge = graph.NewEdge();
00072 StringPiece line = from.ReadLine();
00073 util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| "));
00074
00075 for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
00076 StringPiece got = *i;
00077 if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) {
00078
00079 char *end_ptr;
00080 unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10);
00081 UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got);
00082 UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?");
00083 edge->AddWord(NULL);
00084 edge->AddChild(child);
00085 } else {
00086 const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got);
00087 edge->AddWord(&found);
00088 }
00089 }
00090
00091
00092 ++pipes;
00093 for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) {
00094 StringPiece fv = *i;
00095 if (!fv.size()) break;
00096 size_t equals = fv.find_last_of("=");
00097 UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'");
00098 StringPiece name = fv.substr(0,equals);
00099 StringPiece value = fv.substr(equals+1);
00100 int processed;
00101 float score = converter.StringToFloat(value.data(), value.length(), &processed);
00102 UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'");
00103 edge->AddFeature(name,score);
00104 }
00105
00106 ++pipes;
00107 size_t sourceCovered = boost::lexical_cast<size_t>(*pipes);
00108 return pair<Edge*,size_t>(edge,sourceCovered);
00109 }
00110
00111 void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const
00112 {
00113
00114 Graph& newGraph = *pNewGraph;
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138 map<const Edge*, FeatureStatsType> edgeBackwardScores;
00139 map<const Edge*, size_t> edgeHeads;
00140 vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore);
00141 vector<vector<const Edge*> > outgoing(vertices_.Size());
00142
00143
00144 for (size_t vi = 0; vi < vertices_.Size(); ++vi) {
00145
00146 const Vertex& vertex = vertices_[vi];
00147 const vector<const Edge*>& incoming = vertex.GetIncoming();
00148 if (!incoming.size()) {
00149 vertexBackwardScores[vi] = 0;
00150 } else {
00151 for (size_t ei = 0; ei < incoming.size(); ++ei) {
00152
00153 edgeHeads[incoming[ei]]= vi;
00154 FeatureStatsType incomingScore = incoming[ei]->GetScore(weights);
00155 for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) {
00156
00157 size_t childId = incoming[ei]->Children()[i];
00158 UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore,
00159 HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId);
00160 outgoing[childId].push_back(incoming[ei]);
00161 incomingScore += vertexBackwardScores[childId];
00162 }
00163 edgeBackwardScores[incoming[ei]]= incomingScore;
00164
00165 if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore;
00166 }
00167 }
00168 }
00169
00170
00171 vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore);
00172 map<const Edge*, FeatureStatsType> edgeForwardScores;
00173 for (size_t i = 1; i <= vertices_.Size(); ++i) {
00174 size_t vi = vertices_.Size() - i;
00175
00176 if (!outgoing[vi].size()) {
00177 vertexForwardScores[vi] = 0;
00178 } else {
00179 for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) {
00180
00181 FeatureStatsType outgoingScore = 0;
00182
00183 outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]];
00184
00185 edgeForwardScores[outgoing[vi][ei]] = outgoingScore;
00186
00187 for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) {
00188 size_t siblingId = outgoing[vi][ei]->Children()[i];
00189 if (siblingId != vi) {
00190
00191 outgoingScore += vertexBackwardScores[siblingId];
00192 }
00193 }
00194 outgoingScore += outgoing[vi][ei]->GetScore(weights);
00195 if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore;
00196
00197 }
00198 }
00199 }
00200
00201
00202
00203 multimap<FeatureStatsType, const Edge*> edgeScores;
00204 for (size_t i = 0; i < edges_.Size(); ++i) {
00205 const Edge* edge = &(edges_[i]);
00206 if (edgeForwardScores.find(edge) == edgeForwardScores.end()) {
00207
00208
00209 edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]];
00210 }
00211 FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge];
00212 edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge));
00213
00214 }
00215
00216
00217
00218 multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin();
00219 size_t edgeCount = 1;
00220 while(edgeCount < minEdgeCount && ei != edgeScores.rend()) {
00221 ++ei;
00222 ++edgeCount;
00223 }
00224 multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin();
00225 if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first);
00226
00227
00228 set<size_t> retainedVertices;
00229 set<const Edge*> retainedEdges;
00230 for (; lowest != edgeScores.end(); ++lowest) {
00231
00232 retainedEdges.insert(lowest->second);
00233 retainedVertices.insert(edgeHeads[lowest->second]);
00234 for (size_t i = 0; i < lowest->second->Children().size(); ++i) {
00235 retainedVertices.insert(lowest->second->Children()[i]);
00236 }
00237 }
00238 newGraph.SetCounts(retainedVertices.size(), retainedEdges.size());
00239
00240
00241 map<size_t,size_t> oldIdToNew;
00242 size_t vi = 0;
00243 for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) {
00244
00245 oldIdToNew[*i] = vi;
00246 Vertex* vertex = newGraph.NewVertex();
00247 vertex->SetSourceCovered(vertices_[*i].SourceCovered());
00248 }
00249
00250 for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) {
00251 Edge* newEdge = newGraph.NewEdge();
00252 const Edge* oldEdge = *i;
00253 for (size_t j = 0; j < oldEdge->Words().size(); ++j) {
00254 newEdge->AddWord(oldEdge->Words()[j]);
00255 }
00256 for (size_t j = 0; j < oldEdge->Children().size(); ++j) {
00257 newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]);
00258 }
00259 newEdge->SetFeatures(oldEdge->Features());
00260 Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]];
00261 newHead.AddEdge(newEdge);
00262 }
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284
00285
00286
00287 }
00288
00292 void ReadGraph(util::FilePiece &from, Graph &graph)
00293 {
00294
00295
00296 StringPiece line = from.ReadLine();
00297 UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'");
00298 line = NextLine(from);
00299
00300
00301 util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' '));
00302 unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i);
00303 ++i;
00304 unsigned long int edges = boost::lexical_cast<unsigned long int>(*i);
00305 graph.SetCounts(vertices, edges);
00306
00307 for (size_t i = 0; i < vertices; ++i) {
00308 line = NextLine(from);
00309 unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line);
00310 Vertex* vertex = graph.NewVertex();
00311 for (unsigned long int e = 0; e < edge_count; ++e) {
00312 pair<Edge*,size_t> edge = ReadEdge(from, graph);
00313 vertex->AddEdge(edge.first);
00314
00315
00316 if (!e) {
00317 vertex->SetSourceCovered(edge.second);
00318 }
00319 }
00320 }
00321 }
00322
00323 };