00001 #include <map>
00002 #include "StaticData.h"
00003 #include "WordLattice.h"
00004 #include "PCNTools.h"
00005 #include "Util.h"
00006 #include "FloydWarshall.h"
00007 #include "TranslationOptionCollectionLattice.h"
00008 #include "TranslationOptionCollectionConfusionNet.h"
00009 #include "moses/FF/InputFeature.h"
00010 #include "moses/TranslationTask.h"
00011
00012 namespace Moses
00013 {
00014 WordLattice::WordLattice(AllOptions::ptr const& opts) : ConfusionNet(opts)
00015 {
00016 UTIL_THROW_IF2(InputFeature::InstancePtr() == NULL,
00017 "Input feature must be specified");
00018 }
00019
00020 size_t WordLattice::GetColumnIncrement(size_t i, size_t j) const
00021 {
00022 return next_nodes[i][j];
00023 }
00024
00025 void WordLattice::Print(std::ostream& out) const
00026 {
00027 out<<"word lattice: "<<data.size()<<"\n";
00028 for(size_t i=0; i<data.size(); ++i) {
00029 out<<i<<" -- ";
00030 for(size_t j=0; j<data[i].size(); ++j) {
00031 out<<"("<<data[i][j].first.ToString()<<", ";
00032
00033
00034 std::vector<float>::const_iterator iterDense;
00035 for(iterDense = data[i][j].second.denseScores.begin(); iterDense < data[i][j].second.denseScores.end(); ++iterDense) {
00036 out<<", "<<*iterDense;
00037 }
00038
00039
00040 std::map<StringPiece, float>::const_iterator iterSparse;
00041 for(iterSparse = data[i][j].second.sparseScores.begin(); iterSparse != data[i][j].second.sparseScores.end(); ++iterSparse) {
00042 out << ", " << iterSparse->first << "=" << iterSparse->second;
00043 }
00044
00045 out << GetColumnIncrement(i,j) << ") ";
00046 }
00047
00048 out<<"\n";
00049 }
00050 out<<"\n\n";
00051 }
00052
00053 int
00054 WordLattice::
00055 InitializeFromPCNDataType(const PCN::CN& cn, const std::string& debug_line)
00056 {
00057 const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
00058 size_t const maxPhraseLength = m_options->search.max_phrase_length;
00059
00060 const InputFeature *inputFeature = InputFeature::InstancePtr();
00061 size_t numInputScores = inputFeature->GetNumInputScores();
00062 size_t numRealWordCount = inputFeature->GetNumRealWordsInInput();
00063
00064 bool addRealWordCount = (numRealWordCount > 0);
00065
00066
00067 data.resize(cn.size());
00068 next_nodes.resize(cn.size());
00069 for(size_t i=0; i<cn.size(); ++i) {
00070 const PCN::CNCol& col = cn[i];
00071 if (col.empty()) return false;
00072 data[i].resize(col.size());
00073 next_nodes[i].resize(col.size());
00074 for (size_t j=0; j<col.size(); ++j) {
00075 const PCN::CNAlt& alt = col[j];
00076
00077
00078 if (alt.m_denseFeatures.size() != numInputScores) {
00079 TRACE_ERR("ERROR: need " << numInputScores
00080 << " link parameters, found "
00081 << alt.m_denseFeatures.size()
00082 << " while reading column " << i
00083 << " from " << debug_line << "\n");
00084 return false;
00085 }
00086
00087
00088 std::vector<float>::const_iterator probsIterator;
00089 data[i][j].second = std::vector<float>(0);
00090 for(probsIterator = alt.m_denseFeatures.begin();
00091 probsIterator < alt.m_denseFeatures.end();
00092 probsIterator++) {
00093 IFVERBOSE(1) {
00094 if (*probsIterator < 0.0f) {
00095 TRACE_ERR("WARN: neg probability: " << *probsIterator << "\n");
00096
00097 }
00098 if (*probsIterator > 1.0f) {
00099 TRACE_ERR("WARN: probability > 1: " << *probsIterator << "\n");
00100
00101 }
00102 }
00103
00104 float score = std::max(static_cast<float>(log(*probsIterator)), LOWEST_SCORE);
00105 ScorePair &scorePair = data[i][j].second;
00106 scorePair.denseScores.push_back(score);
00107 }
00108
00109 if (addRealWordCount) {
00110
00111 float value = (alt.m_word=="" || alt.m_word==EPSILON) ? 0.0f : -1.0f;
00112 data[i][j].second.denseScores.push_back(value);
00113 }
00114 Word& w = data[i][j].first;
00115 w.CreateFromString(Input,factorOrder,StringPiece(alt.m_word),false);
00116
00117 next_nodes[i][j] = alt.m_next;
00118
00119 if(next_nodes[i][j] > maxPhraseLength) {
00120 TRACE_ERR("ERROR: Jump length " << next_nodes[i][j] << " in word lattice exceeds maximum phrase length " << maxPhraseLength << ".\n");
00121 TRACE_ERR("ERROR: Increase max-phrase-length to process this lattice.\n");
00122 return false;
00123 }
00124 }
00125 }
00126 if (!cn.empty()) {
00127 std::vector<std::vector<bool> > edges(0);
00128 this->GetAsEdgeMatrix(edges);
00129 floyd_warshall(edges,distances);
00130
00131 IFVERBOSE(2) {
00132 TRACE_ERR("Shortest paths:\n");
00133 for (size_t i=0; i<edges.size(); ++i) {
00134 for (size_t j=0; j<edges.size(); ++j) {
00135 int d = distances[i][j];
00136 if (d > 99999) {
00137 d=-1;
00138 }
00139 TRACE_ERR("\t" << d);
00140 }
00141 TRACE_ERR("\n");
00142 }
00143 }
00144 }
00145 return !cn.empty();
00146 }
00147
00148 int
00149 WordLattice::
00150 Read(std::istream& in)
00151 {
00152 Clear();
00153 std::string line;
00154 if(!getline(in,line)) return 0;
00155 std::map<std::string, std::string> meta=ProcessAndStripSGML(line);
00156 if (meta.find("id") != meta.end()) {
00157 this->SetTranslationId(atol(meta["id"].c_str()));
00158 }
00159
00160 PCN::CN cn = PCN::parsePCN(line);
00161 return InitializeFromPCNDataType(cn, line);
00162 }
00163
00164 void WordLattice::GetAsEdgeMatrix(std::vector<std::vector<bool> >& edges) const
00165 {
00166 edges.resize(data.size()+1,std::vector<bool>(data.size()+1, false));
00167 for (size_t i=0; i<data.size(); ++i) {
00168 for (size_t j=0; j<data[i].size(); ++j) {
00169 edges[i][i+next_nodes[i][j]] = true;
00170 }
00171 }
00172 }
00173
00174 int WordLattice::ComputeDistortionDistance(const Range& prev, const Range& current) const
00175 {
00176 int result;
00177
00178 if (prev.GetStartPos() == NOT_FOUND && current.GetStartPos() == 0) {
00179 result = 0;
00180
00181 VERBOSE(4, "Word lattice distortion: monotonic initial step\n");
00182 } else if (prev.GetEndPos()+1 == current.GetStartPos()) {
00183 result = 0;
00184
00185 VERBOSE(4, "Word lattice distortion: monotonic step from " << prev.GetEndPos() << " to " << current.GetStartPos() << "\n");
00186 } else if (prev.GetStartPos() == NOT_FOUND) {
00187 result = distances[0][current.GetStartPos()];
00188
00189 VERBOSE(4, "Word lattice distortion: initial step from 0 to " << current.GetStartPos() << " of length " << result << "\n");
00190 if (result < 0 || result > 99999) {
00191 TRACE_ERR("prev: " << prev << "\ncurrent: " << current << "\n");
00192 TRACE_ERR("A: got a weird distance from 0 to " << (current.GetStartPos()+1) << " of " << result << "\n");
00193 }
00194 } else if (prev.GetEndPos() > current.GetStartPos()) {
00195 result = distances[current.GetStartPos()][prev.GetEndPos() + 1];
00196
00197 VERBOSE(4, "Word lattice distortion: backward step from " << (prev.GetEndPos()+1) << " to " << current.GetStartPos() << " of length " << result << "\n");
00198 if (result < 0 || result > 99999) {
00199 TRACE_ERR("prev: " << prev << "\ncurrent: " << current << "\n");
00200 TRACE_ERR("B: got a weird distance from "<< current.GetStartPos() << " to " << prev.GetEndPos()+1 << " of " << result << "\n");
00201 }
00202 } else {
00203 result = distances[prev.GetEndPos() + 1][current.GetStartPos()];
00204
00205 VERBOSE(4, "Word lattice distortion: forward step from " << (prev.GetEndPos()+1) << " to " << current.GetStartPos() << " of length " << result << "\n");
00206 if (result < 0 || result > 99999) {
00207 TRACE_ERR("prev: " << prev << "\ncurrent: " << current << "\n");
00208 TRACE_ERR("C: got a weird distance from "<< prev.GetEndPos()+1 << " to " << current.GetStartPos() << " of " << result << "\n");
00209 }
00210 }
00211
00212 return result;
00213 }
00214
00215 bool WordLattice::CanIGetFromAToB(size_t start, size_t end) const
00216 {
00217
00218 return distances[start][end] < 100000;
00219 }
00220
00221 TranslationOptionCollection*
00222 WordLattice::
00223 CreateTranslationOptionCollection(ttasksptr const& ttask) const
00224 {
00225 TranslationOptionCollection *rv = NULL;
00226 if (StaticData::Instance().GetUseLegacyPT()) {
00227 rv = new TranslationOptionCollectionConfusionNet(ttask, *this);
00228 } else {
00229 rv = new TranslationOptionCollectionLattice(ttask, *this);
00230 }
00231 assert(rv);
00232 return rv;
00233 }
00234
00235
00236 std::ostream& operator<<(std::ostream &out, const WordLattice &obj)
00237 {
00238 out << "next_nodes=";
00239 for (size_t i = 0; i < obj.next_nodes.size(); ++i) {
00240 out << i << ":";
00241
00242 const std::vector<size_t> &inner = obj.next_nodes[i];
00243 for (size_t j = 0; j < inner.size(); ++j) {
00244 out << inner[j] << " ";
00245 }
00246 }
00247
00248 out << "distances=";
00249 for (size_t i = 0; i < obj.distances.size(); ++i) {
00250 out << i << ":";
00251
00252 const std::vector<int> &inner = obj.distances[i];
00253 for (size_t j = 0; j < inner.size(); ++j) {
00254 out << inner[j] << " ";
00255 }
00256 }
00257 return out;
00258 }
00259
00260 }
00261