00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <iostream>
00036 #include <stack>
00037 #include <boost/algorithm/string.hpp>
00038
00039 #include "moses/TypeDef.h"
00040 #include "moses/Util.h"
00041 #include "moses/Hypothesis.h"
00042 #include "moses/WordsRange.h"
00043 #include "moses/TrellisPathList.h"
00044 #include "moses/StaticData.h"
00045 #include "moses/FeatureVector.h"
00046 #include "moses/InputFileStream.h"
00047 #include "IOWrapper.h"
00048
00049 using namespace std;
00050 using namespace Moses;
00051
00052 namespace MosesCmd
00053 {
00054
00055 IOWrapper::IOWrapper(
00056 const vector<FactorType> &inputFactorOrder
00057 , const vector<FactorType> &outputFactorOrder
00058 , const FactorMask &inputFactorUsed
00059 , size_t nBestSize
00060 , const string &nBestFilePath)
00061 :m_inputFactorOrder(inputFactorOrder)
00062 ,m_outputFactorOrder(outputFactorOrder)
00063 ,m_inputFactorUsed(inputFactorUsed)
00064 ,m_inputFile(NULL)
00065 ,m_inputStream(&std::cin)
00066 ,m_nBestStream(NULL)
00067 ,m_outputWordGraphStream(NULL)
00068 ,m_outputSearchGraphStream(NULL)
00069 ,m_detailedTranslationReportingStream(NULL)
00070 ,m_alignmentOutputStream(NULL)
00071 {
00072 Initialization(inputFactorOrder, outputFactorOrder
00073 , inputFactorUsed
00074 , nBestSize, nBestFilePath);
00075 }
00076
00077 IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
00078 , const std::vector<FactorType> &outputFactorOrder
00079 , const FactorMask &inputFactorUsed
00080 , size_t nBestSize
00081 , const std::string &nBestFilePath
00082 , const std::string &inputFilePath)
00083 :m_inputFactorOrder(inputFactorOrder)
00084 ,m_outputFactorOrder(outputFactorOrder)
00085 ,m_inputFactorUsed(inputFactorUsed)
00086 ,m_inputFilePath(inputFilePath)
00087 ,m_inputFile(new InputFileStream(inputFilePath))
00088 ,m_nBestStream(NULL)
00089 ,m_outputWordGraphStream(NULL)
00090 ,m_outputSearchGraphStream(NULL)
00091 ,m_detailedTranslationReportingStream(NULL)
00092 ,m_alignmentOutputStream(NULL)
00093 {
00094 Initialization(inputFactorOrder, outputFactorOrder
00095 , inputFactorUsed
00096 , nBestSize, nBestFilePath);
00097
00098 m_inputStream = m_inputFile;
00099 }
00100
00101 IOWrapper::~IOWrapper()
00102 {
00103 if (m_inputFile != NULL)
00104 delete m_inputFile;
00105 if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
00106
00107 delete m_nBestStream;
00108 }
00109 if (m_outputWordGraphStream != NULL) {
00110 delete m_outputWordGraphStream;
00111 }
00112 if (m_outputSearchGraphStream != NULL) {
00113 delete m_outputSearchGraphStream;
00114 }
00115 delete m_detailedTranslationReportingStream;
00116 delete m_alignmentOutputStream;
00117 }
00118
00119 void IOWrapper::Initialization(const std::vector<FactorType> &
00120 , const std::vector<FactorType> &
00121 , const FactorMask &
00122 , size_t nBestSize
00123 , const std::string &nBestFilePath)
00124 {
00125 const StaticData &staticData = StaticData::Instance();
00126
00127
00128 m_surpressSingleBestOutput = false;
00129
00130 if (nBestSize > 0) {
00131 if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
00132 m_nBestStream = &std::cout;
00133 m_surpressSingleBestOutput = true;
00134 } else {
00135 std::ofstream *file = new std::ofstream;
00136 m_nBestStream = file;
00137 file->open(nBestFilePath.c_str());
00138 }
00139 }
00140
00141
00142 if (staticData.GetOutputWordGraph()) {
00143 string fileName = staticData.GetParam("output-word-graph")[0];
00144 std::ofstream *file = new std::ofstream;
00145 m_outputWordGraphStream = file;
00146 file->open(fileName.c_str());
00147 }
00148
00149
00150
00151 if (staticData.GetOutputSearchGraph()) {
00152 string fileName;
00153 if (staticData.GetOutputSearchGraphExtended())
00154 fileName = staticData.GetParam("output-search-graph-extended")[0];
00155 else
00156 fileName = staticData.GetParam("output-search-graph")[0];
00157 std::ofstream *file = new std::ofstream;
00158 m_outputSearchGraphStream = file;
00159 file->open(fileName.c_str());
00160 }
00161
00162
00163 if (staticData.IsDetailedTranslationReportingEnabled()) {
00164 const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
00165 m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
00166 CHECK(m_detailedTranslationReportingStream->good());
00167 }
00168
00169
00170 if (! staticData.GetAlignmentOutputFile().empty()) {
00171 m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
00172 CHECK(m_alignmentOutputStream->good());
00173 }
00174
00175 }
00176
00177 InputType*IOWrapper::GetInput(InputType* inputType)
00178 {
00179 if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
00180 if (long x = inputType->GetTranslationId()) {
00181 if (x>=m_translationId) m_translationId = x+1;
00182 } else inputType->SetTranslationId(m_translationId++);
00183
00184 return inputType;
00185 } else {
00186 delete inputType;
00187 return NULL;
00188 }
00189 }
00190
00191
00192
00193
00194 void OutputSurface(std::ostream &out, const Hypothesis &edge, const std::vector<FactorType> &outputFactorOrder,
00195 bool reportSegmentation, bool reportAllFactors)
00196 {
00197 CHECK(outputFactorOrder.size() > 0);
00198 const Phrase& phrase = edge.GetCurrTargetPhrase();
00199 if (reportAllFactors == true) {
00200 out << phrase;
00201 } else {
00202 size_t size = phrase.GetSize();
00203 for (size_t pos = 0 ; pos < size ; pos++) {
00204 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
00205 out << *factor;
00206 CHECK(factor);
00207
00208 for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
00209 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
00210 CHECK(factor);
00211
00212 out << "|" << *factor;
00213 }
00214 out << " ";
00215 }
00216 }
00217
00218
00219 if (reportSegmentation == true && phrase.GetSize() > 0) {
00220 out << "|" << edge.GetCurrSourceWordsRange().GetStartPos()
00221 << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| ";
00222 }
00223 }
00224
00225 void OutputBestSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder,
00226 bool reportSegmentation, bool reportAllFactors)
00227 {
00228 if (hypo != NULL) {
00229
00230 OutputBestSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
00231 OutputSurface(out, *hypo, outputFactorOrder, reportSegmentation, reportAllFactors);
00232 }
00233 }
00234
00235 void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
00236 {
00237 typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
00238 AlignVec alignments = ai.GetSortedAlignments();
00239
00240 AlignVec::const_iterator it;
00241 for (it = alignments.begin(); it != alignments.end(); ++it) {
00242 const std::pair<size_t,size_t> &alignment = **it;
00243 out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
00244 }
00245
00246 }
00247
00248 void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
00249 {
00250 size_t targetOffset = 0;
00251
00252 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00253 const Hypothesis &edge = *edges[currEdge];
00254 const TargetPhrase &tp = edge.GetCurrTargetPhrase();
00255 size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
00256
00257 OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
00258
00259 targetOffset += tp.GetSize();
00260 }
00261 out << std::endl;
00262 }
00263
00264 void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
00265 {
00266 std::vector<const Hypothesis *> edges;
00267 const Hypothesis *currentHypo = hypo;
00268 while (currentHypo) {
00269 edges.push_back(currentHypo);
00270 currentHypo = currentHypo->GetPrevHypo();
00271 }
00272
00273 OutputAlignment(out, edges);
00274
00275 }
00276
00277 void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
00278 {
00279 ostringstream out;
00280 OutputAlignment(out, edges);
00281
00282 collector->Write(lineNo,out.str());
00283 }
00284
00285 void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
00286 {
00287 if (collector) {
00288 std::vector<const Hypothesis *> edges;
00289 const Hypothesis *currentHypo = hypo;
00290 while (currentHypo) {
00291 edges.push_back(currentHypo);
00292 currentHypo = currentHypo->GetPrevHypo();
00293 }
00294
00295 OutputAlignment(collector,lineNo, edges);
00296 }
00297 }
00298
00299 void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
00300 {
00301 if (collector) {
00302 OutputAlignment(collector,lineNo, path.GetEdges());
00303 }
00304 }
00305
00306 void OutputBestHypo(const Moses::TrellisPath &path, long , bool reportSegmentation, bool reportAllFactors, std::ostream &out)
00307 {
00308 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00309
00310 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00311 const Hypothesis &edge = *edges[currEdge];
00312 OutputSurface(out, edge, StaticData::Instance().GetOutputFactorOrder(), reportSegmentation, reportAllFactors);
00313 }
00314 out << endl;
00315 }
00316
00317 void IOWrapper::Backtrack(const Hypothesis *hypo)
00318 {
00319
00320 if (hypo->GetPrevHypo() != NULL) {
00321 VERBOSE(3,hypo->GetId() << " <= ");
00322 Backtrack(hypo->GetPrevHypo());
00323 }
00324 }
00325
00326 void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long , bool , bool , ostream& out)
00327 {
00328
00329 for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
00330 const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
00331 CHECK(factor);
00332 if (i>0) out << " " << *factor;
00333 else out << *factor;
00334 }
00335 out << endl;
00336 }
00337
00338
00339 void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
00340 {
00341 if (hypo->GetPrevHypo()) {
00342 OutputInput(map, hypo->GetPrevHypo());
00343 map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
00344 }
00345 }
00346
00347 void OutputInput(std::ostream& os, const Hypothesis* hypo)
00348 {
00349 size_t len = hypo->GetInput().GetSize();
00350 std::vector<const Phrase*> inp_phrases(len, 0);
00351 OutputInput(inp_phrases, hypo);
00352 for (size_t i=0; i<len; ++i)
00353 if (inp_phrases[i]) os << *inp_phrases[i];
00354 }
00355
00356 void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long , bool reportSegmentation, bool reportAllFactors)
00357 {
00358 if (hypo != NULL) {
00359 VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
00360 VERBOSE(3,"Best path: ");
00361 Backtrack(hypo);
00362 VERBOSE(3,"0" << std::endl);
00363 if (!m_surpressSingleBestOutput) {
00364 if (StaticData::Instance().IsPathRecoveryEnabled()) {
00365 OutputInput(cout, hypo);
00366 cout << "||| ";
00367 }
00368 OutputBestSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
00369 cout << endl;
00370 }
00371 } else {
00372 VERBOSE(1, "NO BEST TRANSLATION" << endl);
00373 if (!m_surpressSingleBestOutput) {
00374 cout << endl;
00375 }
00376 }
00377 }
00378
00379 void OutputNBest(std::ostream& out
00380 , const Moses::TrellisPathList &nBestList
00381 , const std::vector<Moses::FactorType>& outputFactorOrder
00382 , long translationId
00383 , bool reportSegmentation)
00384 {
00385 const StaticData &staticData = StaticData::Instance();
00386 bool labeledOutput = staticData.IsLabeledNBestList();
00387 bool reportAllFactors = staticData.GetReportAllFactorsNBest();
00388 bool includeSegmentation = staticData.NBestIncludesSegmentation();
00389 bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
00390
00391 TrellisPathList::const_iterator iter;
00392 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00393 const TrellisPath &path = **iter;
00394 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00395
00396
00397 out << translationId << " ||| ";
00398 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00399 const Hypothesis &edge = *edges[currEdge];
00400 OutputSurface(out, edge, outputFactorOrder, reportSegmentation, reportAllFactors);
00401 }
00402 out << " |||";
00403
00404
00405 OutputAllFeatureScores(path.GetScoreBreakdown(), out );
00406
00407
00408 out << " ||| " << path.GetTotalScore();
00409
00410
00411 if (includeSegmentation) {
00412 out << " |||";
00413 for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
00414 const Hypothesis &edge = *edges[currEdge];
00415 const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
00416 WordsRange targetRange = path.GetTargetWordsRange(edge);
00417 out << " " << sourceRange.GetStartPos();
00418 if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
00419 out << "-" << sourceRange.GetEndPos();
00420 }
00421 out<< "=" << targetRange.GetStartPos();
00422 if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
00423 out<< "-" << targetRange.GetEndPos();
00424 }
00425 }
00426 }
00427
00428 if (includeWordAlignment) {
00429 out << " ||| ";
00430 for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
00431 const Hypothesis &edge = *edges[currEdge];
00432 const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
00433 WordsRange targetRange = path.GetTargetWordsRange(edge);
00434 const int sourceOffset = sourceRange.GetStartPos();
00435 const int targetOffset = targetRange.GetStartPos();
00436 const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
00437
00438 OutputAlignment(out, ai, sourceOffset, targetOffset);
00439
00440 }
00441 }
00442
00443 if (StaticData::Instance().IsPathRecoveryEnabled()) {
00444 out << "|||";
00445 OutputInput(out, edges[0]);
00446 }
00447
00448 out << endl;
00449 }
00450
00451 out << std::flush;
00452 }
00453
00454 void OutputAllFeatureScores(const Moses::ScoreComponentCollection &features
00455 , std::ostream &out)
00456 {
00457 std::string lastName = "";
00458 const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
00459 for( size_t i=0; i<sff.size(); i++ ) {
00460 const StatefulFeatureFunction *ff = sff[i];
00461 if (ff->GetScoreProducerDescription() != "BleuScoreFeature"
00462 && ff->IsTuneable()) {
00463 OutputFeatureScores( out, features, ff, lastName );
00464 }
00465 }
00466 const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
00467 for( size_t i=0; i<slf.size(); i++ ) {
00468 const StatelessFeatureFunction *ff = slf[i];
00469 if (ff->IsTuneable()) {
00470 OutputFeatureScores( out, features, ff, lastName );
00471 }
00472 }
00473 }
00474
00475 void OutputFeatureScores( std::ostream& out
00476 , const ScoreComponentCollection &features
00477 , const FeatureFunction *ff
00478 , std::string &lastName )
00479 {
00480 const StaticData &staticData = StaticData::Instance();
00481 bool labeledOutput = staticData.IsLabeledNBestList();
00482
00483
00484 if (ff->GetNumScoreComponents() != 0) {
00485 if( labeledOutput && lastName != ff->GetScoreProducerDescription() ) {
00486 lastName = ff->GetScoreProducerDescription();
00487 out << " " << lastName << "=";
00488 }
00489 vector<float> scores = features.GetScoresForProducer( ff );
00490 for (size_t j = 0; j<scores.size(); ++j) {
00491 out << " " << scores[j];
00492 }
00493 }
00494
00495
00496 const FVector scores = features.GetVectorForProducer( ff );
00497 for(FVector::FNVmap::const_iterator i = scores.cbegin(); i != scores.cend(); i++) {
00498 out << " " << i->first << "= " << i->second;
00499 }
00500 }
00501
00502 void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
00503 {
00504 for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
00505 out << translationId;
00506 out << " |||";
00507 const vector<Word> mbrHypo = si->GetWords();
00508 for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
00509 const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
00510 if (i>0) out << " " << *factor;
00511 else out << *factor;
00512 }
00513 out << " |||";
00514 out << " map: " << si->GetMapScore();
00515 out << " w: " << mbrHypo.size();
00516 const vector<float>& ngramScores = si->GetNgramScores();
00517 for (size_t i = 0; i < ngramScores.size(); ++i) {
00518 out << " " << ngramScores[i];
00519 }
00520 out << " ||| " << si->GetScore();
00521
00522 out << endl;
00523 }
00524 }
00525
00526
00527 void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
00528 {
00529 OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
00530 }
00531
00532 bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
00533 {
00534 delete source;
00535 switch(inputType) {
00536 case SentenceInput:
00537 source = ioWrapper.GetInput(new Sentence);
00538 break;
00539 case ConfusionNetworkInput:
00540 source = ioWrapper.GetInput(new ConfusionNet);
00541 break;
00542 case WordLatticeInput:
00543 source = ioWrapper.GetInput(new WordLattice);
00544 break;
00545 default:
00546 TRACE_ERR("Unknown input type: " << inputType << "\n");
00547 }
00548 return (source ? true : false);
00549 }
00550
00551
00552
00553 IOWrapper *GetIOWrapper(const StaticData &staticData)
00554 {
00555 IOWrapper *ioWrapper;
00556 const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
00557 ,&outputFactorOrder = staticData.GetOutputFactorOrder();
00558 FactorMask inputFactorUsed(inputFactorOrder);
00559
00560
00561 if (staticData.GetParam("input-file").size() == 1) {
00562 VERBOSE(2,"IO from File" << endl);
00563 string filePath = staticData.GetParam("input-file")[0];
00564
00565 ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
00566 , staticData.GetNBestSize()
00567 , staticData.GetNBestFilePath()
00568 , filePath);
00569 } else {
00570 VERBOSE(1,"IO from STDOUT/STDIN" << endl);
00571 ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
00572 , staticData.GetNBestSize()
00573 , staticData.GetNBestFilePath());
00574 }
00575 ioWrapper->ResetTranslationId();
00576
00577 IFVERBOSE(1)
00578 PrintUserTime("Created input-output object");
00579
00580 return ioWrapper;
00581 }
00582
00583 }
00584