00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 #include <iostream>
00036 #include <stack>
00037 #include "TypeDef.h"
00038 #include "Util.h"
00039 #include "IOWrapper.h"
00040 #include "Hypothesis.h"
00041 #include "WordsRange.h"
00042 #include "TrellisPathList.h"
00043 #include "StaticData.h"
00044 #include "DummyScoreProducers.h"
00045 #include "InputFileStream.h"
00046
00047 using namespace std;
00048 using namespace Moses;
00049
00050 IOWrapper::IOWrapper(
00051 const vector<FactorType> &inputFactorOrder
00052 , const vector<FactorType> &outputFactorOrder
00053 , const FactorMask &inputFactorUsed
00054 , size_t nBestSize
00055 , const string &nBestFilePath)
00056 :m_inputFactorOrder(inputFactorOrder)
00057 ,m_outputFactorOrder(outputFactorOrder)
00058 ,m_inputFactorUsed(inputFactorUsed)
00059 ,m_inputFile(NULL)
00060 ,m_inputStream(&std::cin)
00061 ,m_nBestStream(NULL)
00062 ,m_outputWordGraphStream(NULL)
00063 ,m_outputSearchGraphStream(NULL)
00064 ,m_detailedTranslationReportingStream(NULL)
00065 ,m_alignmentOutputStream(NULL)
00066 {
00067 Initialization(inputFactorOrder, outputFactorOrder
00068 , inputFactorUsed
00069 , nBestSize, nBestFilePath);
00070 }
00071
00072 IOWrapper::IOWrapper(const std::vector<FactorType> &inputFactorOrder
00073 , const std::vector<FactorType> &outputFactorOrder
00074 , const FactorMask &inputFactorUsed
00075 , size_t nBestSize
00076 , const std::string &nBestFilePath
00077 , const std::string &inputFilePath)
00078 :m_inputFactorOrder(inputFactorOrder)
00079 ,m_outputFactorOrder(outputFactorOrder)
00080 ,m_inputFactorUsed(inputFactorUsed)
00081 ,m_inputFilePath(inputFilePath)
00082 ,m_inputFile(new InputFileStream(inputFilePath))
00083 ,m_nBestStream(NULL)
00084 ,m_outputWordGraphStream(NULL)
00085 ,m_outputSearchGraphStream(NULL)
00086 ,m_detailedTranslationReportingStream(NULL)
00087 ,m_alignmentOutputStream(NULL)
00088 {
00089 Initialization(inputFactorOrder, outputFactorOrder
00090 , inputFactorUsed
00091 , nBestSize, nBestFilePath);
00092
00093 m_inputStream = m_inputFile;
00094 }
00095
00096 IOWrapper::~IOWrapper()
00097 {
00098 if (m_inputFile != NULL)
00099 delete m_inputFile;
00100 if (m_nBestStream != NULL && !m_surpressSingleBestOutput) {
00101
00102 delete m_nBestStream;
00103 }
00104 if (m_outputWordGraphStream != NULL) {
00105 delete m_outputWordGraphStream;
00106 }
00107 if (m_outputSearchGraphStream != NULL) {
00108 delete m_outputSearchGraphStream;
00109 }
00110 delete m_detailedTranslationReportingStream;
00111 delete m_alignmentOutputStream;
00112 }
00113
00114 void IOWrapper::Initialization(const std::vector<FactorType> &
00115 , const std::vector<FactorType> &
00116 , const FactorMask &
00117 , size_t nBestSize
00118 , const std::string &nBestFilePath)
00119 {
00120 const StaticData &staticData = StaticData::Instance();
00121
00122
00123 m_surpressSingleBestOutput = false;
00124
00125 if (nBestSize > 0) {
00126 if (nBestFilePath == "-" || nBestFilePath == "/dev/stdout") {
00127 m_nBestStream = &std::cout;
00128 m_surpressSingleBestOutput = true;
00129 } else {
00130 std::ofstream *file = new std::ofstream;
00131 m_nBestStream = file;
00132 file->open(nBestFilePath.c_str());
00133 }
00134 }
00135
00136
00137 if (staticData.GetOutputWordGraph()) {
00138 string fileName = staticData.GetParam("output-word-graph")[0];
00139 std::ofstream *file = new std::ofstream;
00140 m_outputWordGraphStream = file;
00141 file->open(fileName.c_str());
00142 }
00143
00144
00145
00146 if (staticData.GetOutputSearchGraph()) {
00147 string fileName;
00148 if (staticData.GetOutputSearchGraphExtended())
00149 fileName = staticData.GetParam("output-search-graph-extended")[0];
00150 else
00151 fileName = staticData.GetParam("output-search-graph")[0];
00152 std::ofstream *file = new std::ofstream;
00153 m_outputSearchGraphStream = file;
00154 file->open(fileName.c_str());
00155 }
00156
00157
00158 if (staticData.IsDetailedTranslationReportingEnabled()) {
00159 const std::string &path = staticData.GetDetailedTranslationReportingFilePath();
00160 m_detailedTranslationReportingStream = new std::ofstream(path.c_str());
00161 CHECK(m_detailedTranslationReportingStream->good());
00162 }
00163
00164
00165 if (! staticData.GetAlignmentOutputFile().empty()) {
00166 m_alignmentOutputStream = new ofstream(staticData.GetAlignmentOutputFile().c_str());
00167 CHECK(m_alignmentOutputStream->good());
00168 }
00169
00170 }
00171
00172 InputType*IOWrapper::GetInput(InputType* inputType)
00173 {
00174 if(inputType->Read(*m_inputStream, m_inputFactorOrder)) {
00175 if (long x = inputType->GetTranslationId()) {
00176 if (x>=m_translationId) m_translationId = x+1;
00177 } else inputType->SetTranslationId(m_translationId++);
00178
00179 return inputType;
00180 } else {
00181 delete inputType;
00182 return NULL;
00183 }
00184 }
00185
00186
00187
00188
00189 void OutputSurface(std::ostream &out, const Phrase &phrase, const std::vector<FactorType> &outputFactorOrder, bool reportAllFactors)
00190 {
00191 CHECK(outputFactorOrder.size() > 0);
00192 if (reportAllFactors == true) {
00193 out << phrase;
00194 } else {
00195 size_t size = phrase.GetSize();
00196 for (size_t pos = 0 ; pos < size ; pos++) {
00197 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
00198 out << *factor;
00199
00200 for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
00201 const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
00202 out << "|" << *factor;
00203 }
00204 out << " ";
00205 }
00206 }
00207 }
00208
00209 void OutputAlignment(ostream &out, const AlignmentInfo &ai, size_t sourceOffset, size_t targetOffset)
00210 {
00211 typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
00212 AlignVec alignments = ai.GetSortedAlignments();
00213
00214 AlignVec::const_iterator it;
00215 for (it = alignments.begin(); it != alignments.end(); ++it) {
00216 const std::pair<size_t,size_t> &alignment = **it;
00217 out << alignment.first + sourceOffset << "-" << alignment.second + targetOffset << " ";
00218 }
00219
00220 }
00221
00222 void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
00223 {
00224 size_t targetOffset = 0;
00225
00226 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00227 const Hypothesis &edge = *edges[currEdge];
00228 const TargetPhrase &tp = edge.GetCurrTargetPhrase();
00229 size_t sourceOffset = edge.GetCurrSourceWordsRange().GetStartPos();
00230
00231 OutputAlignment(out, tp.GetAlignmentInfo(), sourceOffset, targetOffset);
00232
00233 targetOffset += tp.GetSize();
00234 }
00235 out << std::endl;
00236 }
00237
00238 void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
00239 {
00240 ostringstream out;
00241 OutputAlignment(out, edges);
00242
00243 collector->Write(lineNo,out.str());
00244 }
00245
00246 void OutputAlignment(OutputCollector* collector, size_t lineNo , const Hypothesis *hypo)
00247 {
00248 if (collector) {
00249 std::vector<const Hypothesis *> edges;
00250 const Hypothesis *currentHypo = hypo;
00251 while (currentHypo) {
00252 edges.push_back(currentHypo);
00253 currentHypo = currentHypo->GetPrevHypo();
00254 }
00255
00256 OutputAlignment(collector,lineNo, edges);
00257 }
00258 }
00259
00260 void OutputAlignment(OutputCollector* collector, size_t lineNo , const TrellisPath &path)
00261 {
00262 if (collector) {
00263 OutputAlignment(collector,lineNo, path.GetEdges());
00264 }
00265 }
00266
00267 void OutputSurface(std::ostream &out, const Hypothesis *hypo, const std::vector<FactorType> &outputFactorOrder
00268 ,bool reportSegmentation, bool reportAllFactors)
00269 {
00270 if ( hypo != NULL) {
00271 OutputSurface(out, hypo->GetPrevHypo(), outputFactorOrder, reportSegmentation, reportAllFactors);
00272 OutputSurface(out, hypo->GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
00273
00274
00275 if (reportSegmentation == true
00276 && hypo->GetCurrTargetPhrase().GetSize() > 0) {
00277 out << "|" << hypo->GetCurrSourceWordsRange().GetStartPos()
00278 << "-" << hypo->GetCurrSourceWordsRange().GetEndPos() << "| ";
00279 }
00280 }
00281 }
00282
00283 void OutputBestHypo(const Moses::TrellisPath &path, long ,bool reportSegmentation, bool reportAllFactors, std::ostream &out)
00284 {
00285 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00286
00287 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00288 const Hypothesis &edge = *edges[currEdge];
00289
00290 OutputSurface(out, edge.GetCurrTargetPhrase(), StaticData::Instance().GetOutputFactorOrder(), reportAllFactors);
00291 if (reportSegmentation == true
00292 && edge.GetCurrTargetPhrase().GetSize() > 0) {
00293 out << "|" << edge.GetCurrSourceWordsRange().GetStartPos()
00294 << "-" << edge.GetCurrSourceWordsRange().GetEndPos() << "| ";
00295 }
00296 }
00297 out << endl;
00298 }
00299
00300 void IOWrapper::Backtrack(const Hypothesis *hypo)
00301 {
00302
00303 if (hypo->GetPrevHypo() != NULL) {
00304 VERBOSE(3,hypo->GetId() << " <= ");
00305 Backtrack(hypo->GetPrevHypo());
00306 }
00307 }
00308
00309 void OutputBestHypo(const std::vector<Word>& mbrBestHypo, long , bool , bool , ostream& out)
00310 {
00311
00312 for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
00313 const Factor *factor = mbrBestHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
00314 if (i>0) out << " " << *factor;
00315 else out << *factor;
00316 }
00317 out << endl;
00318 }
00319
00320
00321 void OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo)
00322 {
00323 if (hypo->GetPrevHypo()) {
00324 OutputInput(map, hypo->GetPrevHypo());
00325 map[hypo->GetCurrSourceWordsRange().GetStartPos()] = hypo->GetSourcePhrase();
00326 }
00327 }
00328
00329 void OutputInput(std::ostream& os, const Hypothesis* hypo)
00330 {
00331 size_t len = hypo->GetInput().GetSize();
00332 std::vector<const Phrase*> inp_phrases(len, 0);
00333 OutputInput(inp_phrases, hypo);
00334 for (size_t i=0; i<len; ++i)
00335 if (inp_phrases[i]) os << *inp_phrases[i];
00336 }
00337
00338 void IOWrapper::OutputBestHypo(const Hypothesis *hypo, long , bool reportSegmentation, bool reportAllFactors)
00339 {
00340 if (hypo != NULL) {
00341 VERBOSE(1,"BEST TRANSLATION: " << *hypo << endl);
00342 VERBOSE(3,"Best path: ");
00343 Backtrack(hypo);
00344 VERBOSE(3,"0" << std::endl);
00345 if (!m_surpressSingleBestOutput) {
00346 if (StaticData::Instance().IsPathRecoveryEnabled()) {
00347 OutputInput(cout, hypo);
00348 cout << "||| ";
00349 }
00350 OutputSurface(cout, hypo, m_outputFactorOrder, reportSegmentation, reportAllFactors);
00351 cout << endl;
00352 }
00353 } else {
00354 VERBOSE(1, "NO BEST TRANSLATION" << endl);
00355 if (!m_surpressSingleBestOutput) {
00356 cout << endl;
00357 }
00358 }
00359 }
00360
00361
00362
00363
00364 void OutputNBest(std::ostream& out, const Moses::TrellisPathList &nBestList, const std::vector<Moses::FactorType>& outputFactorOrder, const TranslationSystem* system, long translationId)
00365 {
00366 const StaticData &staticData = StaticData::Instance();
00367 bool labeledOutput = staticData.IsLabeledNBestList();
00368 bool reportAllFactors = staticData.GetReportAllFactorsNBest();
00369 bool includeAlignment = staticData.NBestIncludesAlignment();
00370 bool includeWordAlignment = staticData.PrintAlignmentInfoInNbest();
00371
00372 TrellisPathList::const_iterator iter;
00373 for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
00374 const TrellisPath &path = **iter;
00375 const std::vector<const Hypothesis *> &edges = path.GetEdges();
00376
00377
00378 out << translationId << " ||| ";
00379 for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
00380 const Hypothesis &edge = *edges[currEdge];
00381 OutputSurface(out, edge.GetCurrTargetPhrase(), outputFactorOrder, reportAllFactors);
00382 }
00383 out << " |||";
00384
00385 std::string lastName = "";
00386 const vector<const StatefulFeatureFunction*>& sff = system->GetStatefulFeatureFunctions();
00387 for( size_t i=0; i<sff.size(); i++ ) {
00388 if( labeledOutput && lastName != sff[i]->GetScoreProducerWeightShortName() ) {
00389 lastName = sff[i]->GetScoreProducerWeightShortName();
00390 out << " " << lastName << ":";
00391 }
00392 vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( sff[i] );
00393 for (size_t j = 0; j<scores.size(); ++j) {
00394 out << " " << scores[j];
00395 }
00396 }
00397
00398 const vector<const StatelessFeatureFunction*>& slf = system->GetStatelessFeatureFunctions();
00399 for( size_t i=0; i<slf.size(); i++ ) {
00400 if( labeledOutput && lastName != slf[i]->GetScoreProducerWeightShortName() ) {
00401 lastName = slf[i]->GetScoreProducerWeightShortName();
00402 out << " " << lastName << ":";
00403 }
00404 vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( slf[i] );
00405 for (size_t j = 0; j<scores.size(); ++j) {
00406 out << " " << scores[j];
00407 }
00408 }
00409
00410
00411 const vector<PhraseDictionaryFeature*>& pds = system->GetPhraseDictionaries();
00412 if (pds.size() > 0) {
00413
00414 for( size_t i=0; i<pds.size(); i++ ) {
00415 size_t pd_numinputscore = pds[i]->GetNumInputScores();
00416 vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( pds[i] );
00417 for (size_t j = 0; j<scores.size(); ++j){
00418
00419 if (labeledOutput && (i == 0) ){
00420 if ((j == 0) || (j == pd_numinputscore)){
00421 lastName = pds[i]->GetScoreProducerWeightShortName(j);
00422 out << " " << lastName << ":";
00423 }
00424 }
00425 out << " " << scores[j];
00426 }
00427 }
00428 }
00429
00430
00431 const vector<GenerationDictionary*>& gds = system->GetGenerationDictionaries();
00432 if (gds.size() > 0) {
00433
00434 for( size_t i=0; i<gds.size(); i++ ) {
00435 size_t pd_numinputscore = gds[i]->GetNumInputScores();
00436 vector<float> scores = path.GetScoreBreakdown().GetScoresForProducer( gds[i] );
00437 for (size_t j = 0; j<scores.size(); ++j){
00438
00439 if (labeledOutput && (i == 0) ){
00440 if ((j == 0) || (j == pd_numinputscore)){
00441 lastName = gds[i]->GetScoreProducerWeightShortName(j);
00442 out << " " << lastName << ":";
00443 }
00444 }
00445 out << " " << scores[j];
00446 }
00447 }
00448 }
00449
00450
00451 out << " ||| " << path.GetTotalScore();
00452
00453
00454 if (includeAlignment) {
00455 out << " |||";
00456 for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
00457 const Hypothesis &edge = *edges[currEdge];
00458 const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
00459 WordsRange targetRange = path.GetTargetWordsRange(edge);
00460 out << " " << sourceRange.GetStartPos();
00461 if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
00462 out << "-" << sourceRange.GetEndPos();
00463 }
00464 out<< "=" << targetRange.GetStartPos();
00465 if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
00466 out<< "-" << targetRange.GetEndPos();
00467 }
00468 }
00469 }
00470
00471 if (includeWordAlignment) {
00472 out << " ||| ";
00473 for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
00474 const Hypothesis &edge = *edges[currEdge];
00475 const WordsRange &sourceRange = edge.GetCurrSourceWordsRange();
00476 WordsRange targetRange = path.GetTargetWordsRange(edge);
00477 const int sourceOffset = sourceRange.GetStartPos();
00478 const int targetOffset = targetRange.GetStartPos();
00479 const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignmentInfo();
00480
00481 OutputAlignment(out, ai, sourceOffset, targetOffset);
00482
00483 }
00484 }
00485
00486 if (StaticData::Instance().IsPathRecoveryEnabled()) {
00487 out << "|||";
00488 OutputInput(out, edges[0]);
00489 }
00490
00491 out << endl;
00492 }
00493
00494
00495 out <<std::flush;
00496 }
00497
00498 void OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId)
00499 {
00500 for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
00501 out << translationId;
00502 out << " |||";
00503 const vector<Word> mbrHypo = si->GetWords();
00504 for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
00505 const Factor *factor = mbrHypo[i].GetFactor(StaticData::Instance().GetOutputFactorOrder()[0]);
00506 if (i>0) out << " " << *factor;
00507 else out << *factor;
00508 }
00509 out << " |||";
00510 out << " map: " << si->GetMapScore();
00511 out << " w: " << mbrHypo.size();
00512 const vector<float>& ngramScores = si->GetNgramScores();
00513 for (size_t i = 0; i < ngramScores.size(); ++i) {
00514 out << " " << ngramScores[i];
00515 }
00516 out << " ||| " << si->GetScore();
00517
00518 out << endl;
00519 }
00520 }
00521
00522
00523 void IOWrapper::OutputLatticeMBRNBestList(const vector<LatticeMBRSolution>& solutions,long translationId)
00524 {
00525 OutputLatticeMBRNBest(*m_nBestStream, solutions,translationId);
00526 }
00527
00528 bool ReadInput(IOWrapper &ioWrapper, InputTypeEnum inputType, InputType*& source)
00529 {
00530 delete source;
00531 switch(inputType) {
00532 case SentenceInput:
00533 source = ioWrapper.GetInput(new Sentence);
00534 break;
00535 case ConfusionNetworkInput:
00536 source = ioWrapper.GetInput(new ConfusionNet);
00537 break;
00538 case WordLatticeInput:
00539 source = ioWrapper.GetInput(new WordLattice);
00540 break;
00541 default:
00542 TRACE_ERR("Unknown input type: " << inputType << "\n");
00543 }
00544 return (source ? true : false);
00545 }
00546
00547
00548
00549 IOWrapper *GetIODevice(const StaticData &staticData)
00550 {
00551 IOWrapper *ioWrapper;
00552 const std::vector<FactorType> &inputFactorOrder = staticData.GetInputFactorOrder()
00553 ,&outputFactorOrder = staticData.GetOutputFactorOrder();
00554 FactorMask inputFactorUsed(inputFactorOrder);
00555
00556
00557 if (staticData.GetParam("input-file").size() == 1) {
00558 VERBOSE(2,"IO from File" << endl);
00559 string filePath = staticData.GetParam("input-file")[0];
00560
00561 ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
00562 , staticData.GetNBestSize()
00563 , staticData.GetNBestFilePath()
00564 , filePath);
00565 } else {
00566 VERBOSE(1,"IO from STDOUT/STDIN" << endl);
00567 ioWrapper = new IOWrapper(inputFactorOrder, outputFactorOrder, inputFactorUsed
00568 , staticData.GetNBestSize()
00569 , staticData.GetNBestFilePath());
00570 }
00571 ioWrapper->ResetTranslationId();
00572
00573 IFVERBOSE(1)
00574 PrintUserTime("Created input-output object");
00575
00576 return ioWrapper;
00577 }