00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <string.h>
00021 #include <fstream>
00022 #include <vector>
00023 #include <string>
00024 #include <iostream>
00025 #include <cstdlib>
00026 #include "InputFileStream.h"
00027 #include "OutputFileStream.h"
00028 #include "util/tokenize.hh"
00029
00030 using namespace std;
00031
/**
 * Split a line into its fields.
 *
 * Fields are delimited by the five-character sequence " ||| " (space, three
 * pipes, space); the delimiter is not part of any returned field. Two
 * delimiters may share a single space (as in "a ||| ||| b"), in which case
 * an empty field is produced between them.
 *
 * @param line NUL-terminated input line; must not be NULL.
 * @return the fields in order of appearance. The result always holds at least
 *         one element (a single empty string for an empty line).
 */
vector< string > splitLine(const char *line)
{
  static const char kSeparator[] = " ||| ";
  const size_t kSeparatorLen = sizeof(kSeparator) - 1;

  vector< string > item;
  // Unsigned offsets: a signed int index would overflow (undefined behaviour)
  // on lines longer than INT_MAX characters.
  size_t start = 0;
  size_t i = 0;
  while (line[i] != '\0') {
    // strncmp stops at the terminating NUL, so the comparison never reads
    // past the end of the line.
    if (strncmp(line + i, kSeparator, kSeparatorLen) != 0) {
      ++i;
      continue;
    }

    // When this separator reuses the trailing space of the previous one,
    // start already points one past i; clamp it so the field is empty.
    if (start > i) start = i;
    item.push_back( string( line + start, i - start ) );
    start = i + kSeparatorLen;

    // Resume on the separator's trailing space so that it can also serve as
    // the leading space of an immediately following separator.
    i += kSeparatorLen - 1;
  }

  // Whatever follows the last separator (possibly nothing) is the last field.
  item.push_back( string( line + start, i - start ) );

  return item;
}
00053
00054 bool getLine( istream &fileP, vector< string > &item )
00055 {
00056 if (fileP.eof())
00057 return false;
00058
00059 string line;
00060 if (getline(fileP, line)) {
00061 item = splitLine(line.c_str());
00062 return true;
00063 } else {
00064 return false;
00065 }
00066 }
00067
00068
00069 int main(int argc, char* argv[])
00070 {
00071 cerr << "Starting..." << endl;
00072
00073 char* &fileNameDirect = argv[1];
00074 Moses::InputFileStream fileDirect(fileNameDirect);
00075
00076
00077
00078 if (fileDirect.fail()) {
00079 cerr << "ERROR: could not open extract file " << fileNameDirect << endl;
00080 exit(1);
00081 }
00082 istream &fileDirectP = fileDirect;
00083
00084 char* &fileNameConsolidated = argv[2];
00085 ostream *fileConsolidated;
00086
00087 if (strcmp(fileNameConsolidated, "-") == 0) {
00088 fileConsolidated = &cout;
00089 } else {
00090 Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
00091 bool success = outputFile->Open(fileNameConsolidated);
00092 if (!success) {
00093 cerr << "ERROR: could not open file phrase table file "
00094 << fileNameConsolidated << endl;
00095 exit(1);
00096 }
00097 fileConsolidated = outputFile;
00098 }
00099
00100 int i=0;
00101 while(true) {
00102 i++;
00103 if (i%1000 == 0) cerr << "." << flush;
00104 if (i%10000 == 0) cerr << ":" << flush;
00105 if (i%100000 == 0) cerr << "!" << flush;
00106
00107 vector< string > itemDirect;
00108 if (! getLine(fileDirectP, itemDirect ))
00109 break;
00110
00111 const vector< string > count = util::tokenize( itemDirect[4] );
00112 float countEF = atof(count[0].c_str());
00113 float countF = atof(count[1].c_str());
00114 float prob = countF/countEF;
00115
00116 (*fileConsolidated) << itemDirect[0] << " ||| "
00117 << itemDirect[1] << " ||| "
00118 << prob << " ||| "
00119 << itemDirect[2] << "||| "
00120 << itemDirect[4] << " " << countEF
00121 << " ||| " << endl;
00122 }
00123
00124 fileConsolidated->flush();
00125 if (fileConsolidated != &cout) {
00126 delete fileConsolidated;
00127 }
00128
00129 cerr << "Finished" << endl;
00130 }
00131