00001
00002
00003
00004
00005
00006
00007 #include <boost/foreach.hpp>
00008 #include "StoreTarget.h"
00009 #include "line_splitter.hh"
00010 #include "probing_hash_utils.hh"
00011 #include "moses/OutputFileStream.h"
00012 #include "moses/Util.h"
00013
00014 using namespace std;
00015
00016 namespace Moses
00017 {
00018
00019 StoreTarget::StoreTarget(const std::string &basepath)
00020 :m_basePath(basepath)
00021 ,m_vocab(basepath + "/TargetVocab.dat")
00022 {
00023 std::string path = basepath + "/TargetColl.dat";
00024 m_fileTargetColl.open(path.c_str(),
00025 std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc);
00026 if (!m_fileTargetColl.is_open()) {
00027 throw "can't create file ";
00028 }
00029
00030 }
00031
00032 StoreTarget::~StoreTarget()
00033 {
00034 assert(m_coll.empty());
00035 m_fileTargetColl.close();
00036
00037
00038 m_vocab.Save();
00039 }
00040
00041 uint64_t StoreTarget::Save()
00042 {
00043 uint64_t ret = m_fileTargetColl.tellp();
00044
00045
00046 uint64_t numTP = m_coll.size();
00047 m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t));
00048
00049 for (size_t i = 0; i < m_coll.size(); ++i) {
00050 Save(*m_coll[i]);
00051 }
00052
00053
00054 RemoveAllInColl(m_coll);
00055 m_coll.clear();
00056
00057
00058 return ret;
00059 }
00060
00061 void StoreTarget::Save(const target_text &rule)
00062 {
00063
00064 TargetPhraseInfo tpInfo;
00065 tpInfo.alignTerm = GetAlignId(rule.word_align_term);
00066 tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term);
00067 tpInfo.numWords = rule.target_phrase.size();
00068 tpInfo.propLength = rule.property.size();
00069
00070
00071 m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo));
00072
00073
00074 for (size_t i = 0; i < rule.prob.size(); ++i) {
00075 float prob = rule.prob[i];
00076 m_fileTargetColl.write((char*) &prob, sizeof(prob));
00077 }
00078
00079
00080 for (size_t i = 0; i < rule.target_phrase.size(); ++i) {
00081 uint32_t vocabId = rule.target_phrase[i];
00082 m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId));
00083 }
00084
00085
00086
00087 }
00088
00089 void StoreTarget::SaveAlignment()
00090 {
00091 std::string path = m_basePath + "/Alignments.dat";
00092 OutputFileStream file(path);
00093
00094 BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) {
00095 file << valPair.second << "\t";
00096
00097 const std::vector<size_t> &aligns = valPair.first;
00098 BOOST_FOREACH(size_t align, aligns) {
00099 file << align << " ";
00100 }
00101 file << endl;
00102 }
00103
00104 }
00105
00106 void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg)
00107 {
00108 target_text *rule = new target_text;
00109
00110
00111
00112 vector<bool> nonTerms;
00113 util::TokenIter<util::SingleCharacter> it;
00114 it = util::TokenIter<util::SingleCharacter>(line.target_phrase,
00115 util::SingleCharacter(' '));
00116 while (it) {
00117 StringPiece word = *it;
00118
00119
00120 bool nonTerm = false;
00121 if (scfg) {
00122
00123 if (scfg && word[0] == '[' && word[word.size() - 1] == ']') {
00124
00125 nonTerm = true;
00126 }
00127 nonTerms.push_back(nonTerm);
00128 }
00129
00130 util::TokenIter<util::SingleCharacter> itFactor;
00131 itFactor = util::TokenIter<util::SingleCharacter>(word,
00132 util::SingleCharacter('|'));
00133 while (itFactor) {
00134 StringPiece factor = *itFactor;
00135
00136 string factorStr = factor.as_string();
00137 uint32_t vocabId = m_vocab.GetVocabId(factorStr);
00138
00139 rule->target_phrase.push_back(vocabId);
00140
00141 itFactor++;
00142 }
00143
00144 it++;
00145 }
00146
00147
00148 it = util::TokenIter<util::SingleCharacter>(line.prob,
00149 util::SingleCharacter(' '));
00150 while (it) {
00151 string tok = it->as_string();
00152 float prob = Scan<float>(tok);
00153
00154 if (log_prob) {
00155 prob = FloorScore(log(prob));
00156 if (prob == 0.0f) prob = 0.0000000001;
00157 }
00158
00159 rule->prob.push_back(prob);
00160 it++;
00161 }
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172 it = util::TokenIter<util::SingleCharacter>(line.word_align,
00173 util::SingleCharacter(' '));
00174 while (it) {
00175 string tokPair = Trim(it->as_string());
00176 if (tokPair.empty()) {
00177 break;
00178 }
00179
00180 vector<size_t> alignPair = Tokenize<size_t>(tokPair, "-");
00181 assert(alignPair.size() == 2);
00182
00183 bool nonTerm = false;
00184 size_t sourcePos = alignPair[0];
00185 size_t targetPos = alignPair[1];
00186 if (scfg) {
00187 nonTerm = nonTerms[targetPos];
00188 }
00189
00190
00191
00192 if (nonTerm) {
00193 rule->word_align_non_term.push_back(sourcePos);
00194 rule->word_align_non_term.push_back(targetPos);
00195
00196 } else {
00197 rule->word_align_term.push_back(sourcePos);
00198 rule->word_align_term.push_back(targetPos);
00199 }
00200
00201 it++;
00202 }
00203
00204
00205 string prop = line.property.as_string();
00206 AppendLexRO(prop, rule->prob, log_prob);
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217 m_coll.push_back(rule);
00218 }
00219
00220 uint32_t StoreTarget::GetAlignId(const std::vector<size_t> &align)
00221 {
00222 boost::unordered_map<std::vector<size_t>, uint32_t>::iterator iter =
00223 m_aligns.find(align);
00224 if (iter == m_aligns.end()) {
00225 uint32_t ind = m_aligns.size();
00226 m_aligns[align] = ind;
00227 return ind;
00228 } else {
00229 return iter->second;
00230 }
00231 }
00232
00233 void StoreTarget::AppendLexRO(std::string &prop, std::vector<float> &retvector,
00234 bool log_prob) const
00235 {
00236 size_t startPos = prop.find("{{LexRO ");
00237
00238 if (startPos != string::npos) {
00239 size_t endPos = prop.find("}}", startPos + 8);
00240 string lexProb = prop.substr(startPos + 8, endPos - startPos - 8);
00241
00242
00243
00244 vector<float> scores = Tokenize<float>(lexProb);
00245
00246 if (log_prob) {
00247 for (size_t i = 0; i < scores.size(); ++i) {
00248 scores[i] = FloorScore(log(scores[i]));
00249 if (scores[i] == 0.0f) scores[i] = 0.0000000001;
00250 }
00251 }
00252
00253 for (size_t i = 0; i < scores.size(); ++i) {
00254 retvector.push_back(scores[i]);
00255 }
00256
00257
00258 prop = prop.substr(0, startPos)
00259 + prop.substr(endPos + 2, prop.size() - endPos - 2);
00260
00261 }
00262 }
00263
00264 }