00001 #include "Scorer.h"
00002
00003 #include <limits>
00004 #include "Vocabulary.h"
00005 #include "Util.h"
00006 #include "Singleton.h"
00007 #include "util/tokenize_piece.hh"
00008
00009 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00010 #include "PreProcessFilter.h"
00011 #endif
00012
00013 using namespace std;
00014
00015 namespace MosesTuning
00016 {
00017
namespace
{

/**
 * Sentinel ID pushed by Scorer::TokenizeAndEncodeTesting() for a token that
 * is not found in the vocabulary.
 */
const int kUnknownToken(-1);

} // namespace
00024
00025 Scorer::Scorer(const string& name, const string& config)
00026 : m_name(name),
00027 m_vocab(mert::VocabularyFactory::GetVocabulary()),
00028 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00029 m_filter(NULL),
00030 #endif
00031 m_score_data(NULL),
00032 m_enable_preserve_case(true)
00033 {
00034 InitConfig(config);
00035 }
00036
00037 Scorer::~Scorer()
00038 {
00039 Singleton<mert::Vocabulary>::Delete();
00040 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00041 delete m_filter;
00042 #endif
00043 }
00044
00045 void Scorer::InitConfig(const string& config)
00046 {
00047
00048 size_t start = 0;
00049 while (start < config.size()) {
00050 size_t end = config.find(",", start);
00051 if (end == string::npos) {
00052 end = config.size();
00053 }
00054 string nv = config.substr(start, end - start);
00055 size_t split = nv.find(":");
00056 if (split == string::npos) {
00057 throw runtime_error("Missing colon when processing scorer config: " + config);
00058 }
00059 const string name = nv.substr(0, split);
00060 const string value = nv.substr(split + 1, nv.size() - split - 1);
00061 cerr << "name: " << name << " value: " << value << endl;
00062 m_config[name] = value;
00063 start = end + 1;
00064 }
00065 }
00066
00067 void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) const
00068 {
00069 for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
00070 it; ++it) {
00071 if (!m_enable_preserve_case) {
00072 string token = it->as_string();
00073 for (std::string::iterator sit = token.begin();
00074 sit != token.end(); ++sit) {
00075 *sit = tolower(*sit);
00076 }
00077 encoded.push_back(m_vocab->Encode(token));
00078 } else {
00079 encoded.push_back(m_vocab->Encode(it->as_string()));
00080 }
00081 }
00082 }
00083
00084 void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) const
00085 {
00086 for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
00087 it; ++it) {
00088 if (!m_enable_preserve_case) {
00089 string token = it->as_string();
00090 for (std::string::iterator sit = token.begin();
00091 sit != token.end(); ++sit) {
00092 *sit = tolower(*sit);
00093 }
00094 mert::Vocabulary::const_iterator cit = m_vocab->find(token);
00095 if (cit == m_vocab->end()) {
00096 encoded.push_back(kUnknownToken);
00097 } else {
00098 encoded.push_back(cit->second);
00099 }
00100 } else {
00101 mert::Vocabulary::const_iterator cit = m_vocab->find(it->as_string());
00102 if (cit == m_vocab->end()) {
00103 encoded.push_back(kUnknownToken);
00104 } else {
00105 encoded.push_back(cit->second);
00106 }
00107 }
00108 }
00109 }
00110
00114 void Scorer::setFactors(const string& factors)
00115 {
00116 if (factors.empty()) return;
00117 vector<string> factors_vec;
00118 split(factors, '|', factors_vec);
00119 for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
00120 int factor = atoi(it->c_str());
00121 m_factors.push_back(factor);
00122 }
00123 }
00124
00128 void Scorer::setFilter(const string& filterCommand)
00129 {
00130 if (filterCommand.empty()) return;
00131 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00132 m_filter = new PreProcessFilter(filterCommand);
00133 #else
00134 throw runtime_error("Cannot use filter command as mert was compiled with non-gcc compiler");
00135 #endif
00136 }
00137
00141 string Scorer::applyFactors(const string& sentence) const
00142 {
00143 if (m_factors.size() == 0) return sentence;
00144
00145 vector<string> tokens;
00146 split(sentence, ' ', tokens);
00147
00148 stringstream sstream;
00149 for (size_t i = 0; i < tokens.size(); ++i) {
00150 if (tokens[i] == "") continue;
00151
00152 vector<string> factors;
00153 split(tokens[i], '|', factors);
00154
00155 int fsize = factors.size();
00156
00157 if (i > 0) sstream << " ";
00158
00159 for (size_t j = 0; j < m_factors.size(); ++j) {
00160 int findex = m_factors[j];
00161 if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
00162
00163 if (j > 0) sstream << "|";
00164 sstream << factors[findex];
00165 }
00166 }
00167 return sstream.str();
00168 }
00169
00173 string Scorer::applyFilter(const string& sentence) const
00174 {
00175 #if defined(__GLIBCXX__) || defined(__GLIBCPP__)
00176 if (m_filter) {
00177 return m_filter->ProcessSentence(sentence);
00178 } else {
00179 return sentence;
00180 }
00181 #endif
00182 return sentence;
00183 }
00184
00185 float Scorer::score(const candidates_t& candidates) const
00186 {
00187 diffs_t diffs;
00188 statscores_t scores;
00189 score(candidates, diffs, scores);
00190 return scores[0];
00191 }
00192
00193 }