00001 #include "Scorer.h"
00002
00003 #include <limits>
00004 #include "Vocabulary.h"
00005 #include "Util.h"
00006 #include "Singleton.h"
00007 #include "PreProcessFilter.h"
00008 #include "util/tokenize_piece.hh"
00009
00010 using namespace std;
00011
00012 namespace MosesTuning
00013 {
00014
namespace
{

/**
 * Id appended by Scorer::TokenizeAndEncodeTesting() for a token that
 * m_vocab->find() does not locate in the vocabulary.
 */
const int kUnknownToken = -1;

}  // namespace
00020
00021 Scorer::Scorer(const string& name, const string& config)
00022 : m_name(name),
00023 m_vocab(mert::VocabularyFactory::GetVocabulary()),
00024 m_filter(NULL),
00025 m_score_data(NULL),
00026 m_enable_preserve_case(true) {
00027 InitConfig(config);
00028 }
00029
00030 Scorer::~Scorer() {
00031 Singleton<mert::Vocabulary>::Delete();
00032 delete m_filter;
00033 }
00034
00035 void Scorer::InitConfig(const string& config) {
00036
00037 size_t start = 0;
00038 while (start < config.size()) {
00039 size_t end = config.find(",", start);
00040 if (end == string::npos) {
00041 end = config.size();
00042 }
00043 string nv = config.substr(start, end - start);
00044 size_t split = nv.find(":");
00045 if (split == string::npos) {
00046 throw runtime_error("Missing colon when processing scorer config: " + config);
00047 }
00048 const string name = nv.substr(0, split);
00049 const string value = nv.substr(split + 1, nv.size() - split - 1);
00050 cerr << "name: " << name << " value: " << value << endl;
00051 m_config[name] = value;
00052 start = end + 1;
00053 }
00054 }
00055
00056 void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
00057 for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
00058 it; ++it) {
00059 if (!m_enable_preserve_case) {
00060 string token = it->as_string();
00061 for (std::string::iterator sit = token.begin();
00062 sit != token.end(); ++sit) {
00063 *sit = tolower(*sit);
00064 }
00065 encoded.push_back(m_vocab->Encode(token));
00066 } else {
00067 encoded.push_back(m_vocab->Encode(it->as_string()));
00068 }
00069 }
00070 }
00071
00072 void Scorer::TokenizeAndEncodeTesting(const string& line, vector<int>& encoded) {
00073 for (util::TokenIter<util::AnyCharacter, true> it(line, util::AnyCharacter(" "));
00074 it; ++it) {
00075 if (!m_enable_preserve_case) {
00076 string token = it->as_string();
00077 for (std::string::iterator sit = token.begin();
00078 sit != token.end(); ++sit) {
00079 *sit = tolower(*sit);
00080 }
00081 mert::Vocabulary::const_iterator cit = m_vocab->find(token);
00082 if (cit == m_vocab->end()) {
00083 encoded.push_back(kUnknownToken);
00084 } else {
00085 encoded.push_back(cit->second);
00086 }
00087 } else {
00088 mert::Vocabulary::const_iterator cit = m_vocab->find(it->as_string());
00089 if (cit == m_vocab->end()) {
00090 encoded.push_back(kUnknownToken);
00091 } else {
00092 encoded.push_back(cit->second);
00093 }
00094 }
00095 }
00096 }
00097
00101 void Scorer::setFactors(const string& factors)
00102 {
00103 if (factors.empty()) return;
00104 vector<string> factors_vec;
00105 split(factors, '|', factors_vec);
00106 for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
00107 {
00108 int factor = atoi(it->c_str());
00109 m_factors.push_back(factor);
00110 }
00111 }
00112
00116 void Scorer::setFilter(const string& filterCommand)
00117 {
00118 if (filterCommand.empty()) return;
00119 m_filter = new PreProcessFilter(filterCommand);
00120 }
00121
00125 string Scorer::applyFactors(const string& sentence) const
00126 {
00127 if (m_factors.size() == 0) return sentence;
00128
00129 vector<string> tokens;
00130 split(sentence, ' ', tokens);
00131
00132 stringstream sstream;
00133 for (size_t i = 0; i < tokens.size(); ++i)
00134 {
00135 if (tokens[i] == "") continue;
00136
00137 vector<string> factors;
00138 split(tokens[i], '|', factors);
00139
00140 int fsize = factors.size();
00141
00142 if (i > 0) sstream << " ";
00143
00144 for (size_t j = 0; j < m_factors.size(); ++j)
00145 {
00146 int findex = m_factors[j];
00147 if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
00148
00149 if (j > 0) sstream << "|";
00150 sstream << factors[findex];
00151 }
00152 }
00153 return sstream.str();
00154 }
00155
00159 string Scorer::applyFilter(const string& sentence) const
00160 {
00161 if (m_filter)
00162 {
00163 return m_filter->ProcessSentence(sentence);
00164 }
00165 else
00166 {
00167 return sentence;
00168 }
00169 }
00170
00171 float Scorer::score(const candidates_t& candidates) const {
00172 diffs_t diffs;
00173 statscores_t scores;
00174 score(candidates, diffs, scores);
00175 return scores[0];
00176 }
00177
00178 }