00001 #include "CderScorer.h"
00002
00003 #include <algorithm>
00004 #include <fstream>
00005 #include <stdexcept>
00006
00007 using namespace std;
00008
namespace
{

// Cost of aligning two encoded words with each other:
// 0 when the ids are equal, 1 when they differ.
inline int CalcDistance(int word1, int word2)
{
  if (word1 == word2) {
    return 0;
  }
  return 1;
}

} // namespace
00018
00019 namespace MosesTuning
00020 {
00021
00022
00023 CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
00024 : StatisticsBasedScorer(allowed_long_jumps ? "CDER" : "WER", config),
00025 m_allowed_long_jumps(allowed_long_jumps) {}
00026
00027 CderScorer::~CderScorer() {}
00028
00029 void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
00030 {
00031
00032 m_ref_sentences.clear();
00033
00034
00035 for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
00036 ifstream refin(referenceFiles[rid].c_str());
00037 if (!refin) {
00038 throw runtime_error("Unable to open: " + referenceFiles[rid]);
00039 }
00040 m_ref_sentences.push_back(vector<sent_t>());
00041 string line;
00042 while (getline(refin,line)) {
00043 line = this->preprocessSentence(line);
00044 sent_t encoded;
00045 TokenizeAndEncode(line, encoded);
00046 m_ref_sentences[rid].push_back(encoded);
00047 }
00048 }
00049 }
00050
00051 void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
00052 {
00053 string sentence = this->preprocessSentence(text);
00054
00055 vector<ScoreStatsType> stats;
00056 prepareStatsVector(sid, sentence, stats);
00057 entry.set(stats);
00058 }
00059
00060 void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<ScoreStatsType>& stats)
00061 {
00062 sent_t cand;
00063 TokenizeAndEncode(text, cand);
00064
00065 float max = -2;
00066 vector<ScoreStatsType> tmp;
00067 for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
00068 const sent_t& ref = m_ref_sentences[rid][sid];
00069 tmp.clear();
00070 computeCD(cand, ref, tmp);
00071 int score = calculateScore(tmp);
00072 if (rid == 0) {
00073 stats = tmp;
00074 max = score;
00075 } else if (score > max) {
00076 stats = tmp;
00077 max = score;
00078 }
00079 }
00080 }
00081
00082 float CderScorer::calculateScore(const vector<ScoreStatsType>& comps) const
00083 {
00084 if (comps.size() != 2) {
00085 throw runtime_error("Size of stat vector for CDER is not 2");
00086 }
00087 if (comps[1] == 0) return 1.0f;
00088 return 1.0f - (comps[0] / static_cast<float>(comps[1]));
00089 }
00090
00091 void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
00092 vector<ScoreStatsType>& stats) const
00093 {
00094 int I = cand.size() + 1;
00095 int L = ref.size() + 1;
00096
00097 int l = 0;
00098
00099 vector<int>* row = new vector<int>(I);
00100
00101
00102 for (int i = 0; i < I; ++i) (*row)[i] = i;
00103
00104
00105 if (m_allowed_long_jumps) {
00106 for (int i = 1; i < I; ++i) (*row)[i] = 1;
00107 }
00108
00109
00110 while (++l < L) {
00111 vector<int>* nextRow = new vector<int>(I);
00112 for (int i = 0; i < I; ++i) {
00113 vector<int> possibleCosts;
00114 if (i > 0) {
00115 possibleCosts.push_back((*nextRow)[i-1] + 1);
00116 possibleCosts.push_back((*row)[i-1] + CalcDistance(ref[l-1], cand[i-1]));
00117 }
00118 possibleCosts.push_back((*row)[i] + 1);
00119 (*nextRow)[i] = *min_element(possibleCosts.begin(), possibleCosts.end());
00120 }
00121
00122 if (m_allowed_long_jumps) {
00123
00124 int LJ = 1 + *min_element(nextRow->begin(), nextRow->end());
00125
00126 for (int i = 0; i < I; ++i) {
00127 (*nextRow)[i] = min((*nextRow)[i], LJ);
00128 }
00129 }
00130
00131 delete row;
00132 row = nextRow;
00133 }
00134
00135 stats.resize(2);
00136 stats[0] = *(row->rbegin());
00137 stats[1] = ref.size();
00138
00139 delete row;
00140 }
00141
00142 }