00001 #include "TerScorer.h"
00002
00003 #include <cmath>
00004 #include <sstream>
00005 #include <stdexcept>
00006
00007 #include "ScoreStats.h"
00008 #include "TER/tercalc.h"
00009 #include "TER/terAlignment.h"
00010 #include "Util.h"
00011
00012 using namespace std;
00013 using namespace TERCPPNS_TERCpp;
00014
00015 namespace MosesTuning
00016 {
00017
00018
00019 TerScorer::TerScorer(const string& config)
00020 : StatisticsBasedScorer("TER",config), kLENGTH(2) {}
00021
00022 TerScorer::~TerScorer() {}
00023
00024 void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
00025 {
00026
00027
00028 for ( int incRefs = 0; incRefs < ( int ) referenceFiles.size(); incRefs++ ) {
00029 stringstream convert;
00030 m_references.clear();
00031
00032 m_ref_tokens.clear();
00033 m_ref_lengths.clear();
00034 ifstream in ( referenceFiles.at ( incRefs ).c_str() );
00035 if ( !in ) {
00036 throw runtime_error ( "Unable to open " + referenceFiles.at ( incRefs ) );
00037 }
00038 string line;
00039 int sid = 0;
00040 while ( getline ( in, line ) ) {
00041 line = this->preprocessSentence(line);
00042 vector<int> tokens;
00043 TokenizeAndEncode(line, tokens);
00044 m_references.push_back ( tokens );
00045 TRACE_ERR ( "." );
00046 ++sid;
00047 }
00048 m_multi_references.push_back ( m_references );
00049 }
00050
00051 TRACE_ERR ( endl );
00052 m_references=m_multi_references.at(0);
00053 }
00054
00055 void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
00056 {
00057 string sentence = this->preprocessSentence(text);
00058
00059 terAlignment result;
00060 result.numEdits = 0.0 ;
00061 result.numWords = 0.0 ;
00062 result.averageWords = 0.0;
00063
00064 for ( int incRefs = 0; incRefs < ( int ) m_multi_references.size(); incRefs++ ) {
00065 if ( sid >= m_multi_references.at(incRefs).size() ) {
00066 stringstream msg;
00067 msg << "Sentence id (" << sid << ") not found in reference set";
00068 throw runtime_error ( msg.str() );
00069 }
00070
00071 vector<int> testtokens;
00072 vector<int> reftokens;
00073 reftokens = m_multi_references.at ( incRefs ).at ( sid );
00074 double averageLength=0.0;
00075 for ( int incRefsBis = 0; incRefsBis < ( int ) m_multi_references.size(); incRefsBis++ ) {
00076 if ( sid >= m_multi_references.at(incRefsBis).size() ) {
00077 stringstream msg;
00078 msg << "Sentence id (" << sid << ") not found in reference set";
00079 throw runtime_error ( msg.str() );
00080 }
00081 averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
00082 }
00083 averageLength=averageLength/( double ) m_multi_references.size();
00084 TokenizeAndEncode(sentence, testtokens);
00085 terCalc * evaluation=new terCalc();
00086 evaluation->setDebugMode ( false );
00087 terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
00088 tmp_result.averageWords=averageLength;
00089 if ( ( result.numEdits == 0.0 ) && ( result.averageWords == 0.0 ) ) {
00090 result = tmp_result;
00091 } else if ( result.scoreAv() > tmp_result.scoreAv() ) {
00092 result = tmp_result;
00093 }
00094 delete evaluation;
00095 }
00096 ostringstream stats;
00097
00098
00099 stats << result.numEdits*100.0 << " " << result.averageWords*100.0 << " " << result.scoreAv()*100.0 << " " ;
00100 string stats_str = stats.str();
00101 entry.set ( stats_str );
00102 }
00103
00104 float TerScorer::calculateScore(const vector<ScoreStatsType>& comps) const
00105 {
00106 float denom = 1.0 * comps[1];
00107 float num = -1.0 * comps[0];
00108 if ( denom == 0 ) {
00109
00110 return 1.0;
00111 } else {
00112 return (1.0+(num / denom));
00113 }
00114 }
00115
00116 }