00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #pragma once
00023
00024 #include <iostream>
00025 #include <fstream>
00026 #include <sstream>
00027 #include <string>
00028 #include <vector>
00029 #include <cmath>
00030 #include <cassert>
00031 #include <limits>
00032 #include <map>
00033 #include <cstdlib>
00034 #include <cstring>
00035 #include "util/exception.hh"
00036 #include "util/string_stream.hh"
00037 #include "TypeDef.h"
00038
00039 namespace Moses
00040 {
00041
00052
//! TRACE_ERR(str): writes the stream expression `str` to std::cerr when
//! TRACE_ENABLE is defined; otherwise it expands to an empty statement and
//! `str` is not evaluated. Any earlier definition of TRACE_ERR is discarded
//! first (#undef of an undefined name is a no-op, so no guard is needed).
#undef TRACE_ERR
#if defined(TRACE_ENABLE)
#define TRACE_ERR(str) do { std::cerr << str; } while (false)
#else
#define TRACE_ERR(str) do {} while (false)
#endif
00061
00065
//! VERBOSE(level,str): emits `str` through TRACE_ERR when IFVERBOSE(level)
//! holds. Expands to a braced block, so it can be used with or without a
//! trailing semicolon.
#ifdef VERBOSE
#undef VERBOSE
#endif
#define VERBOSE(level,str) { IFVERBOSE(level) { TRACE_ERR(str); } }

//! IFVERBOSE(level): opens an `if` that is true when the verbose level
//! reported by Moses::StaticData::Instance() is at least `level`.
//! `level` is parenthesised so that arbitrary expressions (e.g. a ternary)
//! keep their meaning inside the comparison.
#ifdef IFVERBOSE
#undef IFVERBOSE
#endif
#define IFVERBOSE(level) if (Moses::StaticData::Instance().GetVerboseLevel() >= (level))
//! XVERBOSE: like VERBOSE, with the message prefixed by "[file:line] ".
#define XVERBOSE(level,str) VERBOSE(level, "[" << HERE << "] " << str)
//! HERE: stream fragment "<file>:<line>" of the expansion site.
#define HERE __FILE__ << ":" << __LINE__
//! FEATUREVERBOSE: like FEATUREVERBOSE2, with the message prefixed by
//! "[<GetScoreProducerDescription()>] "; needs that function in scope.
#define FEATUREVERBOSE(level,str) FEATUREVERBOSE2(level, "[" << GetScoreProducerDescription() << "] " << str)
//! FEATUREVERBOSE2: emits `str` through TRACE_ERR when IFFEATUREVERBOSE(level) holds.
#define FEATUREVERBOSE2(level,str) { IFFEATUREVERBOSE(level) { TRACE_ERR(str); } }
//! IFFEATUREVERBOSE(level): opens an `if` using the `m_verbosity` visible at
//! the expansion site. When m_verbosity equals the maximum std::size_t value
//! the StaticData verbose level is compared against `level`; otherwise
//! m_verbosity itself is compared. `level` is parenthesised for the same
//! reason as in IFVERBOSE.
#define IFFEATUREVERBOSE(level) if ((m_verbosity == std::numeric_limits<std::size_t>::max() && StaticData::Instance().GetVerboseLevel() >= (level)) || (m_verbosity != std::numeric_limits<std::size_t>::max() && m_verbosity >= (level)))
00081
00082
//! NTH_ELEMENT3 / NTH_ELEMENT4: forward to std::nth_element (without / with a
//! comparator), except when compiling with GCC 4.8.1 or 4.8.2, where the whole
//! range is sorted with std::sort instead and `middle` is ignored.
//! NOTE(review): presumably a workaround for a std::nth_element defect in
//! those GCC releases -- confirm before removing.
#if !(__GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2))
#define NTH_ELEMENT3(begin, middle, end) std::nth_element(begin, middle, end)
#define NTH_ELEMENT4(begin, middle, end, orderer) std::nth_element(begin, middle, end, orderer)
#else
#define NTH_ELEMENT3(begin, middle, end) std::sort(begin, end)
#define NTH_ELEMENT4(begin, middle, end, orderer) std::sort(begin, end, orderer)
#endif
00091
00092
//! Declaration only; the definition is not part of this listing.
//! NOTE(review): presumably returns a lower-cased copy of `str` -- confirm
//! against the definition.
00093 const std::string ToLower(const std::string& str);
00094
//! Returns a copy of `str` with every leading and trailing character that
//! occurs in `dropChars` removed. Characters in the middle are untouched.
//! Returns an empty string when `str` is empty or consists only of
//! `dropChars` characters.
//! \param str        text to trim
//! \param dropChars  set of characters to strip (default: space, tab, LF, CR)
inline std::string Trim(const std::string& str, const std::string& dropChars = " \t\n\r")
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    // nothing but droppable characters (or empty input)
    return std::string();
  }
  // a non-droppable character exists, so find_last_not_of cannot fail here
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
00102
//! Formats `input` with its stream insertion operator and returns the
//! resulting text.
template<typename T>
inline std::string SPrint(const T &input)
{
  std::stringstream formatted;
  formatted << input;
  return formatted.str();
}
00111
//! Parses `input` into a value of type T using stream extraction.
//! The result is value-initialised first, so when nothing can be extracted
//! (for example an empty string) a well-defined T() is returned instead of an
//! indeterminate value.
template<typename T>
inline T Scan(const std::string &input)
{
  std::stringstream stream(input);
  T ret = T();
  stream >> ret;
  return ret;
}

//! Specialization for std::string: returns `input` unchanged, so embedded
//! whitespace is preserved (stream extraction would stop at the first blank).
template<>
inline std::string Scan<std::string>(const std::string &input)
{
  return input;
}
00128
00129 template<>
00130 inline WordAlignmentSort Scan<WordAlignmentSort>(const std::string &input)
00131 {
00132 return (WordAlignmentSort) Scan<size_t>(input);
00133 }
00134
00135 template<>
00136 inline InputTypeEnum Scan<InputTypeEnum>(const std::string &input)
00137 {
00138 return (InputTypeEnum) Scan<size_t>(input);
00139 }
00140
00141 template<>
00142 inline SearchAlgorithm Scan<SearchAlgorithm>(const std::string &input)
00143 {
00144 return (SearchAlgorithm) Scan<size_t>(input);
00145 }
00146
00147 template<>
00148 inline S2TParsingAlgorithm Scan<S2TParsingAlgorithm>(const std::string &input)
00149 {
00150 return (S2TParsingAlgorithm) Scan<size_t>(input);
00151 }
00152
00153 template<>
00154 inline SourceLabelOverlap Scan<SourceLabelOverlap>(const std::string &input)
00155 {
00156 return (SourceLabelOverlap) Scan<size_t>(input);
00157 }
00158
00159 template<>
00160 inline XmlInputType Scan<XmlInputType>(const std::string &input)
00161 {
00162 XmlInputType ret;
00163 if (input=="exclusive") ret = XmlExclusive;
00164 else if (input=="inclusive") ret = XmlInclusive;
00165 else if (input=="constraint") ret = XmlConstraint;
00166 else if (input=="ignore") ret = XmlIgnore;
00167 else if (input=="pass-through") ret = XmlPassThrough;
00168 else {
00169 UTIL_THROW2("Unknown XML input type");
00170 }
00171
00172 return ret;
00173 }
00174
//! Explicit specialization of Scan for bool. Only declared here; the
//! definition (and therefore the set of accepted spellings) is not part of
//! this listing.
00176 template<>
00177 bool Scan<bool>(const std::string &input);
00178
//! Converts every string of `input` with Scan<T> and returns the results in
//! the same order.
template<typename T>
inline std::vector<T> Scan(const std::vector< std::string > &input)
{
  std::vector<T> converted;
  converted.reserve(input.size());
  for (std::vector<std::string>::const_iterator item = input.begin();
       item != input.end(); ++item) {
    converted.push_back(Scan<T>(*item));
  }
  return converted;
}
00189
//! Converts every string of `input` with Scan<T> and stores the results in
//! `output`, which is resized to input.size(); previous contents are
//! overwritten.
template<typename T>
inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
{
  const size_t count = input.size();
  output.resize(count);
  size_t index = 0;
  while (index != count) {
    output[index] = Scan<T>(input[index]);
    ++index;
  }
}
00199
//! Returns a copy of `str` in which every non-overlapping occurrence of
//! `todelStr` is replaced by `toaddStr`, scanning left to right.
//! Inserted text is never rescanned, so the call terminates even when
//! `toaddStr` contains `todelStr`, and adjacent occurrences are all handled
//! when `toaddStr` is empty.
//! An empty `todelStr` matches nothing: `str` is returned unchanged.
inline std::string Replace(const std::string& str,
                           const std::string& todelStr,
                           const std::string& toaddStr)
{
  if (todelStr.empty()) {
    // find("") succeeds at every position and would never terminate
    return str;
  }
  std::string newStr = str;
  std::string::size_type pos = 0;
  while ((pos = newStr.find(todelStr, pos)) != std::string::npos) {
    newStr.replace(pos, todelStr.size(), toaddStr);
    // resume right after the inserted text
    pos += toaddStr.size();
  }
  return newStr;
}
00212
//! Splits `str` at any character contained in `delimiters` and returns the
//! non-empty pieces in order. Runs of consecutive delimiters, as well as
//! leading and trailing delimiters, produce no empty tokens.
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;

  // start of the current token; npos once only delimiters (or nothing) remain
  std::string::size_type tokenStart = str.find_first_not_of(delimiters);
  while (tokenStart != std::string::npos) {
    // first delimiter after the token, or npos when the token runs to the end
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
  }

  return tokens;
}
00236
00237
//! Same splitting rule as the value-returning Tokenize, but the tokens are
//! appended to `output`; existing elements of `output` are kept.
inline void Tokenize(std::vector<std::string> &output
                     , const std::string& str
                     , const std::string& delimiters = " \t")
{
  // start of the current token; npos once only delimiters (or nothing) remain
  std::string::size_type tokenStart = str.find_first_not_of(delimiters);
  while (tokenStart != std::string::npos) {
    // first delimiter after the token, or npos when the token runs to the end
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
    output.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
  }
}
00256
00258 template<typename T>
00259 inline std::vector<T> Tokenize( const std::string &input
00260 , const std::string& delimiters = " \t")
00261 {
00262 std::vector<std::string> stringVector = Tokenize(input, delimiters);
00263 return Scan<T>( stringVector );
00264 }
00265
00266
00267 template<typename T>
00268 inline void Tokenize( std::vector<T> &output
00269 , const std::string &input
00270 , const std::string& delimiters = " \t")
00271 {
00272 std::vector<std::string> stringVector;
00273 Tokenize(stringVector, input, delimiters);
00274 return Scan<T>(output, stringVector );
00275 }
00276
//! Splits `str` at every occurrence of the complete string `separator` and
//! returns the pieces in order. Pieces are returned verbatim (no trimming)
//! and may be empty; the result always holds at least one element.
//! An empty `separator` cannot split anything: the whole of `str` is returned
//! as a single token (previously this case never terminated).
inline std::vector<std::string> TokenizeMultiCharSeparator(
  const std::string& str,
  const std::string& separator)
{
  std::vector<std::string> tokens;
  if (separator.empty()) {
    tokens.push_back(str);
    return tokens;
  }

  std::string::size_type pos = 0;
  std::string::size_type nextPos;
  while ((nextPos = str.find(separator, pos)) != std::string::npos) {
    tokens.push_back(str.substr(pos, nextPos - pos));
    // continue right behind the separator just consumed
    pos = nextPos + separator.size();
  }
  // remainder after the last separator (possibly empty)
  tokens.push_back(str.substr(pos));

  return tokens;
}
00299
00300
00301 inline void TokenizeMultiCharSeparator(std::vector<std::string> &output
00302 ,const std::string& str
00303 ,const std::string& separator)
00304 {
00305 size_t pos = 0;
00306
00307 std::string::size_type nextPos = str.find(separator, pos);
00308
00309 while (nextPos != std::string::npos) {
00310
00311 output.push_back(Trim(str.substr(pos, nextPos - pos)));
00312
00313 pos = nextPos + separator.size();
00314
00315 nextPos = str.find(separator, pos);
00316 }
00317 output.push_back(Trim(str.substr(pos, nextPos - pos)));
00318 }
00319
//! Splits `str` once, at the first character found in `delimiters`.
//! Returns two elements (text before the delimiter, text after it -- either
//! may be empty) or, when no delimiter occurs, a single element holding the
//! whole string.
inline std::vector<std::string> TokenizeFirstOnly(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> tokens;
  const std::string::size_type split = str.find_first_of(delimiters);

  if (split == std::string::npos) {
    tokens.push_back(str);
    return tokens;
  }

  tokens.push_back(str.substr(0, split));
  // everything behind the delimiter character itself
  tokens.push_back(str.substr(split + 1));
  return tokens;
}
00339
00340
00344 template <typename T>
00345 std::string Join(const std::string& delimiter, const std::vector<T>& items)
00346 {
00347 util::StringStream outstr;
00348 if(items.size() == 0) return "";
00349 outstr << items[0];
00350 for(unsigned int i = 1; i < items.size(); i++)
00351 outstr << delimiter << items[i];
00352 return outstr.str();
00353 }
00354
00355
00356
00357
00358 template<typename It>
00359 std::string Join(const std::string &delim, It begin, It end)
00360 {
00361 util::StringStream outstr;
00362 if (begin != end)
00363 outstr << *begin++;
00364 for ( ; begin != end; ++begin)
00365 outstr << delim << *begin;
00366 return outstr.str();
00367 }
00368
//! Returns the natural logarithm of `prob`.
inline float TransformScore(float prob)
{
  const float logProb = log(prob);
  return logProb;
}
00374
//! Returns e raised to `score`, i.e. the inverse of TransformScore.
inline float UntransformScore(float score)
{
  const float prob = exp(score);
  return prob;
}
00380
//! Multiplies `irstScore` by 2.30258509299405 (numerically ln 10), which
//! rescales a base-10 logarithm to a natural logarithm.
//! NOTE(review): presumably the input is a log10 score from the LM toolkit --
//! confirm against callers.
inline float TransformLMScore(float irstScore)
{
  const float scale = 2.30258509299405f;
  return irstScore * scale;
}
00386
//! Divides `logNScore` by 2.30258509299405 (numerically ln 10); the inverse
//! of TransformLMScore.
inline float UntransformLMScore(float logNScore)
{
  const float scale = 2.30258509299405f;
  return logNScore / scale;
}
00392
00394 inline float FloorScore(float logScore)
00395 {
00396 return (std::max)(logScore , LOWEST_SCORE);
00397 }
00398
00403 inline float CalcTranslationScore(const std::vector<float> &probVector,
00404 const std::vector<float> &weightT)
00405 {
00406 UTIL_THROW_IF2(weightT.size() != probVector.size(),
00407 "Weight and score vector sizes not the same");
00408 float rv=0.0;
00409 for(float const *sb=&probVector[0],*se=sb+probVector.size(),*wb=&weightT[0];
00410 sb!=se; ++sb, ++wb)
00411 rv += TransformScore(*sb) * (*wb);
00412 return rv;
00413 }
00414
//! TO_STRING(): expands to the member declaration
//! `std::string ToString() const;` (the expansion already ends with a
//! semicolon). Intended for use inside a class body.
00421 #define TO_STRING() std::string ToString() const;
00422
//! TO_STRING_BODY(CLASS): expands to the out-of-class definition of
//! CLASS::ToString(), which streams *this into a std::stringstream and
//! returns the collected text; CLASS therefore needs a stream insertion
//! operator. The last line of the definition ends in a backslash, so the
//! macro continues onto the following (empty) line.
00424 #define TO_STRING_BODY(CLASS) \
00425 std::string CLASS::ToString() const \
00426 { \
00427 std::stringstream out; \
00428 out << *this; \
00429 return out.str(); \
00430 } \
00431
//! Calls `delete` on every element of `coll` (the elements must be pointers
//! obtained from `new`) and then empties the container.
template<class COLL>
void RemoveAllInColl(COLL &coll)
{
  typename COLL::const_iterator cursor = coll.begin();
  while (cursor != coll.end()) {
    delete *cursor;
    ++cursor;
  }
  coll.clear();
}
00441
//! Calls `delete` on the mapped value (`->second`) of every entry of `coll`
//! (the values must be pointers obtained from `new`) and then empties the
//! container. Keys are not touched.
template<class COLL>
void RemoveAllInMap(COLL &coll)
{
  typename COLL::const_iterator cursor = coll.begin();
  while (cursor != coll.end()) {
    delete cursor->second;
    ++cursor;
  }
  coll.clear();
}
00451
00452
//! Declaration only; defined outside this listing.
//! NOTE(review): presumably returns the path of a directory for temporary
//! files -- confirm against the definition.
00454 std::string GetTempFolder();
//! Declaration only; defined outside this listing.
//! NOTE(review): presumably returns the MD5 digest of the file at `filePath`
//! as text -- confirm format and error behaviour against the definition.
00456 std::string GetMD5Hash(const std::string &filePath);
00457
//! Tries to release unused capacity of `v` by swapping it with a freshly
//! copy-constructed temporary (C++03-compatible idiom). T needs capacity(),
//! size(), a copy constructor and swap().
//! The resulting capacity is whatever the copy constructor chooses; the
//! standard does not require it to equal size() (a std::string with internal
//! small-string storage never does), so no such condition is asserted.
template<typename T>
inline void ShrinkToFit(T& v)
{
  if (v.capacity() > v.size())
    T(v).swap(v);
}
00466
//! Declaration only; defined outside this listing.
//! NOTE(review): presumably reports whether `filePath` names an existing
//! file -- confirm against the definition.
00467 bool FileExists(const std::string& filePath);
00468
00469
//! Declaration only; defined outside this listing. Takes `line` by non-const
//! reference, so the definition may modify it, and returns a vector of
//! string-to-string maps.
//! NOTE(review): presumably extracts DLT markup from `line` and removes it --
//! confirm against the definition.
00470 std::vector< std::map<std::string, std::string> > ProcessAndStripDLT(std::string &line);
00471
00472
//! Declaration only; defined outside this listing. Takes `line` by non-const
//! reference, so the definition may modify it, and returns a string-to-string
//! map.
//! NOTE(review): presumably extracts SGML attributes from `line` and removes
//! the markup -- confirm against the definition.
00473 std::map<std::string, std::string> ProcessAndStripSGML(std::string &line);
00474
//! Declaration only; defined outside this listing. `line` is a non-const
//! reference; `lbrackStr` / `rbrackStr` default to "<" and ">".
//! NOTE(review): presumably pulls the tag named `tagName` out of `line` and
//! returns it -- confirm against the definition.
00475 std::string PassthroughSGML(std::string &line, const std::string tagName,const std::string& lbrackStr="<", const std::string& rbrackStr=">");
00476
//! Extracts the next token of `str`, starting the search at `first_pos`.
//! Leading delimiter characters are skipped; the token ends at the next
//! delimiter or at the end of the string.
//! \param str         text to read from
//! \param first_pos   in: position to start at; out: position of the first
//!                    non-delimiter character after the returned token, or
//!                    std::string::npos converted to int when nothing follows
//! \param delimiters  characters that separate tokens
//! \return the token, or an empty string when no token was found
inline std::string GetFirstString(const std::string& str, int& first_pos, const std::string& delimiters = " \t")
{
  std::string first_str;

  // first character that belongs to a token (npos if none is left)
  std::string::size_type tokenStart = str.find_first_not_of(delimiters, first_pos);

  if (tokenStart != std::string::npos) {
    // delimiter that terminates the token, or npos when it runs to the end
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
    first_str = str.substr(tokenStart, tokenEnd - tokenStart);

    // where the following token would begin
    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
  }

  first_pos = tokenStart;
  return first_str;
}
00503
//! Returns log(exp(log_a) + exp(log_b)), computed as
//! larger + log1p(exp(smaller - larger)) so that only a non-positive value is
//! exponentiated.
template<class T>
T log_sum (T log_a, T log_b)
{
  const bool aIsSmaller = log_a < log_b;
  const T larger = aIsSmaller ? log_b : log_a;
  const T smaller = aIsSmaller ? log_a : log_b;
  return larger + log1p(exp(smaller - larger));
}
00513
00517 inline bool Equals(float a, float b)
00518 {
00519 return fabs(a - b) < FLOAT_EPSILON;
00520 }
00521
00522
//! Configures `stream` for fixed-point floating-point output with `size`
//! digits of precision (default 3). The settings persist on the stream.
inline void FixPrecision(std::ostream& stream, size_t size = 3)
{
  stream.precision(size);
  stream.setf(std::ios::fixed);
}
00529
//! Forward declaration; the class is defined outside this listing.
00530 class FeatureFunction;
00531
//! Declarations only; both functions are defined outside this listing.
//! NOTE(review): presumably print the weights of one feature function /
//! of all feature functions -- confirm against the definitions.
00532 void PrintFeatureWeight(const FeatureFunction* ff);
00533 void ShowWeights();
00534
//! Combined hash and equality functor for T. The one-argument call operators
//! return the object's hash() value, the two-argument call operators compare
//! with T's operator==. Both exist for references and for pointers; the
//! pointer forms dereference their arguments and therefore require non-null
//! pointers.
template<typename T>
class UnorderedComparer
{
public:
  // --- objects passed by reference ---
  size_t operator()(const T& item) const {
    return item.hash();
  }

  bool operator()(const T& lhs, const T& rhs) const {
    return lhs == rhs;
  }

  // --- objects passed by pointer: forward to the reference forms ---
  size_t operator()(const T* item) const {
    return (*this)(*item);
  }

  bool operator()(const T* lhs, const T* rhs) const {
    return (*this)(*lhs, *rhs);
  }

};
00556
00557 }
00558