// $Id: n_gram.h 383 2010-04-23 15:29:28Z nicolabertoldi $

/******************************************************************************
 IrstLM: IRST Language Model Toolkit
 Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

******************************************************************************/

// n-gram tables
// by M.
Federico 00025 // Copyright Marcello Federico, ITC-irst, 1998 00026 00027 #ifndef MF_NGRAM_H 00028 #define MF_NGRAM_H 00029 00030 #include <fstream> 00031 #include <cassert> 00032 #include "dictionary.h" 00033 00034 #ifdef MYMAXNGRAM 00035 #define MAX_NGRAM MYMAXNGRAM 00036 #else 00037 #define MAX_NGRAM 20 00038 #endif 00039 00040 class dictionary; 00041 00042 //typedef int code; 00043 00044 class ngram{ 00045 int word[MAX_NGRAM]; //encoded ngram 00046 public: 00047 dictionary *dict; //dictionary 00048 char* link; // ngram-tree pointer 00049 char* succlink; // pointer to the first successor 00050 int midx[MAX_NGRAM]; // ngram-tree scan pointer 00051 char* path[MAX_NGRAM]; //path in the ngram-tree 00052 int lev; // ngram-tree level 00053 int size; // ngram size 00054 long long freq; // ngram frequency or integer prob 00055 int succ; // number of successors 00056 float bow; // back-off weight 00057 float prob; // probability 00058 00059 unsigned char info; // ngram-tree info flags 00060 unsigned char pinfo; // ngram-tree parent info flags 00061 int isym; // last interruption symbol 00062 00063 ngram(dictionary* d,int sz=0); 00064 ngram(ngram& ng); 00065 00066 int *wordp()// n-gram pointer 00067 {return wordp(size);} 00068 int *wordp(int k) // n-gram pointer 00069 {return size>=k?&word[MAX_NGRAM-k]:0;} 00070 const int *wordp() const // n-gram pointer 00071 {return wordp(size);} 00072 const int *wordp(int k) const // n-gram pointer 00073 {return size>=k?&word[MAX_NGRAM-k]:0;} 00074 00075 00076 int containsWord(const char* s,int lev){ 00077 00078 int c=dict->encode(s); 00079 if (c == -1) return 0; 00080 00081 assert(lev <= size); 00082 for (int i=0;i<lev;i++){ 00083 if (*wordp(size-i)== c) return 1; 00084 } 00085 return 0; 00086 } 00087 00088 00089 void trans(const ngram& ng); 00090 void invert (const ngram& ng); 00091 void shift (); 00092 00093 friend std::ifstream& operator>> (std::ifstream& fi,ngram& ng); 00094 friend std::ofstream& operator<< (std::ofstream& 
fi,ngram& ng); 00095 friend std::istream& operator>> (std::istream& fi,ngram& ng); 00096 friend std::ostream& operator<< (std::ostream& fi,ngram& ng); 00097 00098 inline bool operator==(const ngram &compare) const 00099 { 00100 if ( size != compare.size || dict != compare.dict) 00101 return false; 00102 else 00103 for (int i=size;i>0;i--) 00104 if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) 00105 return false; 00106 return true; 00107 } 00108 00109 inline bool operator!=(const ngram &compare) const 00110 { 00111 if ( size != compare.size || dict != compare.dict) 00112 return true; 00113 else 00114 for (int i=size;i>0;i--) 00115 if (word[MAX_NGRAM-i] != compare.word[MAX_NGRAM-i]) 00116 return true; 00117 return false; 00118 } 00119 00120 00121 00122 inline int ckhisto(int sz){ 00123 00124 for (int i=sz;i>1;i--) 00125 if (*wordp(i)==dict->oovcode()) 00126 return 0; 00127 return 1; 00128 } 00129 00130 int pushc(int c); 00131 int pushw(const char* w); 00132 00133 //~ngram(); 00134 00135 00136 00137 }; 00138 00139 #endif 00140 00141 00142
1.5.9