00001 #ifndef LM_BINARY_FORMAT_H
00002 #define LM_BINARY_FORMAT_H
00003
#include "lm/config.hh"
#include "lm/model_type.hh"
#include "lm/read_arpa.hh"

#include "util/file_piece.hh"
#include "util/mmap.hh"
#include "util/scoped.hh"

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

#include <stdint.h>
00016
00017 namespace lm {
00018 namespace ngram {
00019
// Table of model names, declared with a fixed bound of six entries; the
// definition lives in another translation unit.
// NOTE(review): presumably indexed by ModelType -- confirm against the definition.
extern const char *kModelNames[6];
00021
00022
00023
00024
00025
00026 bool RecognizeBinary(const char *file, ModelType &recognized);
00027
00028 struct FixedWidthParameters {
00029 unsigned char order;
00030 float probing_multiplier;
00031
00032 ModelType model_type;
00033
00034 bool has_vocabulary;
00035 unsigned int search_version;
00036 };
00037
00038
// Round `a` up to the next multiple of 8; the result has type std::ptrdiff_t.
// Values that are already a multiple of 8 are returned unchanged.
// Kept as a macro rather than a function so it remains usable in constant
// expressions under pre-C++11 compilers.
// Callers should pass a >= 1: because of the `(a)-1` step, a signed 0 yields 8
// (not 0) and an unsigned 0 wraps around before the division.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
00040
00041
00042 struct Parameters {
00043 FixedWidthParameters fixed;
00044 std::vector<uint64_t> counts;
00045 };
00046
00047 class BinaryFormat {
00048 public:
00049 explicit BinaryFormat(const Config &config);
00050
00051
00052
00053 void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms);
00054
00055 void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
00056
00057 void *LoadBinary(std::size_t size);
00058
00059 uint64_t VocabStringReadingOffset() const {
00060 assert(vocab_string_offset_ != kInvalidOffset);
00061 return vocab_string_offset_;
00062 }
00063
00064
00065
00066 void *SetupJustVocab(std::size_t memory_size, uint8_t order);
00067
00068 void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
00069
00070 void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
00071
00072 void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
00073
00074 private:
00075 void MapFile(void *&vocab_base, void *&search_base);
00076
00077
00078 const Config::WriteMethod write_method_;
00079 const char *write_mmap_;
00080 util::LoadMethod load_method_;
00081
00082
00083 util::scoped_fd file_;
00084
00085
00086 util::scoped_memory mapping_;
00087
00088
00089
00090
00091 util::scoped_memory memory_vocab_, memory_search_;
00092
00093
00094
00095 std::size_t header_size_, vocab_size_, vocab_pad_;
00096
00097 uint64_t vocab_string_offset_;
00098
00099 static const uint64_t kInvalidOffset = (uint64_t)-1;
00100 };
00101
// Report whether the file open on descriptor `fd` is in the binary format.
// NOTE(review): whether this call moves the descriptor's file position or
// throws on a malformed file is not visible here -- confirm before relying on it.
bool IsBinaryFormat(int fd);
00103
00104 }
00105 }
00106 #endif // LM_BINARY_FORMAT_H