// Tesseract 3.01
00001 00002 // File: tessdatamanager.h 00003 // Description: Functions to handle loading/combining tesseract data files. 00004 // Author: Daria Antonova 00005 // Created: Wed Jun 03 11:26:43 PST 2009 00006 // 00007 // (C) Copyright 2009, Google Inc. 00008 // Licensed under the Apache License, Version 2.0 (the "License"); 00009 // you may not use this file except in compliance with the License. 00010 // You may obtain a copy of the License at 00011 // http://www.apache.org/licenses/LICENSE-2.0 00012 // Unless required by applicable law or agreed to in writing, software 00013 // distributed under the License is distributed on an "AS IS" BASIS, 00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 // See the License for the specific language governing permissions and 00016 // limitations under the License. 00017 // 00019 00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ 00022 00023 #include <stdio.h> 00024 #include "host.h" 00025 #include "tprintf.h" 00026 00027 static const char kTrainedDataSuffix[] = "traineddata"; 00028 00029 // When adding new tessdata types and file suffixes, please make sure to 00030 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. 
00031 static const char kLangConfigFileSuffix[] = "config"; 00032 static const char kUnicharsetFileSuffix[] = "unicharset"; 00033 static const char kAmbigsFileSuffix[] = "unicharambigs"; 00034 static const char kBuiltInTemplatesFileSuffix[] = "inttemp"; 00035 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable"; 00036 static const char kNormProtoFileSuffix[] = "normproto"; 00037 static const char kPuncDawgFileSuffix[] = "punc-dawg"; 00038 static const char kSystemDawgFileSuffix[] = "word-dawg"; 00039 static const char kNumberDawgFileSuffix[] = "number-dawg"; 00040 static const char kFreqDawgFileSuffix[] = "freq-dawg"; 00041 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs"; 00042 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset"; 00043 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg"; 00044 00045 namespace tesseract { 00046 00047 enum TessdataType { 00048 TESSDATA_LANG_CONFIG, // 0 00049 TESSDATA_UNICHARSET, // 1 00050 TESSDATA_AMBIGS, // 2 00051 TESSDATA_INTTEMP, // 3 00052 TESSDATA_PFFMTABLE, // 4 00053 TESSDATA_NORMPROTO, // 5 00054 TESSDATA_PUNC_DAWG, // 6 00055 TESSDATA_SYSTEM_DAWG, // 7 00056 TESSDATA_NUMBER_DAWG, // 8 00057 TESSDATA_FREQ_DAWG, // 9 00058 TESSDATA_FIXED_LENGTH_DAWGS, // 10 00059 TESSDATA_CUBE_UNICHARSET, // 11 00060 TESSDATA_CUBE_SYSTEM_DAWG, // 12 00061 00062 TESSDATA_NUM_ENTRIES 00063 }; 00064 00069 static const char * const kTessdataFileSuffixes[] = { 00070 kLangConfigFileSuffix, // 0 00071 kUnicharsetFileSuffix, // 1 00072 kAmbigsFileSuffix, // 2 00073 kBuiltInTemplatesFileSuffix, // 3 00074 kBuiltInCutoffsFileSuffix, // 4 00075 kNormProtoFileSuffix, // 5 00076 kPuncDawgFileSuffix, // 6 00077 kSystemDawgFileSuffix, // 7 00078 kNumberDawgFileSuffix, // 8 00079 kFreqDawgFileSuffix, // 9 00080 kFixedLengthDawgsFileSuffix, // 10 00081 kCubeUnicharsetFileSuffix, // 11 00082 kCubeSystemDawgFileSuffix, // 12 00083 }; 00084 00089 static const bool kTessdataFileIsText[] = { 00090 true, // 
0 00091 true, // 1 00092 true, // 2 00093 false, // 3 00094 true, // 4 00095 true, // 5 00096 false, // 6 00097 false, // 7 00098 false, // 8 00099 false, // 9 00100 false, // 10 00101 true, // 11 00102 false, // 12 00103 }; 00104 00112 static const int kMaxNumTessdataEntries = 1000; 00113 00114 00115 class TessdataManager { 00116 public: 00117 TessdataManager() { 00118 data_file_ = NULL; 00119 actual_tessdata_num_entries_ = 0; 00120 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { 00121 offset_table_[i] = -1; 00122 } 00123 } 00124 ~TessdataManager() {} 00125 int DebugLevel() { return debug_level_; } 00126 00131 bool Init(const char *data_file_name, int debug_level); 00132 00134 inline FILE *GetDataFilePtr() const { return data_file_; } 00135 00141 inline bool SeekToStart(TessdataType tessdata_type) { 00142 if (debug_level_) { 00143 tprintf("TessdataManager: seek to offset %lld - start of tessdata" 00144 "type %d (%s))\n", offset_table_[tessdata_type], 00145 tessdata_type, kTessdataFileSuffixes[tessdata_type]); 00146 } 00147 if (offset_table_[tessdata_type] < 0) { 00148 return false; 00149 } else { 00150 ASSERT_HOST(fseek(data_file_, 00151 static_cast<size_t>(offset_table_[tessdata_type]), 00152 SEEK_SET) == 0); 00153 return true; 00154 } 00155 } 00157 inline inT64 GetEndOffset(TessdataType tessdata_type) const { 00158 int index = tessdata_type + 1; 00159 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) { 00160 ++index; // skip tessdata types not present in the combined file 00161 } 00162 if (debug_level_) { 00163 tprintf("TessdataManager: end offset for type %d is %lld\n", 00164 tessdata_type, 00165 (index == actual_tessdata_num_entries_) ? -1 00166 : offset_table_[index]); 00167 } 00168 return (index == actual_tessdata_num_entries_) ? 
-1 : offset_table_[index] - 1; 00169 } 00171 inline void End() { 00172 if (data_file_ != NULL) { 00173 fclose(data_file_); 00174 data_file_ = NULL; 00175 } 00176 } 00177 00179 static void WriteMetadata(inT64 *offset_table, FILE *output_file); 00180 00186 static bool CombineDataFiles(const char *language_data_path_prefix, 00187 const char *output_filename); 00188 00194 bool OverwriteComponents(const char *new_traineddata_filename, 00195 char **component_filenames, 00196 int num_new_components); 00197 00208 bool ExtractToFile(const char *filename); 00209 00215 static void CopyFile(FILE *input_file, FILE *output_file, 00216 bool newline_end, inT64 num_bytes_to_copy); 00217 00226 static bool TessdataTypeFromFileSuffix(const char *suffix, 00227 TessdataType *type, 00228 bool *text_file); 00229 00234 static bool TessdataTypeFromFileName(const char *filename, 00235 TessdataType *type, 00236 bool *text_file); 00237 00238 private: 00239 00244 static FILE *GetFilePtr(const char *language_data_path_prefix, 00245 const char *file_suffix, bool text_file); 00246 00251 inT64 offset_table_[TESSDATA_NUM_ENTRIES]; 00260 inT32 actual_tessdata_num_entries_; 00261 FILE *data_file_; 00262 int debug_level_; 00263 }; 00264 00265 00266 } // namespace tesseract 00267 00268 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_