00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00019
00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00022
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "tprintf.h"
00026 #include "varable.h"
00027
00028 extern BOOL_VAR_H(global_load_punc_dawg, true,
00029 "Load dawg with punctuation patterns.");
00030 extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg.");
00031 extern BOOL_VAR_H(global_load_number_dawg, true,
00032 "Load dawg with number patterns.");
00033 extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg.");
00034
00035 extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
00036 "Debug level for TessdataManager functions.");
00037
// File-name suffix "traineddata".
// NOTE(review): presumably the suffix of the combined data file -- confirm
// against the users of this constant.
static const char kTrainedDataSuffix[]          = "traineddata";

// File-name suffixes, one per entry of kTessdataFileSuffixes below.
static const char kLangConfigFileSuffix[]       = "config";
static const char kUnicharsetFileSuffix[]       = "unicharset";
static const char kAmbigsFileSuffix[]           = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[]   = "pffmtable";
static const char kNormProtoFileSuffix[]        = "normproto";
static const char kPuncDawgFileSuffix[]         = "punc-dawg";
static const char kSystemDawgFileSuffix[]       = "word-dawg";
static const char kNumberDawgFileSuffix[]       = "number-dawg";
static const char kFreqDawgFileSuffix[]         = "freq-dawg";
00052
00053 namespace tesseract {
00054
// Identifies one component of the tessdata.  Enumerators are consecutive
// starting at 0, so they can be used directly as indices into
// TessdataManager::offset_table_; TESSDATA_NUM_ENTRIES is the number of
// component types and must stay last.
enum TessdataType {
  TESSDATA_LANG_CONFIG = 0,  // 0
  TESSDATA_UNICHARSET,       // 1
  TESSDATA_AMBIGS,           // 2
  TESSDATA_INTTEMP,          // 3
  TESSDATA_PFFMTABLE,        // 4
  TESSDATA_NORMPROTO,        // 5
  TESSDATA_PUNC_DAWG,        // 6
  TESSDATA_SYSTEM_DAWG,      // 7
  TESSDATA_NUMBER_DAWG,      // 8
  TESSDATA_FREQ_DAWG,        // 9

  TESSDATA_NUM_ENTRIES       // 10: count of the component types above
};
00069
00074 static const char * const kTessdataFileSuffixes[] = {
00075 kLangConfigFileSuffix,
00076 kUnicharsetFileSuffix,
00077 kAmbigsFileSuffix,
00078 kBuiltInTemplatesFileSuffix,
00079 kBuiltInCutoffsFileSuffix,
00080 kNormProtoFileSuffix,
00081 kPuncDawgFileSuffix,
00082 kSystemDawgFileSuffix,
00083 kNumberDawgFileSuffix,
00084 kFreqDawgFileSuffix,
00085 };
00086
// Per-component flag: true when the component is stored as text, false
// otherwise (going by the table's name).
// NOTE(review): has the same number of entries, in what appears to be the
// same order, as TessdataType / kTessdataFileSuffixes -- confirm against
// its users and keep the tables in sync.
static const bool kTessdataFileIsText[] = {
  true,   // 0
  true,   // 1
  true,   // 2
  false,  // 3
  true,   // 4
  true,   // 5
  false,  // 6
  false,  // 7
  false,  // 8
  false,  // 9
};
00103
// Upper limit on the number of tessdata entries.  Not referenced anywhere
// in this header.
// NOTE(review): presumably a sanity bound applied by the implementation
// file when reading the entry count from a data file -- confirm there.
static const int kMaxNumTessdataEntries = 1000;
00112
00113
00114 class TessdataManager {
00115 public:
00116 TessdataManager() {
00117 data_file_ = NULL;
00118 actual_tessdata_num_entries_ = 0;
00119 for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00120 offset_table_[i] = -1;
00121 }
00122 }
00123 ~TessdataManager() {}
00124
00126 void Init(const char *data_file_name);
00127
00129 inline FILE *GetDataFilePtr() const { return data_file_; }
00130
00136 inline bool SeekToStart(TessdataType tessdata_type) {
00137 if (global_tessdata_manager_debug_level) {
00138 tprintf("TessdataManager: seek to offset %lld (start of tessdata"
00139 "type %d)\n", offset_table_[tessdata_type], tessdata_type);
00140 }
00141 if (offset_table_[tessdata_type] < 0) {
00142 return false;
00143 } else {
00144 ASSERT_HOST(fseek(data_file_,
00145 static_cast<size_t>(offset_table_[tessdata_type]),
00146 SEEK_SET) == 0);
00147 return true;
00148 }
00149 }
00151 inline inT64 GetEndOffset(TessdataType tessdata_type) const {
00152 int index = tessdata_type + 1;
00153 while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
00154 ++index;
00155 }
00156 if (global_tessdata_manager_debug_level) {
00157 tprintf("TessdataManager: end offset for type %d is %lld\n",
00158 tessdata_type,
00159 (index == actual_tessdata_num_entries_) ? -1
00160 : offset_table_[index]);
00161 }
00162 return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
00163 }
00165 inline void End() {
00166 if (data_file_ != NULL) {
00167 fclose(data_file_);
00168 data_file_ = NULL;
00169 }
00170 }
00171
00173 static void WriteMetadata(inT64 *offset_table, FILE *output_file);
00174
00180 static bool CombineDataFiles(const char *language_data_path_prefix,
00181 const char *output_filename);
00182
00188 bool OverwriteComponents(const char *new_traineddata_filename,
00189 char **component_filenames,
00190 int num_new_components);
00191
00202 bool ExtractToFile(const char *filename);
00203
00209 static void CopyFile(FILE *input_file, FILE *output_file,
00210 bool newline_end, inT64 num_bytes_to_copy);
00211
00220 static bool TessdataTypeFromFileSuffix(const char *suffix,
00221 TessdataType *type,
00222 bool *text_file);
00223
00228 static bool TessdataTypeFromFileName(const char *filename,
00229 TessdataType *type,
00230 bool *text_file);
00231
00232 private:
00237 inT64 offset_table_[TESSDATA_NUM_ENTRIES];
00246 inT32 actual_tessdata_num_entries_;
00247 FILE *data_file_;
00248 };
00249
00250
00251 }
00252
00253 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_