Tesseract 3.01
/data/source/tesseract-ocr/ccutil/tessdatamanager.h
Go to the documentation of this file.
00001 
00002 // File:        tessdatamanager.h
00003 // Description: Functions to handle loading/combining tesseract data files.
00004 // Author:      Daria Antonova
00005 // Created:     Wed Jun 03 11:26:43 PST 2009
00006 //
00007 // (C) Copyright 2009, Google Inc.
00008 // Licensed under the Apache License, Version 2.0 (the "License");
00009 // you may not use this file except in compliance with the License.
00010 // You may obtain a copy of the License at
00011 // http://www.apache.org/licenses/LICENSE-2.0
00012 // Unless required by applicable law or agreed to in writing, software
00013 // distributed under the License is distributed on an "AS IS" BASIS,
00014 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015 // See the License for the specific language governing permissions and
00016 // limitations under the License.
00017 //
00019 
00020 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00021 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
00022 
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "tprintf.h"
00026 
// Suffix string "traineddata".
static const char kTrainedDataSuffix[] = "traineddata";

// Suffixes of the individual tessdata component files.
//
// Whenever a tessdata type and its suffix are added here, the TessdataType
// enum and the kTessdataFileSuffixes / kTessdataFileIsText tables must be
// extended to match.

// Configuration, character set and classifier components.
static const char kLangConfigFileSuffix[]       = "config";
static const char kUnicharsetFileSuffix[]       = "unicharset";
static const char kAmbigsFileSuffix[]           = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[]   = "pffmtable";
static const char kNormProtoFileSuffix[]        = "normproto";

// Dawg components.
static const char kPuncDawgFileSuffix[]         = "punc-dawg";
static const char kSystemDawgFileSuffix[]       = "word-dawg";
static const char kNumberDawgFileSuffix[]       = "number-dawg";
static const char kFreqDawgFileSuffix[]         = "freq-dawg";
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";

// Cube components.
static const char kCubeUnicharsetFileSuffix[]   = "cube-unicharset";
static const char kCubeSystemDawgFileSuffix[]   = "cube-word-dawg";
00044 
00045 namespace tesseract {
00046 
// Identifies one component of the tessdata. The numeric value of each
// enumerator is used directly as an index into kTessdataFileSuffixes,
// kTessdataFileIsText and TessdataManager's offset table, so the values
// must stay dense and start at zero.
enum TessdataType {
  TESSDATA_LANG_CONFIG = 0,
  TESSDATA_UNICHARSET = 1,
  TESSDATA_AMBIGS = 2,
  TESSDATA_INTTEMP = 3,
  TESSDATA_PFFMTABLE = 4,
  TESSDATA_NORMPROTO = 5,
  TESSDATA_PUNC_DAWG = 6,
  TESSDATA_SYSTEM_DAWG = 7,
  TESSDATA_NUMBER_DAWG = 8,
  TESSDATA_FREQ_DAWG = 9,
  TESSDATA_FIXED_LENGTH_DAWGS = 10,
  TESSDATA_CUBE_UNICHARSET = 11,
  TESSDATA_CUBE_SYSTEM_DAWG = 12,

  // Count of the types above; must remain the last enumerator.
  TESSDATA_NUM_ENTRIES
};
00064 
00069 static const char * const kTessdataFileSuffixes[] = {
00070   kLangConfigFileSuffix,        // 0
00071   kUnicharsetFileSuffix,        // 1
00072   kAmbigsFileSuffix,            // 2
00073   kBuiltInTemplatesFileSuffix,  // 3
00074   kBuiltInCutoffsFileSuffix,    // 4
00075   kNormProtoFileSuffix,         // 5
00076   kPuncDawgFileSuffix,          // 6
00077   kSystemDawgFileSuffix,        // 7
00078   kNumberDawgFileSuffix,        // 8
00079   kFreqDawgFileSuffix,          // 9
00080   kFixedLengthDawgsFileSuffix,  // 10
00081   kCubeUnicharsetFileSuffix,    // 11
00082   kCubeSystemDawgFileSuffix,    // 12
00083 };
00084 
// Text/binary flag for each tessdata component, indexed by TessdataType:
// true marks a text file, false a binary one. The order of the entries
// must match the order of the enumerators.
static const bool kTessdataFileIsText[] = {
  true,   // 0:  config
  true,   // 1:  unicharset
  true,   // 2:  unicharambigs
  false,  // 3:  inttemp
  true,   // 4:  pffmtable
  true,   // 5:  normproto
  false,  // 6:  punc-dawg
  false,  // 7:  word-dawg
  false,  // 8:  number-dawg
  false,  // 9:  freq-dawg
  false,  // 10: fixed-length-dawgs
  true,   // 11: cube-unicharset
  false,  // 12: cube-word-dawg
};

// NOTE(review): not referenced anywhere in this header; presumably an upper
// bound on the number of entries accepted from a combined data file --
// confirm against the implementation file.
static const int kMaxNumTessdataEntries = 1000;
00113 
00114 
00115 class TessdataManager {
00116  public:
00117   TessdataManager() {
00118     data_file_ = NULL;
00119     actual_tessdata_num_entries_ = 0;
00120     for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
00121       offset_table_[i] = -1;
00122     }
00123   }
00124   ~TessdataManager() {}
00125   int DebugLevel() { return debug_level_; }
00126 
00131   bool Init(const char *data_file_name, int debug_level);
00132 
00134   inline FILE *GetDataFilePtr() const { return data_file_; }
00135 
00141   inline bool SeekToStart(TessdataType tessdata_type) {
00142     if (debug_level_) {
00143       tprintf("TessdataManager: seek to offset %lld - start of tessdata"
00144               "type %d (%s))\n", offset_table_[tessdata_type],
00145               tessdata_type, kTessdataFileSuffixes[tessdata_type]);
00146     }
00147     if (offset_table_[tessdata_type] < 0) {
00148       return false;
00149     } else {
00150       ASSERT_HOST(fseek(data_file_,
00151                         static_cast<size_t>(offset_table_[tessdata_type]),
00152                         SEEK_SET) == 0);
00153       return true;
00154     }
00155   }
00157   inline inT64 GetEndOffset(TessdataType tessdata_type) const {
00158     int index = tessdata_type + 1;
00159     while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
00160       ++index;  // skip tessdata types not present in the combined file
00161     }
00162     if (debug_level_) {
00163       tprintf("TessdataManager: end offset for type %d is %lld\n",
00164               tessdata_type,
00165               (index == actual_tessdata_num_entries_) ? -1
00166               : offset_table_[index]);
00167     }
00168     return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
00169   }
00171   inline void End() {
00172     if (data_file_ != NULL) {
00173       fclose(data_file_);
00174       data_file_ = NULL;
00175     }
00176   }
00177 
00179   static void WriteMetadata(inT64 *offset_table, FILE *output_file);
00180 
00186   static bool CombineDataFiles(const char *language_data_path_prefix,
00187                                const char *output_filename);
00188 
00194   bool OverwriteComponents(const char *new_traineddata_filename,
00195                             char **component_filenames,
00196                             int num_new_components);
00197 
00208   bool ExtractToFile(const char *filename);
00209 
00215   static void CopyFile(FILE *input_file, FILE *output_file,
00216                        bool newline_end, inT64 num_bytes_to_copy);
00217 
00226   static bool TessdataTypeFromFileSuffix(const char *suffix,
00227                                          TessdataType *type,
00228                                          bool *text_file);
00229 
00234   static bool TessdataTypeFromFileName(const char *filename,
00235                                        TessdataType *type,
00236                                        bool *text_file);
00237 
00238  private:
00239 
00244   static FILE *GetFilePtr(const char *language_data_path_prefix,
00245                           const char *file_suffix, bool text_file);
00246 
00251   inT64 offset_table_[TESSDATA_NUM_ENTRIES];
00260   inT32 actual_tessdata_num_entries_;
00261   FILE *data_file_;  
00262   int debug_level_;
00263 };
00264 
00265 
00266 }  // namespace tesseract
00267 
00268 #endif  // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines