Tesseract 3.01
|
00001 00002 // File: tesseractclass.h 00003 // Description: An instance of Tesseract. For thread safety, *every* 00004 // global variable goes in here, directly, or indirectly. 00005 // Author: Ray Smith 00006 // Created: Fri Mar 07 08:17:01 PST 2008 00007 // 00008 // (C) Copyright 2008, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00022 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__ 00023 00024 #include "allheaders.h" 00025 #include "genericvector.h" 00026 #include "params.h" 00027 #include "wordrec.h" 00028 #include "ocrclass.h" 00029 #include "control.h" 00030 #include "docqual.h" 00031 #include "devanagari_processing.h" 00032 #include "textord.h" 00033 00034 class PAGE_RES; 00035 class PAGE_RES_IT; 00036 class BLOCK_LIST; 00037 class CharSamp; 00038 class TO_BLOCK_LIST; 00039 class IMAGE; 00040 class WERD_RES; 00041 class ROW; 00042 class TBOX; 00043 class SVMenuNode; 00044 struct Pix; 00045 class WERD_CHOICE; 00046 class WERD; 00047 class BLOB_CHOICE_LIST_CLIST; 00048 struct OSResults; 00049 00050 00051 // Top-level class for all tesseract global instance data. 00052 // This class either holds or points to all data used by an instance 00053 // of Tesseract, including the memory allocator. When this is 00054 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT! 
00055 // 00056 // NOTE to developers: Do not create cyclic dependencies through this class! 00057 // The directory dependency tree must remain a tree! To keep this clean, 00058 // lower-level code (eg in ccutil, the bottom level) must never need to 00059 // know about the content of a higher-level directory. 00060 // The following scheme will grant the easiest access to lower-level 00061 // global members without creating a cyclic dependency: 00062 // 00063 // Class Hierarchy (^ = inheritance): 00064 // 00065 // CCUtil (ccutil/ccutil.h) 00066 // ^ Members include: UNICHARSET 00067 // CUtil (cutil/cutil_class.h) 00068 // ^ Members include: TBLOB*, TEXTBLOCK* 00069 // CCStruct (ccstruct/ccstruct.h) 00070 // ^ Members include: Image 00071 // Classify (classify/classify.h) 00072 // ^ Members include: Dict 00073 // WordRec (wordrec/wordrec.h) 00074 // ^ Members include: WERD*, DENORM* 00075 // Tesseract (ccmain/tesseractclass.h) 00076 // Members include: Pix*, CubeRecoContext*, 00077 // TesseractCubeCombiner* 00078 // 00079 // Other important classes: 00080 // 00081 // TessBaseAPI (api/baseapi.h) 00082 // Members include: BLOCK_LIST*, PAGE_RES*, 00083 // Tesseract*, ImageThresholder* 00084 // Dict (dict/dict.h) 00085 // Members include: Image* (private) 00086 // 00087 // NOTE: that each level contains members that correspond to global 00088 // data that is defined (and used) at that level, not necessarily where 00089 // the type is defined so for instance: 00090 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs"); 00091 // goes inside the Textord class, not the cc_util class. 00092 00093 namespace tesseract { 00094 00095 class CubeLineObject; 00096 class CubeObject; 00097 class CubeRecoContext; 00098 class TesseractCubeCombiner; 00099 00100 // A collection of various variables for statistics and debugging. 
00101 struct TesseractStats { 00102 TesseractStats() 00103 : adaption_word_number(0), 00104 doc_blob_quality(0), 00105 doc_outline_errs(0), 00106 doc_char_quality(0), 00107 good_char_count(0), 00108 doc_good_char_quality(0), 00109 word_count(0), 00110 dict_words(0), 00111 tilde_crunch_written(false), 00112 last_char_was_newline(true), 00113 last_char_was_tilde(false), 00114 write_results_empty_block(true) {} 00115 00116 inT32 adaption_word_number; 00117 inT16 doc_blob_quality; 00118 inT16 doc_outline_errs; 00119 inT16 doc_char_quality; 00120 inT16 good_char_count; 00121 inT16 doc_good_char_quality; 00122 inT32 word_count; // count of words in the document 00123 inT32 dict_words; // number of dictionary words in the document 00124 STRING dump_words_str; // accumulator used by dump_words() 00125 // Flags used by write_results() 00126 bool tilde_crunch_written; 00127 bool last_char_was_newline; 00128 bool last_char_was_tilde; 00129 bool write_results_empty_block; 00130 }; 00131 00132 class Tesseract : public Wordrec { 00133 public: 00134 Tesseract(); 00135 ~Tesseract(); 00136 00137 void Clear(); 00138 00139 // Simple accessors. 00140 const FCOORD& reskew() const { 00141 return reskew_; 00142 } 00143 // Destroy any existing pix and return a pointer to the pointer. 
00144 Pix** mutable_pix_binary() { 00145 Clear(); 00146 return &pix_binary_; 00147 } 00148 Pix* pix_binary() const { 00149 return pix_binary_; 00150 } 00151 Pix* pix_grey() const { 00152 return pix_grey_; 00153 } 00154 void set_pix_grey(Pix* grey_pix) { 00155 pix_grey_ = grey_pix; 00156 } 00157 int ImageWidth() const { 00158 return pixGetWidth(pix_binary_); 00159 } 00160 int ImageHeight() const { 00161 return pixGetHeight(pix_binary_); 00162 } 00163 00164 const ShiroRekhaSplitter& splitter() const { 00165 return splitter_; 00166 } 00167 ShiroRekhaSplitter* mutable_splitter() { 00168 return &splitter_; 00169 } 00170 const Textord& textord() const { 00171 return textord_; 00172 } 00173 Textord* mutable_textord() { 00174 return &textord_; 00175 } 00176 00177 bool right_to_left() const { 00178 return right_to_left_; 00179 } 00180 00181 void SetBlackAndWhitelist(); 00182 00183 // Perform steps to prepare underlying binary image/other data structures for 00184 // page segmentation. Uses the strategy specified in the global variable 00185 // pageseg_devanagari_split_strategy for performing splitting while preparing for 00186 // page segmentation. 00187 void PrepareForPageseg(); 00188 00189 // Perform steps to prepare underlying binary image/other data structures for 00190 // Tesseract OCR. The current segmentation is required by this method. 00191 // Uses the strategy specified in the global variable 00192 // ocr_devanagari_split_strategy for performing splitting while preparing for 00193 // Tesseract ocr. 00194 void PrepareForTessOCR(BLOCK_LIST* block_list, 00195 Tesseract* osd_tess, OSResults* osr); 00196 00197 // Perform steps to prepare underlying binary image/other data structures for 00198 // Cube OCR. 
00199 void PrepareForCubeOCR(); 00200 00201 int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, 00202 Tesseract* osd_tess, OSResults* osr); 00203 void SetupWordScripts(BLOCK_LIST* blocks); 00204 int AutoPageSeg(int resolution, bool single_column, 00205 bool osd, bool only_osd, 00206 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, 00207 Tesseract* osd_tess, OSResults* osr); 00208 00210 bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, 00211 const char* word_config, int pass); 00212 void recog_all_words(PAGE_RES* page_res, 00213 ETEXT_DESC* monitor, 00214 const TBOX* target_word_box, 00215 const char* word_config, 00216 int dopasses); 00217 void classify_word_pass1( //recog one word 00218 WERD_RES *word, //word to do 00219 ROW *row, 00220 BLOCK* block); 00221 void recog_pseudo_word(PAGE_RES* page_res, // blocks to check 00222 TBOX &selection_box); 00223 00224 void fix_rep_char(PAGE_RES_IT* page_res_it); 00225 void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it); 00226 00227 // Callback helper for fix_quotes returns a double quote if both 00228 // arguments are quote, otherwise INVALID_UNICHAR_ID. 
00229 UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); 00230 void fix_quotes(WERD_RES* word_res, 00231 BLOB_CHOICE_LIST_CLIST *blob_choices); 00232 ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s, 00233 const char *lengths); 00234 void match_word_pass2( //recog one word 00235 WERD_RES *word, //word to do 00236 ROW *row, 00237 BLOCK* block); 00238 void classify_word_pass2( //word to do 00239 WERD_RES *word, 00240 BLOCK* block, 00241 ROW *row); 00242 void ReportXhtFixResult(bool accept_new_word, float new_x_ht, 00243 WERD_RES* word, WERD_RES* new_word); 00244 bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); 00245 bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); 00246 BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res); 00247 00248 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both 00249 // arguments are hyphen, otherwise INVALID_UNICHAR_ID. 00250 UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); 00251 // Callback helper for fix_hyphens returns true if box1 and box2 overlap 00252 // (assuming both on the same textline, are in order and a chopped em dash.) 
00253 bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2); 00254 void fix_hyphens(WERD_RES* word_res, 00255 BLOB_CHOICE_LIST_CLIST *blob_choices); 00256 void set_word_fonts( 00257 WERD_RES *word, // set fonts of this word 00258 BLOB_CHOICE_LIST_CLIST *blob_choices); // detailed results 00259 void font_recognition_pass( //good chars in word 00260 PAGE_RES_IT &page_res_it); 00261 BOOL8 check_debug_pt(WERD_RES *word, int location); 00262 00264 bool init_cube_objects(bool load_combiner, 00265 TessdataManager *tessdata_manager); 00266 void run_cube(PAGE_RES *page_res); 00267 void cube_recognize(CubeObject *cube_obj, PAGE_RES_IT *page_res_it); 00268 void fill_werd_res(const BoxWord& cube_box_word, 00269 WERD_CHOICE* cube_werd_choice, 00270 const char* cube_best_str, 00271 PAGE_RES_IT *page_res_it); 00272 bool extract_cube_state(CubeObject* cube_obj, int* num_chars, 00273 Boxa** char_boxes, CharSamp*** char_samples); 00274 bool create_cube_box_word(Boxa *char_boxes, int num_chars, 00275 TBOX word_box, BoxWord* box_word); 00277 00278 void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); 00279 void write_results(PAGE_RES_IT &page_res_it, // full info 00280 char newline_type, // type of newline 00281 BOOL8 force_eol // override tilde crunch? 00282 ); 00283 void set_unlv_suspects(WERD_RES *word); 00284 UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated? 
00285 BOOL8 acceptable_number_string(const char *s, 00286 const char *lengths); 00287 inT16 count_alphanums(const WERD_CHOICE &word); 00288 inT16 count_alphas(const WERD_CHOICE &word); 00290 void read_config_file(const char *filename, bool init_only); 00291 int init_tesseract(const char *arg0, 00292 const char *textbase, 00293 const char *language, 00294 OcrEngineMode oem, 00295 char **configs, 00296 int configs_size, 00297 const GenericVector<STRING> *vars_vec, 00298 const GenericVector<STRING> *vars_values, 00299 bool set_only_init_params); 00300 int init_tesseract(const char *datapath, 00301 const char *language, 00302 OcrEngineMode oem) { 00303 return init_tesseract(datapath, NULL, language, oem, 00304 NULL, 0, NULL, NULL, false); 00305 } 00306 00307 int init_tesseract_lm(const char *arg0, 00308 const char *textbase, 00309 const char *language); 00310 00311 void recognize_page(STRING& image_name); 00312 void end_tesseract(); 00313 00314 bool init_tesseract_lang_data(const char *arg0, 00315 const char *textbase, 00316 const char *language, 00317 OcrEngineMode oem, 00318 char **configs, 00319 int configs_size, 00320 const GenericVector<STRING> *vars_vec, 00321 const GenericVector<STRING> *vars_values, 00322 bool set_only_init_params); 00323 00325 SVMenuNode *build_menu_new(); 00326 void pgeditor_main(int width, int height, PAGE_RES* page_res); 00327 void process_image_event( // action in image win 00328 const SVEvent &event); 00329 BOOL8 process_cmd_win_event( // UI command semantics 00330 inT32 cmd_event, // which menu item? 
00331 char *new_value // any prompt data 00332 ); 00333 void debug_word(PAGE_RES* page_res, const TBOX &selection_box); 00334 void do_re_display( 00335 BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block, 00336 ROW* row, 00337 WERD_RES* word_res)); 00338 BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00339 BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00340 BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00341 BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res); 00342 BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res); 00344 void make_reject_map( //make rej map for wd //detailed results 00345 WERD_RES *word, 00346 BLOB_CHOICE_LIST_CLIST *blob_choices, 00347 ROW *row, 00348 inT16 pass //1st or 2nd? 00349 ); 00350 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map); 00351 inT16 first_alphanum_index(const char *word, 00352 const char *word_lengths); 00353 inT16 first_alphanum_offset(const char *word, 00354 const char *word_lengths); 00355 inT16 alpha_count(const char *word, 00356 const char *word_lengths); 00357 BOOL8 word_contains_non_1_digit(const char *word, 00358 const char *word_lengths); 00359 void dont_allow_1Il(WERD_RES *word); 00360 inT16 count_alphanums( //how many alphanums 00361 WERD_RES *word); 00362 void flip_0O(WERD_RES *word); 00363 BOOL8 non_0_digit(UNICHAR_ID unichar_id); 00364 BOOL8 non_O_upper(UNICHAR_ID unichar_id); 00365 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row); 00366 void nn_match_word( //Match a word 00367 WERD_RES *word, 00368 ROW *row); 00369 void nn_recover_rejects(WERD_RES *word, ROW *row); 00370 BOOL8 test_ambig_word( //test for ambiguity 00371 WERD_RES *word); 00372 void set_done( //set done flag 00373 WERD_RES *word, 00374 inT16 pass); 00375 inT16 safe_dict_word(const WERD_CHOICE &word); 00376 void flip_hyphens(WERD_RES *word); 00377 void reject_I_1_L(WERD_RES *word); 00378 void reject_edge_blobs(WERD_RES *word); 
00379 void reject_mostly_rejects(WERD_RES *word); 00381 BOOL8 word_adaptable( //should we adapt? 00382 WERD_RES *word, 00383 uinT16 mode); 00384 00386 void recog_word_recursive(WERD_RES* word, 00387 BLOB_CHOICE_LIST_CLIST *blob_choices); 00388 void recog_word(WERD_RES *word, 00389 BLOB_CHOICE_LIST_CLIST *blob_choices); 00390 void split_and_recog_word(WERD_RES* word, 00391 BLOB_CHOICE_LIST_CLIST *blob_choices); 00393 BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position); 00394 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list); 00395 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block); 00396 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list); 00397 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block); 00398 void fix_fuzzy_space_list( //space explorer 00399 WERD_RES_LIST &best_perm, 00400 ROW *row, 00401 BLOCK* block); 00402 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block); 00403 void fix_fuzzy_spaces( //find fuzzy words 00404 ETEXT_DESC *monitor, //progress monitor 00405 inT32 word_count, //count of words in doc 00406 PAGE_RES *page_res); 00407 void dump_words(WERD_RES_LIST &perm, inT16 score, 00408 inT16 mode, BOOL8 improved); 00409 BOOL8 uniformly_spaced(WERD_RES *word); 00410 BOOL8 fixspace_thinks_word_done(WERD_RES *word); 00411 inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score); 00412 float blob_noise_score(TBLOB *blob); 00413 void break_noisiest_blob_word(WERD_RES_LIST &words); 00415 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word); 00416 BOOL8 potential_word_crunch(WERD_RES *word, 00417 GARBAGE_LEVEL garbage_level, 00418 BOOL8 ok_dict_word); 00419 void tilde_crunch(PAGE_RES_IT &page_res_it); 00420 void unrej_good_quality_words( //unreject potential 00421 PAGE_RES_IT &page_res_it); 00422 void doc_and_block_rejection( //reject big chunks 00423 PAGE_RES_IT &page_res_it, 00424 BOOL8 good_quality_doc); 00425 void quality_based_rejection(PAGE_RES_IT 
&page_res_it, 00426 BOOL8 good_quality_doc); 00427 void convert_bad_unlv_chs(WERD_RES *word_res); 00428 // Callback helper for merge_tess_fails returns a space if both 00429 // arguments are space, otherwise INVALID_UNICHAR_ID. 00430 UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); 00431 void merge_tess_fails(WERD_RES *word_res); 00432 void tilde_delete(PAGE_RES_IT &page_res_it); 00433 inT16 word_blob_quality(WERD_RES *word, ROW *row); 00434 void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, 00435 inT16 *accepted_match_count); 00436 void unrej_good_chs(WERD_RES *word, ROW *row); 00437 inT16 count_outline_errs(char c, inT16 outline_count); 00438 inT16 word_outline_errs(WERD_RES *word); 00439 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level); 00440 CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode); 00441 inT16 failure_count(WERD_RES *word); 00442 BOOL8 noise_outlines(TWERD *word); 00444 void 00445 process_selected_words ( 00446 PAGE_RES* page_res, // blocks to check 00447 //function to call 00448 TBOX & selection_box, 00449 BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block, 00450 ROW* row, 00451 WERD_RES* word_res)); 00453 void tess_add_doc_word( //test acceptability 00454 WERD_CHOICE *word_choice //after context 00455 ); 00456 void tess_segment_pass1(WERD_RES *word, 00457 BLOB_CHOICE_LIST_CLIST *blob_choices); 00458 void tess_segment_pass2(WERD_RES *word, 00459 BLOB_CHOICE_LIST_CLIST *blob_choices); 00460 BOOL8 tess_acceptable_word( //test acceptability 00461 WERD_CHOICE *word_choice, //after context 00462 WERD_CHOICE *raw_choice //before context 00463 ); 00465 // Applies the box file based on the image name fname, and resegments 00466 // the words in the block_list (page), with: 00467 // blob-mode: one blob per line in the box file, words as input. 00468 // word/line-mode: one blob per space-delimited unit after the #, and one word 00469 // per line in the box file. 
(See comment above for box file format.) 00470 // If find_segmentation is true, (word/line mode) then the classifier is used 00471 // to re-segment words/lines to match the space-delimited truth string for 00472 // each box. In this case, the input box may be for a word or even a whole 00473 // text line, and the output words will contain multiple blobs corresponding 00474 // to the space-delimited input string. 00475 // With find_segmentation false, no classifier is needed, but the chopper 00476 // can still be used to correctly segment touching characters with the help 00477 // of the input boxes. 00478 // In the returned PAGE_RES, the WERD_RES are setup as they would be returned 00479 // from normal classification, ie. with a word, chopped_word, rebuild_word, 00480 // seam_array, denorm, box_word, and best_state, but NO best_choice or 00481 // raw_choice, as they would require a UNICHARSET, which we aim to avoid. 00482 // Instead, the correct_text member of WERD_RES is set, and this may be later 00483 // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords 00484 // is not required before calling ApplyBoxTraining. 00485 PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation, 00486 BLOCK_LIST *block_list); 00487 00488 // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: 00489 // All fuzzy spaces are removed, and all the words are maximally chopped. 00490 PAGE_RES* SetupApplyBoxes(BLOCK_LIST *block_list); 00491 // Tests the chopper by exhaustively running chop_one_blob. 00492 // The word_res will contain filled chopped_word, seam_array, denorm, 00493 // box_word and best_state for the maximally chopped word. 00494 void MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res); 00495 // Gather consecutive blobs that match the given box into the best_state 00496 // and corresponding correct_text. 
00497 // Fights over which box owns which blobs are settled by pre-chopping and 00498 // applying the blobs to box or next_box with the least non-overlap. 00499 // Returns false if the box was in error, which can only be caused by 00500 // failing to find an appropriate blob for a box. 00501 // This means that occasionally, blobs may be incorrectly segmented if the 00502 // chopper fails to find a suitable chop point. 00503 bool ResegmentCharBox(PAGE_RES* page_res, 00504 const TBOX& box, const TBOX& next_box, 00505 const char* correct_text); 00506 // Consume all source blobs that strongly overlap the given box, 00507 // putting them into a new word, with the correct_text label. 00508 // Fights over which box owns which blobs are settled by 00509 // applying the blobs to box or next_box with the least non-overlap. 00510 // Returns false if the box was in error, which can only be caused by 00511 // failing to find an overlapping blob for a box. 00512 bool ResegmentWordBox(BLOCK_LIST *block_list, 00513 const TBOX& box, const TBOX& next_box, 00514 const char* correct_text); 00515 // Resegments the words by running the classifier in an attempt to find the 00516 // correct segmentation that produces the required string. 00517 void ReSegmentByClassification(PAGE_RES* page_res); 00518 // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID. 00519 // Returns false if an invalid UNICHAR_ID is encountered. 00520 bool ConvertStringToUnichars(const char* utf8, 00521 GenericVector<UNICHAR_ID>* class_ids); 00522 // Resegments the word to achieve the target_text from the classifier. 00523 // Returns false if the re-segmentation fails. 00524 // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and 00525 // applies a full search on the classifier results to find the best classified 00526 // segmentation. As a compromise to obtain better recall, 1-1 ambiguity 00527 // substitutions ARE used. 
00528 bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text, 00529 WERD_RES* word_res); 00530 // Recursive helper to find a match to the target_text (from text_index 00531 // position) in the choices (from choices_pos position). 00532 // Choices is an array of GenericVectors, of length choices_length, with each 00533 // element representing a starting position in the word, and the 00534 // GenericVector holding classification results for a sequence of consecutive 00535 // blobs, with index 0 being a single blob, index 1 being 2 blobs etc. 00536 void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices, 00537 int choices_pos, int choices_length, 00538 const GenericVector<UNICHAR_ID>& target_text, 00539 int text_index, 00540 float rating, GenericVector<int>* segmentation, 00541 float* best_rating, GenericVector<int>* best_segmentation); 00542 // Counts up the labelled words and the blobs within. 00543 // Deletes all unused or emptied words, counting the unused ones. 00544 // Resets W_BOL and W_EOL flags correctly. 00545 // Builds the rebuild_word and rebuilds the box_word. 00546 void TidyUp(PAGE_RES* page_res); 00547 // Logs a bad box by line in the box file and box coords. 00548 void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, 00549 const char *err_msg); 00550 // Creates a fake best_choice entry in each WERD_RES with the correct text. 00551 void CorrectClassifyWords(PAGE_RES* page_res); 00552 // Call LearnWord to extract features for labelled blobs within each word. 00553 // Features are written to the given filename. 00554 void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res); 00555 00557 // Returns the number of misfit blob tops in this word. 00558 int CountMisfitTops(WERD_RES *word_res); 00559 // Returns a new x-height in pixels (original image coords) that is 00560 // maximally compatible with the result in word_res. 
00561 // Returns 0.0f if no x-height is found that is better than the current 00562 // estimate. 00563 float ComputeCompatibleXheight(WERD_RES *word_res); 00565 // TODO(ocr-team): Remove obsolete parameters. 00566 BOOL_VAR_H(tessedit_resegment_from_boxes, false, 00567 "Take segmentation and labeling from box file"); 00568 BOOL_VAR_H(tessedit_resegment_from_line_boxes, false, 00569 "Conversion of word/line box file to char box file"); 00570 BOOL_VAR_H(tessedit_train_from_boxes, false, 00571 "Generate training data from boxed chars"); 00572 BOOL_VAR_H(tessedit_make_boxes_from_boxes, false, 00573 "Generate more boxes from boxed chars"); 00574 BOOL_VAR_H(tessedit_dump_pageseg_images, false, 00575 "Dump intermediate images made during page segmentation"); 00576 INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, 00577 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 00578 " 5=line, 6=word, 7=char" 00579 " (Values from PageSegMode enum in publictypes.h)"); 00580 INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, 00581 "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" 00582 " to loading and running only Tesseract (no Cube, no combiner)." 
00583 " (Values from OcrEngineMode enum in tesseractclass.h)"); 00584 STRING_VAR_H(tessedit_char_blacklist, "", 00585 "Blacklist of chars not to recognize"); 00586 STRING_VAR_H(tessedit_char_whitelist, "", 00587 "Whitelist of chars to recognize"); 00588 BOOL_VAR_H(tessedit_ambigs_training, false, 00589 "Perform training for ambiguities"); 00590 INT_VAR_H(pageseg_devanagari_split_strategy, 00591 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00592 "Whether to use the top-line splitting process for Devanagari " 00593 "documents while performing page-segmentation."); 00594 INT_VAR_H(ocr_devanagari_split_strategy, 00595 tesseract::ShiroRekhaSplitter::NO_SPLIT, 00596 "Whether to use the top-line splitting process for Devanagari " 00597 "documents while performing ocr."); 00598 STRING_VAR_H(tessedit_write_params_to_file, "", 00599 "Write all parameters to the given file."); 00600 BOOL_VAR_H(tessedit_adapt_to_char_fragments, true, 00601 "Adapt to words that contain " 00602 " a character composed form fragments"); 00603 BOOL_VAR_H(tessedit_adaption_debug, false, 00604 "Generate and print debug information for adaption"); 00605 INT_VAR_H(applybox_debug, 1, "Debug level"); 00606 INT_VAR_H(applybox_page, 0, "Page number to apply boxes from"); 00607 STRING_VAR_H(applybox_exposure_pattern, ".exp", 00608 "Exposure value follows this pattern in the image" 00609 " filename. The name of the image files are expected" 00610 " to be in the form [lang].[fontname].exp[num].tif"); 00611 BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false, 00612 "Learn both character fragments (as is done in the" 00613 " special low exposure mode) as well as unfragmented" 00614 " characters."); 00615 BOOL_VAR_H(applybox_learn_ngrams_mode, false, 00616 "Each bounding box is assumed to contain ngrams. 
Only" 00617 " learn the ngrams whose outlines overlap horizontally."); 00618 BOOL_VAR_H(tessedit_draw_outwords, false, "Draw output words"); 00619 BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs"); 00620 BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices"); 00621 BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, 00622 "Try to improve fuzzy spaces"); 00623 BOOL_VAR_H(tessedit_unrej_any_wd, false, 00624 "Dont bother with word plausibility"); 00625 BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?"); 00626 BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height"); 00627 BOOL_VAR_H(tessedit_enable_doc_dict, true, 00628 "Add words to the document dictionary"); 00629 BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char"); 00630 BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats"); 00631 INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); 00632 BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); 00633 STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); 00634 STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation"); 00635 STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation"); 00636 double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit"); 00637 double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit"); 00638 double_VAR_H(quality_outline_pc, 1.0, 00639 "good_quality_doc lte outline error limit"); 00640 double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit"); 00641 INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word"); 00642 BOOL_VAR_H(tessedit_tess_adapt_to_rejmap, false, 00643 "Use reject map to control Tesseract adaption"); 00644 INT_VAR_H(tessedit_tess_adaption_mode, 0x27, 00645 "Adaptation decision algorithm for tess"); 00646 BOOL_VAR_H(tessedit_minimal_rej_pass1, false, 00647 "Do minimal rejection on pass 1 output"); 00648 BOOL_VAR_H(tessedit_test_adaption, 
false, "Test adaption criteria"); 00649 BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity"); 00650 INT_VAR_H(tessedit_test_adaption_mode, 3, 00651 "Adaptation decision algorithm for tess"); 00652 BOOL_VAR_H(save_best_choices, false, 00653 "Save the results of the recognition step" 00654 " (blob_choices) within the corresponding WERD_CHOICE"); 00655 BOOL_VAR_H(test_pt, false, "Test for point"); 00656 double_VAR_H(test_pt_x, 99999.99, "xcoord"); 00657 double_VAR_H(test_pt_y, 99999.99, "ycoord"); 00658 INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); 00659 STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); 00660 STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); 00661 BOOL_VAR_H(docqual_excuse_outline_errs, false, 00662 "Allow outline errs in unrejection?"); 00663 BOOL_VAR_H(tessedit_good_quality_unrej, true, 00664 "Reduce rejection on good docs"); 00665 BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?"); 00666 double_VAR_H(tessedit_reject_doc_percent, 65.00, 00667 "%rej allowed before rej whole doc"); 00668 double_VAR_H(tessedit_reject_block_percent, 45.00, 00669 "%rej allowed before rej whole block"); 00670 double_VAR_H(tessedit_reject_row_percent, 40.00, 00671 "%rej allowed before rej whole row"); 00672 double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00, 00673 "Number of row rejects in whole word rejects" 00674 "which prevents whole row rejection"); 00675 BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true, 00676 "Only rej partially rejected words in block rejection"); 00677 BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true, 00678 "Only rej partially rejected words in row rejection"); 00679 BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false, 00680 "Use word segmentation quality metric"); 00681 BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false, 00682 "Use word segmentation quality metric"); 00683 INT_VAR_H(tessedit_preserve_min_wd_len, 2, 00684 "Only preserve wds longer than this"); 
00685 BOOL_VAR_H(tessedit_row_rej_good_docs, true, 00686 "Apply row rejection to good docs"); 00687 double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1, 00688 "rej good doc wd if more than this fraction rejected"); 00689 BOOL_VAR_H(tessedit_reject_bad_qual_wds, true, 00690 "Reject all bad quality wds"); 00691 BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats"); 00692 BOOL_VAR_H(tessedit_debug_quality_metrics, false, 00693 "Output data to debug file"); 00694 BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs"); 00695 double_VAR_H(quality_rowrej_pc, 1.1, 00696 "good_quality_doc gte good char limit"); 00697 BOOL_VAR_H(unlv_tilde_crunching, true, 00698 "Mark v.bad words for tilde crunch"); 00699 BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); 00700 BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); 00701 double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this"); 00702 BOOL_VAR_H(crunch_terrible_garbage, true, "As it says"); 00703 double_VAR_H(crunch_poor_garbage_cert, -9.0, 00704 "crunch garbage cert lt this"); 00705 double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this"); 00706 double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this"); 00707 double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this"); 00708 BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage"); 00709 double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this"); 00710 double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); 00711 double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); 00712 double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); 00713 double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this"); 00714 double_VAR_H(crunch_del_high_word, 1.5, 00715 "Del if word gt xht x this above bl"); 00716 double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl"); 00717 
double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this"); 00718 INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch"); 00719 INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed"); 00720 BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings"); 00721 BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring"); 00722 BOOL_VAR_H(crunch_leave_accept_strings, false, 00723 "Dont pot crunch sensible strings"); 00724 BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures"); 00725 INT_VAR_H(crunch_leave_lc_strings, 4, 00726 "Dont crunch words with long lower case strings"); 00727 INT_VAR_H(crunch_leave_uc_strings, 4, 00728 "Dont crunch words with long lower case strings"); 00729 INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions"); 00730 INT_VAR_H(crunch_debug, 0, "As it says"); 00731 INT_VAR_H(fixsp_non_noise_limit, 1, 00732 "How many non-noise blbs either side?"); 00733 double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this"); 00734 BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctation joins"); 00735 INT_VAR_H(fixsp_done_mode, 1, "What constitues done for spacing"); 00736 INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug"); 00737 STRING_VAR_H(numeric_punctuation, ".,", 00738 "Punct. 
chs expected WITHIN numbers"); 00739 INT_VAR_H(x_ht_acceptance_tolerance, 8, 00740 "Max allowed deviation of blob top outside of font data"); 00741 INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it"); 00742 BOOL_VAR_H(tessedit_write_block_separators, false, 00743 "Write block separators in output"); 00744 BOOL_VAR_H(tessedit_write_rep_codes, false, 00745 "Write repetition char code"); 00746 BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file"); 00747 BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); 00748 STRING_VAR_H(unrecognised_char, "|", 00749 "Output char for unidentified blobs"); 00750 INT_VAR_H(suspect_level, 99, "Suspect marker level"); 00751 INT_VAR_H(suspect_space_level, 100, 00752 "Min suspect level for rejecting spaces"); 00753 INT_VAR_H(suspect_short_words, 2, 00754 "Dont Suspect dict wds longer than this"); 00755 BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); 00756 double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit"); 00757 double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); 00758 BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures"); 00759 BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING"); 00760 BOOL_VAR_H(tessedit_word_for_word, false, 00761 "Make output have exactly one word per WERD"); 00762 BOOL_VAR_H(tessedit_zero_kelvin_rejection, false, 00763 "Dont reject ANYTHING AT ALL"); 00764 BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same"); 00765 INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm"); 00766 INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm"); 00767 BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug"); 00768 BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips"); 00769 double_VAR_H(tessedit_lower_flip_hyphen, 1.5, 00770 "Aspect ratio dot/hyphen test"); 00771 double_VAR_H(tessedit_upper_flip_hyphen, 1.8, 00772 "Aspect 
ratio dot/hyphen test"); 00773 BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector"); 00774 BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test"); 00775 BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check"); 00776 BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control"); 00777 BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control"); 00778 BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control"); 00779 BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check"); 00780 BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check"); 00781 double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract"); 00782 INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit"); 00783 STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075", 00784 "Allow NN to unrej"); 00785 STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set"); 00786 INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this"); 00787 BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes"); 00788 INT_VAR_H(tessedit_page_number, -1, 00789 "-1 -> All pages, else specifc page to process"); 00790 BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); 00791 BOOL_VAR_H(interactive_mode, false, "Run interactively?"); 00792 STRING_VAR_H(file_type, ".tif", "Filename extension"); 00793 BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); 00794 INT_VAR_H(tessdata_manager_debug_level, 0, 00795 "Debug level for TessdataManager functions."); 00796 // Min acceptable orientation margin (difference in scores between top and 2nd 00797 // choice in OSResults::orientations) to believe the page orientation. 
00798 double_VAR_H(min_orientation_margin, 12.0, 00799 "Min acceptable orientation margin"); 00800 00802 FILE *init_recog_training(const STRING &fname); 00803 void recog_training_segmented(const STRING &fname, 00804 PAGE_RES *page_res, 00805 volatile ETEXT_DESC *monitor, 00806 FILE *output_file); 00807 void ambigs_classify_and_output(WERD_RES *werd_res, 00808 ROW_RES *row_res, 00809 BLOCK_RES *block_res, 00810 const char *label, 00811 FILE *output_file); 00812 00813 inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } 00814 00815 private: 00816 // The filename of a backup config file. If not null, then we currently 00817 // have a temporary debug config file loaded, and backup_config_file_ 00818 // will be loaded, and set to null when debug is complete. 00819 const char* backup_config_file_; 00820 // The filename of a config file to read when processing a debug word. 00821 STRING word_config_; 00822 Pix* pix_binary_; 00823 Pix* pix_grey_; 00824 // The shiro-rekha splitter object which is used to split top-lines in 00825 // Devanagari words to provide a better word and grapheme segmentation. 00826 ShiroRekhaSplitter splitter_; 00827 // The boolean records if the currently set 00828 // pix_binary_ member has been modified due to any processing so that this 00829 // may hurt Cube's recognition phase. 00830 bool orig_image_changed_; 00831 // Page segmentation/layout 00832 Textord textord_; 00833 // True if the primary language uses right_to_left reading order. 00834 bool right_to_left_; 00835 FCOORD deskew_; 00836 FCOORD reskew_; 00837 TesseractStats stats_; 00838 // Cube objects. 00839 CubeRecoContext* cube_cntxt_; 00840 TesseractCubeCombiner *tess_cube_combiner_; 00841 }; 00842 00843 } // namespace tesseract 00844 00845 00846 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__