Tesseract 3.01
/data/source/tesseract-ocr/ccmain/tesseractclass.h
Go to the documentation of this file.
00001 
00002 // File:        tesseractclass.h
00003 // Description: An instance of Tesseract. For thread safety, *every*
00004 //              global variable goes in here, directly, or indirectly.
00005 // Author:      Ray Smith
00006 // Created:     Fri Mar 07 08:17:01 PST 2008
00007 //
00008 // (C) Copyright 2008, Google Inc.
00009 // Licensed under the Apache License, Version 2.0 (the "License");
00010 // you may not use this file except in compliance with the License.
00011 // You may obtain a copy of the License at
00012 // http://www.apache.org/licenses/LICENSE-2.0
00013 // Unless required by applicable law or agreed to in writing, software
00014 // distributed under the License is distributed on an "AS IS" BASIS,
00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00016 // See the License for the specific language governing permissions and
00017 // limitations under the License.
00018 //
00020 
00021 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
00022 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
00023 
00024 #include "allheaders.h"
00025 #include "genericvector.h"
00026 #include "params.h"
00027 #include "wordrec.h"
00028 #include "ocrclass.h"
00029 #include "control.h"
00030 #include "docqual.h"
00031 #include "devanagari_processing.h"
00032 #include "textord.h"
00033 
00034 class PAGE_RES;
00035 class PAGE_RES_IT;
00036 class BLOCK_LIST;
00037 class CharSamp;
00038 class TO_BLOCK_LIST;
00039 class IMAGE;
00040 class WERD_RES;
00041 class ROW;
00042 class TBOX;
00043 class SVMenuNode;
00044 struct Pix;
00045 class WERD_CHOICE;
00046 class WERD;
00047 class BLOB_CHOICE_LIST_CLIST;
00048 struct OSResults;
00049 
00050 
00051 // Top-level class for all tesseract global instance data.
00052 // This class either holds or points to all data used by an instance
00053 // of Tesseract, including the memory allocator. When this is
00054 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
00055 //
00056 // NOTE to developers: Do not create cyclic dependencies through this class!
00057 // The directory dependency tree must remain a tree! To keep this clean,
00058 // lower-level code (eg in ccutil, the bottom level) must never need to
00059 // know about the content of a higher-level directory.
00060 // The following scheme will grant the easiest access to lower-level
00061 // global members without creating a cyclic dependency:
00062 //
00063 // Class Hierarchy (^ = inheritance):
00064 //
00065 //             CCUtil (ccutil/ccutil.h)
00066 //                         ^      Members include: UNICHARSET
00067 //            CUtil (cutil/cutil_class.h)
00068 //                         ^       Members include: TBLOB*, TEXTBLOCK*
00069 //           CCStruct (ccstruct/ccstruct.h)
00070 //                         ^       Members include: Image
00071 //           Classify (classify/classify.h)
00072 //                         ^       Members include: Dict
00073 //             WordRec (wordrec/wordrec.h)
00074 //                         ^       Members include: WERD*, DENORM*
00075 //        Tesseract (ccmain/tesseractclass.h)
00076 //                                 Members include: Pix*, CubeRecoContext*,
00077 //                                 TesseractCubeCombiner*
00078 //
00079 // Other important classes:
00080 //
00081 //  TessBaseAPI (api/baseapi.h)
00082 //                                 Members include: BLOCK_LIST*, PAGE_RES*,
00083 //                                 Tesseract*, ImageThresholder*
00084 //  Dict (dict/dict.h)
00085 //                                 Members include: Image* (private)
00086 //
00087 // NOTE: each level contains members that correspond to global
00088 // data that is defined (and used) at that level, not necessarily where
00089 // the type is defined, so for instance:
00090 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
00091 // goes inside the Textord class, not the cc_util class.
00092 
00093 namespace tesseract {
00094 
00095 class CubeLineObject;
00096 class CubeObject;
00097 class CubeRecoContext;
00098 class TesseractCubeCombiner;
00099 
00100 // A collection of counters and flags used for statistics and debugging.
00101 struct TesseractStats {
00102   TesseractStats()  // All counters start at 0; flag defaults are given below.
00103     : adaption_word_number(0),
00104       doc_blob_quality(0),
00105       doc_outline_errs(0),
00106       doc_char_quality(0),
00107       good_char_count(0),
00108       doc_good_char_quality(0),
00109       word_count(0),
00110       dict_words(0),
00111       tilde_crunch_written(false),
00112       last_char_was_newline(true),  // one of two flags that default to true
00113       last_char_was_tilde(false),
00114       write_results_empty_block(true) {}  // the other flag defaulting to true
00115 
00116   inT32 adaption_word_number;
00117   inT16 doc_blob_quality;  // NOTE(review): doc_* look like per-document totals — confirm
00118   inT16 doc_outline_errs;
00119   inT16 doc_char_quality;
00120   inT16 good_char_count;
00121   inT16 doc_good_char_quality;
00122   inT32 word_count;  // count of words in the document
00123   inT32 dict_words;  // number of dictionary words in the document
00124   STRING dump_words_str;  // accumulator used by dump_words(); default-constructed
00125   // Flags used by write_results(); initial values are set in the constructor.
00126   bool tilde_crunch_written;
00127   bool last_char_was_newline;
00128   bool last_char_was_tilde;
00129   bool write_results_empty_block;
00130 };
00131 
00132 class Tesseract : public Wordrec {
00133  public:
00134   Tesseract();
00135   ~Tesseract();
00136 
00137   void Clear();
00138 
00139   // Simple accessors.
00140   const FCOORD& reskew() const {  // Read-only reference to reskew_.
00141     return reskew_;
00142   }
00143   // Destroy any existing pix and return a pointer to the pointer.
00144   Pix** mutable_pix_binary() {
00145     Clear();  // Note: invoked on every call, before the address is returned.
00146     return &pix_binary_;  // Address of the member itself, so the caller can set it.
00147   }
00148   Pix* pix_binary() const {  // Returns the stored pix_binary_ pointer (no copy).
00149     return pix_binary_;
00150   }
00151   Pix* pix_grey() const {  // Returns the stored pix_grey_ pointer (no copy).
00152     return pix_grey_;
00153   }
00154   void set_pix_grey(Pix* grey_pix) {  // Stores grey_pix in pix_grey_.
00155     pix_grey_ = grey_pix;  // Plain overwrite; NOTE(review): old value is not released here — confirm ownership.
00156   }
00157   int ImageWidth() const {  // Width of pix_binary_ as reported by pixGetWidth().
00158     return pixGetWidth(pix_binary_);  // pix_binary_ is passed through without a null check here.
00159   }
00160   int ImageHeight() const {  // Height of pix_binary_ as reported by pixGetHeight().
00161     return pixGetHeight(pix_binary_);  // pix_binary_ is passed through without a null check here.
00162   }
00163 
00164   const ShiroRekhaSplitter& splitter() const {  // Read-only reference to splitter_.
00165     return splitter_;
00166   }
00167   ShiroRekhaSplitter* mutable_splitter() {  // Writable pointer to the splitter_ member.
00168     return &splitter_;
00169   }
00170   const Textord& textord() const {  // Read-only reference to textord_.
00171     return textord_;
00172   }
00173   Textord* mutable_textord() {  // Writable pointer to the textord_ member.
00174     return &textord_;
00175   }
00176 
00177   bool right_to_left() const {  // Returns the right_to_left_ flag.
00178     return right_to_left_;
00179   }
00180 
00181   void SetBlackAndWhitelist();
00182 
00183   // Perform steps to prepare underlying binary image/other data structures for
00184   // page segmentation. Uses the strategy specified in the global variable
00185   // pageseg_devanagari_split_strategy to perform splitting while preparing for
00186   // page segmentation.
00187   void PrepareForPageseg();
00188 
00189   // Perform steps to prepare underlying binary image/other data structures for
00190   // Tesseract OCR. The current segmentation is required by this method.
00191   // Uses the strategy specified in the global variable
00192   // ocr_devanagari_split_strategy for performing splitting while preparing for
00193   // Tesseract ocr.
00194   void PrepareForTessOCR(BLOCK_LIST* block_list,
00195                          Tesseract* osd_tess, OSResults* osr);
00196 
00197   // Perform steps to prepare underlying binary image/other data structures for
00198   // Cube OCR.
00199   void PrepareForCubeOCR();
00200 
00201   int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
00202                   Tesseract* osd_tess, OSResults* osr);
00203   void SetupWordScripts(BLOCK_LIST* blocks);
00204   int AutoPageSeg(int resolution, bool single_column,
00205                   bool osd, bool only_osd,
00206                   BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
00207                   Tesseract* osd_tess, OSResults* osr);
00208 
00210   bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
00211                          const char* word_config, int pass);
00212   void recog_all_words(PAGE_RES* page_res,
00213                        ETEXT_DESC* monitor,
00214                        const TBOX* target_word_box,
00215                        const char* word_config,
00216                        int dopasses);
00217   void classify_word_pass1(                 //recog one word
00218                            WERD_RES *word,  //word to do
00219                            ROW *row,
00220                            BLOCK* block);
00221   void recog_pseudo_word(PAGE_RES* page_res,  // blocks to check
00222                          TBOX &selection_box);
00223 
00224   void fix_rep_char(PAGE_RES_IT* page_res_it);
00225   void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);
00226 
00227   // Callback helper for fix_quotes returns a double quote if both
00228   // arguments are quote, otherwise INVALID_UNICHAR_ID.
00229   UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
00230   void fix_quotes(WERD_RES* word_res,
00231                   BLOB_CHOICE_LIST_CLIST *blob_choices);
00232   ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
00233                                               const char *lengths);
00234   void match_word_pass2(                 //recog one word
00235                         WERD_RES *word,  //word to do
00236                         ROW *row,
00237                         BLOCK* block);
00238   void classify_word_pass2(  //word to do
00239                            WERD_RES *word,
00240                            BLOCK* block,
00241                            ROW *row);
00242   void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
00243                           WERD_RES* word, WERD_RES* new_word);
00244   bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
00245   bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
00246   BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
00247 
00248   // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
00249   // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
00250   UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
00251   // Callback helper for fix_hyphens returns true if box1 and box2 overlap
00252   // (assuming both on the same textline, are in order and a chopped em dash.)
00253   bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
00254   void fix_hyphens(WERD_RES* word_res,
00255                    BLOB_CHOICE_LIST_CLIST *blob_choices);
00256   void set_word_fonts(
00257       WERD_RES *word,  // set fonts of this word
00258       BLOB_CHOICE_LIST_CLIST *blob_choices);  // detailed results
00259   void font_recognition_pass(  //good chars in word
00260                              PAGE_RES_IT &page_res_it);
00261   BOOL8 check_debug_pt(WERD_RES *word, int location);
00262 
00264   bool init_cube_objects(bool load_combiner,
00265                          TessdataManager *tessdata_manager);
00266   void run_cube(PAGE_RES *page_res);
00267   void cube_recognize(CubeObject *cube_obj, PAGE_RES_IT *page_res_it);
00268   void fill_werd_res(const BoxWord& cube_box_word,
00269                      WERD_CHOICE* cube_werd_choice,
00270                      const char* cube_best_str,
00271                      PAGE_RES_IT *page_res_it);
00272   bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
00273                           Boxa** char_boxes, CharSamp*** char_samples);
00274   bool create_cube_box_word(Boxa *char_boxes, int num_chars,
00275                             TBOX word_box, BoxWord* box_word);
00277 
00278   void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
00279   void write_results(PAGE_RES_IT &page_res_it,  // full info
00280                      char newline_type,         // type of newline
00281                      BOOL8 force_eol            // override tilde crunch?
00282                     );
00283   void set_unlv_suspects(WERD_RES *word);
00284   UNICHAR_ID get_rep_char(WERD_RES *word);  // what char is repeated?
00285   BOOL8 acceptable_number_string(const char *s,
00286                                  const char *lengths);
00287   inT16 count_alphanums(const WERD_CHOICE &word);
00288   inT16 count_alphas(const WERD_CHOICE &word);
00290   void read_config_file(const char *filename, bool init_only);
00291   int init_tesseract(const char *arg0,
00292                      const char *textbase,
00293                      const char *language,
00294                      OcrEngineMode oem,
00295                      char **configs,
00296                      int configs_size,
00297                      const GenericVector<STRING> *vars_vec,
00298                      const GenericVector<STRING> *vars_values,
00299                      bool set_only_init_params);
00300   int init_tesseract(const char *datapath,  // Convenience overload of the 9-argument form.
00301                      const char *language,
00302                      OcrEngineMode oem) {
00303     return init_tesseract(datapath, NULL, language, oem,  // datapath is passed as arg0; textbase is NULL
00304                           NULL, 0, NULL, NULL, false);  // no configs, no vars; set_only_init_params = false
00305   }
00306 
00307   int init_tesseract_lm(const char *arg0,
00308                         const char *textbase,
00309                         const char *language);
00310 
00311   void recognize_page(STRING& image_name);
00312   void end_tesseract();
00313 
00314   bool init_tesseract_lang_data(const char *arg0,
00315                                 const char *textbase,
00316                                 const char *language,
00317                                 OcrEngineMode oem,
00318                                 char **configs,
00319                                 int configs_size,
00320                                 const GenericVector<STRING> *vars_vec,
00321                                 const GenericVector<STRING> *vars_values,
00322                                 bool set_only_init_params);
00323 
00325   SVMenuNode *build_menu_new();
00326   void pgeditor_main(int width, int height, PAGE_RES* page_res);
00327   void process_image_event( // action in image win
00328                            const SVEvent &event);
00329   BOOL8 process_cmd_win_event(                 // UI command semantics
00330                               inT32 cmd_event,  // which menu item?
00331                               char *new_value   // any prompt data
00332                              );
00333   void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
00334   void do_re_display(
00335       BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
00336                                                   ROW* row,
00337                                                   WERD_RES* word_res));
00338   BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00339   BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00340   BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00341   BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
00342   BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
00344   void make_reject_map(            //make rej map for wd //detailed results
00345                        WERD_RES *word,
00346                        BLOB_CHOICE_LIST_CLIST *blob_choices,
00347                        ROW *row,
00348                        inT16 pass  //1st or 2nd?
00349                       );
00350   BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
00351   inT16 first_alphanum_index(const char *word,
00352                              const char *word_lengths);
00353   inT16 first_alphanum_offset(const char *word,
00354                               const char *word_lengths);
00355   inT16 alpha_count(const char *word,
00356                     const char *word_lengths);
00357   BOOL8 word_contains_non_1_digit(const char *word,
00358                                   const char *word_lengths);
00359   void dont_allow_1Il(WERD_RES *word);
00360   inT16 count_alphanums(  //how many alphanums
00361                         WERD_RES *word);
00362   void flip_0O(WERD_RES *word);
00363   BOOL8 non_0_digit(UNICHAR_ID unichar_id);
00364   BOOL8 non_O_upper(UNICHAR_ID unichar_id);
00365   BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
00366   void nn_match_word(  //Match a word
00367                      WERD_RES *word,
00368                      ROW *row);
00369   void nn_recover_rejects(WERD_RES *word, ROW *row);
00370   BOOL8 test_ambig_word(  //test for ambiguity
00371                         WERD_RES *word);
00372   void set_done(  //set done flag
00373                 WERD_RES *word,
00374                 inT16 pass);
00375   inT16 safe_dict_word(const WERD_CHOICE  &word);
00376   void flip_hyphens(WERD_RES *word);
00377   void reject_I_1_L(WERD_RES *word);
00378   void reject_edge_blobs(WERD_RES *word);
00379   void reject_mostly_rejects(WERD_RES *word);
00381   BOOL8 word_adaptable(  //should we adapt?
00382                        WERD_RES *word,
00383                        uinT16 mode);
00384 
00386   void recog_word_recursive(WERD_RES* word,
00387                             BLOB_CHOICE_LIST_CLIST *blob_choices);
00388   void recog_word(WERD_RES *word,
00389                   BLOB_CHOICE_LIST_CLIST *blob_choices);
00390   void split_and_recog_word(WERD_RES* word,
00391                             BLOB_CHOICE_LIST_CLIST *blob_choices);
00393   BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
00394   inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
00395   void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
00396   inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
00397   void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
00398   void fix_fuzzy_space_list(  //space explorer
00399                             WERD_RES_LIST &best_perm,
00400                             ROW *row,
00401                             BLOCK* block);
00402   void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
00403   void fix_fuzzy_spaces(                      //find fuzzy words
00404                         ETEXT_DESC *monitor,  //progress monitor
00405                         inT32 word_count,     //count of words in doc
00406                         PAGE_RES *page_res);
00407   void dump_words(WERD_RES_LIST &perm, inT16 score,
00408                   inT16 mode, BOOL8 improved);
00409   BOOL8 uniformly_spaced(WERD_RES *word);
00410   BOOL8 fixspace_thinks_word_done(WERD_RES *word);
00411   inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
00412   float blob_noise_score(TBLOB *blob);
00413   void break_noisiest_blob_word(WERD_RES_LIST &words);
00415   GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
00416   BOOL8 potential_word_crunch(WERD_RES *word,
00417                               GARBAGE_LEVEL garbage_level,
00418                               BOOL8 ok_dict_word);
00419   void tilde_crunch(PAGE_RES_IT &page_res_it);
00420   void unrej_good_quality_words(  //unreject potential
00421                                 PAGE_RES_IT &page_res_it);
00422   void doc_and_block_rejection(  //reject big chunks
00423                                PAGE_RES_IT &page_res_it,
00424                                BOOL8 good_quality_doc);
00425   void quality_based_rejection(PAGE_RES_IT &page_res_it,
00426                                BOOL8 good_quality_doc);
00427   void convert_bad_unlv_chs(WERD_RES *word_res);
00428   // Callback helper for merge_tess_fails returns a space if both
00429   // arguments are space, otherwise INVALID_UNICHAR_ID.
00430   UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
00431   void merge_tess_fails(WERD_RES *word_res);
00432   void tilde_delete(PAGE_RES_IT &page_res_it);
00433   inT16 word_blob_quality(WERD_RES *word, ROW *row);
00434   void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
00435                          inT16 *accepted_match_count);
00436   void unrej_good_chs(WERD_RES *word, ROW *row);
00437   inT16 count_outline_errs(char c, inT16 outline_count);
00438   inT16 word_outline_errs(WERD_RES *word);
00439   BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
00440   CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
00441   inT16 failure_count(WERD_RES *word);
00442   BOOL8 noise_outlines(TWERD *word);
00444   void
00445   process_selected_words (
00446       PAGE_RES* page_res, // blocks to check
00447       //function to call
00448       TBOX & selection_box,
00449       BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block,
00450                                                      ROW* row,
00451                                                      WERD_RES* word_res));
00453   void tess_add_doc_word(                          //test acceptability
00454                          WERD_CHOICE *word_choice  //after context
00455                         );
00456   void tess_segment_pass1(WERD_RES *word,
00457                           BLOB_CHOICE_LIST_CLIST *blob_choices);
00458   void tess_segment_pass2(WERD_RES *word,
00459                           BLOB_CHOICE_LIST_CLIST *blob_choices);
00460   BOOL8 tess_acceptable_word(                           //test acceptability
00461                              WERD_CHOICE *word_choice,  //after context
00462                              WERD_CHOICE *raw_choice    //before context
00463                             );
00465   // Applies the box file based on the image name fname, and resegments
00466   // the words in the block_list (page), with:
00467   // blob-mode: one blob per line in the box file, words as input.
00468   // word/line-mode: one blob per space-delimited unit after the #, and one word
00469   // per line in the box file. (See comment above for box file format.)
00470   // If find_segmentation is true, (word/line mode) then the classifier is used
00471   // to re-segment words/lines to match the space-delimited truth string for
00472   // each box. In this case, the input box may be for a word or even a whole
00473   // text line, and the output words will contain multiple blobs corresponding
00474   // to the space-delimited input string.
00475   // With find_segmentation false, no classifier is needed, but the chopper
00476   // can still be used to correctly segment touching characters with the help
00477   // of the input boxes.
00478   // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
00479   // from normal classification, ie. with a word, chopped_word, rebuild_word,
00480   // seam_array, denorm, box_word, and best_state, but NO best_choice or
00481   // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
00482   // Instead, the correct_text member of WERD_RES is set, and this may be later
00483   // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
00484   // is not required before calling ApplyBoxTraining.
00485   PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
00486                        BLOCK_LIST *block_list);
00487 
00488   // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
00489   // All fuzzy spaces are removed, and all the words are maximally chopped.
00490   PAGE_RES* SetupApplyBoxes(BLOCK_LIST *block_list);
00491   // Tests the chopper by exhaustively running chop_one_blob.
00492   // The word_res will contain filled chopped_word, seam_array, denorm,
00493   // box_word and best_state for the maximally chopped word.
00494   void MaximallyChopWord(BLOCK* block, ROW* row, WERD_RES* word_res);
00495   // Gather consecutive blobs that match the given box into the best_state
00496   // and corresponding correct_text.
00497   // Fights over which box owns which blobs are settled by pre-chopping and
00498   // applying the blobs to box or next_box with the least non-overlap.
00499   // Returns false if the box was in error, which can only be caused by
00500   // failing to find an appropriate blob for a box.
00501   // This means that occasionally, blobs may be incorrectly segmented if the
00502   // chopper fails to find a suitable chop point.
00503   bool ResegmentCharBox(PAGE_RES* page_res,
00504                         const TBOX& box, const TBOX& next_box,
00505                         const char* correct_text);
00506   // Consume all source blobs that strongly overlap the given box,
00507   // putting them into a new word, with the correct_text label.
00508   // Fights over which box owns which blobs are settled by
00509   // applying the blobs to box or next_box with the least non-overlap.
00510   // Returns false if the box was in error, which can only be caused by
00511   // failing to find an overlapping blob for a box.
00512   bool ResegmentWordBox(BLOCK_LIST *block_list,
00513                         const TBOX& box, const TBOX& next_box,
00514                         const char* correct_text);
00515   // Resegments the words by running the classifier in an attempt to find the
00516   // correct segmentation that produces the required string.
00517   void ReSegmentByClassification(PAGE_RES* page_res);
00518   // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
00519   // Returns false if an invalid UNICHAR_ID is encountered.
00520   bool ConvertStringToUnichars(const char* utf8,
00521                                GenericVector<UNICHAR_ID>* class_ids);
00522   // Resegments the word to achieve the target_text from the classifier.
00523   // Returns false if the re-segmentation fails.
00524   // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
00525   // applies a full search on the classifier results to find the best classified
00526   // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
00527   // substitutions ARE used.
00528   bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
00529                         WERD_RES* word_res);
00530   // Recursive helper to find a match to the target_text (from text_index
00531   // position) in the choices (from choices_pos position).
00532   // Choices is an array of GenericVectors, of length choices_length, with each
00533   // element representing a starting position in the word, and the
00534   // GenericVector holding classification results for a sequence of consecutive
00535   // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
00536   void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
00537                      int choices_pos, int choices_length,
00538                      const GenericVector<UNICHAR_ID>& target_text,
00539                      int text_index,
00540                      float rating, GenericVector<int>* segmentation,
00541                      float* best_rating, GenericVector<int>* best_segmentation);
00542   // Counts up the labelled words and the blobs within.
00543   // Deletes all unused or emptied words, counting the unused ones.
00544   // Resets W_BOL and W_EOL flags correctly.
00545   // Builds the rebuild_word and rebuilds the box_word.
00546   void TidyUp(PAGE_RES* page_res);
00547   // Logs a bad box by line in the box file and box coords.
00548   void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
00549                        const char *err_msg);
00550   // Creates a fake best_choice entry in each WERD_RES with the correct text.
00551   void CorrectClassifyWords(PAGE_RES* page_res);
00552   // Call LearnWord to extract features for labelled blobs within each word.
00553   // Features are written to the given filename.
00554   void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res);
00555 
00557   // Returns the number of misfit blob tops in this word.
00558   int CountMisfitTops(WERD_RES *word_res);
00559   // Returns a new x-height in pixels (original image coords) that is
00560   // maximally compatible with the result in word_res.
00561   // Returns 0.0f if no x-height is found that is better than the current
00562   // estimate.
00563   float ComputeCompatibleXheight(WERD_RES *word_res);
00565   // TODO(ocr-team): Remove obsolete parameters.
00566   BOOL_VAR_H(tessedit_resegment_from_boxes, false,
00567              "Take segmentation and labeling from box file");
00568   BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
00569               "Conversion of word/line box file to char box file");
00570   BOOL_VAR_H(tessedit_train_from_boxes, false,
00571              "Generate training data from boxed chars");
00572   BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
00573              "Generate more boxes from boxed chars");
00574   BOOL_VAR_H(tessedit_dump_pageseg_images, false,
00575              "Dump intermediate images made during page segmentation");
00576   INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
00577             "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
00578             " 5=line, 6=word, 7=char"
00579             " (Values from PageSegMode enum in publictypes.h)");
00580   INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
00581             "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
00582             " to loading and running only Tesseract (no Cube, no combiner)."
00583             " (Values from OcrEngineMode enum in tesseractclass.h)");
00584   STRING_VAR_H(tessedit_char_blacklist, "",
00585                "Blacklist of chars not to recognize");
00586   STRING_VAR_H(tessedit_char_whitelist, "",
00587                "Whitelist of chars to recognize");
00588   BOOL_VAR_H(tessedit_ambigs_training, false,
00589              "Perform training for ambiguities");
00590   INT_VAR_H(pageseg_devanagari_split_strategy,
00591             tesseract::ShiroRekhaSplitter::NO_SPLIT,
00592             "Whether to use the top-line splitting process for Devanagari "
00593             "documents while performing page-segmentation.");
00594   INT_VAR_H(ocr_devanagari_split_strategy,
00595             tesseract::ShiroRekhaSplitter::NO_SPLIT,
00596             "Whether to use the top-line splitting process for Devanagari "
00597             "documents while performing ocr.");
00598   STRING_VAR_H(tessedit_write_params_to_file, "",
00599                "Write all parameters to the given file.");
00600   BOOL_VAR_H(tessedit_adapt_to_char_fragments, true,
00601              "Adapt to words that contain "
00602              " a character composed form fragments");
00603   BOOL_VAR_H(tessedit_adaption_debug, false,
00604              "Generate and print debug information for adaption");
00605   INT_VAR_H(applybox_debug, 1, "Debug level");
00606   INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
00607   STRING_VAR_H(applybox_exposure_pattern, ".exp",
00608                "Exposure value follows this pattern in the image"
00609                " filename. The name of the image files are expected"
00610                " to be in the form [lang].[fontname].exp[num].tif");
00611   BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
00612              "Learn both character fragments (as is done in the"
00613              " special low exposure mode) as well as unfragmented"
00614              " characters.");
00615   BOOL_VAR_H(applybox_learn_ngrams_mode, false,
00616              "Each bounding box is assumed to contain ngrams. Only"
00617              " learn the ngrams whose outlines overlap horizontally.");
// Assorted tessedit_* switches (drawing, training, fuzzy-space and hyphen
// fixing, x-height re-check, document dictionary) followed by debug flags.
// Each help string is the only description available in this header.
00618   BOOL_VAR_H(tessedit_draw_outwords, false, "Draw output words");
00619   BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
00620   BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
00621   BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
00622              "Try to improve fuzzy spaces");
00623   BOOL_VAR_H(tessedit_unrej_any_wd, false,
00624              "Dont bother with word plausibility");
00625   BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
00626   BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
00627   BOOL_VAR_H(tessedit_enable_doc_dict, true,
00628              "Add words to the document dictionary");
00629   BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
00630   BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
00631   INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
00632   BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
// Character sets used as leading and trailing punctuation. The leading set
// and the 2nd trailing set each end with an escaped double quote.
00633   STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
00634   STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
00635   STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
// Limits referred to by the help text as good_quality_doc limits: rejection
// and outline-error are described as lte limits, good blobs and good chars
// as gte limits.
// NOTE(review): lte/gte presumably mean less/greater than or equal, and the
// *_pc values look like fractions (0.0 to 1.0) rather than percentages; confirm.
00636   double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
00637   double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
00638   double_VAR_H(quality_outline_pc, 1.0,
00639                "good_quality_doc lte outline error limit");
00640   double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
00641   INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
// Adaption control: whether the reject map steers adaption, and the adaption
// decision mode (given here as the hex value 0x27).
// NOTE(review): 0x27 presumably is a bit mask of adaption criteria; the bit
// meanings are not visible in this header.
00642   BOOL_VAR_H(tessedit_tess_adapt_to_rejmap, false,
00643              "Use reject map to control Tesseract adaption");
00644   INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
00645             "Adaptation decision algorithm for tess");
00646   BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
00647              "Do minimal rejection on pass 1 output");
00648   BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
00649   BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
// NOTE(review): the help text below is identical to the one for
// tessedit_tess_adaption_mode; possibly copy-pasted, confirm intent.
00650   INT_VAR_H(tessedit_test_adaption_mode, 3,
00651             "Adaptation decision algorithm for tess");
00652   BOOL_VAR_H(save_best_choices, false,
00653              "Save the results of the recognition step"
00654              " (blob_choices) within the corresponding WERD_CHOICE");
// Test point switch and its x/y coordinates.
// NOTE(review): 99999.99 presumably acts as an off-page sentinel; confirm.
00655   BOOL_VAR_H(test_pt, false, "Test for point");
00656   double_VAR_H(test_pt_x, 99999.99, "xcoord");
00657   double_VAR_H(test_pt_y, 99999.99, "ycoord");
00658   INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
// Character sets described as having a non-standard number of outlines.
// NOTE(review): outlines_2 presumably lists characters made of two outlines
// and outlines_odd those with an irregular count; confirm in the users.
00659   STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
00660   STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
// Document-quality switches affecting unrejection and space rejection.
00661   BOOL_VAR_H(docqual_excuse_outline_errs, false,
00662              "Allow outline errs in unrejection?");
00663   BOOL_VAR_H(tessedit_good_quality_unrej, true,
00664              "Reduce rejection on good docs");
00665   BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
// Rejection percentages above which the whole document, block or row is
// rejected. Values are on a 0-100 scale, as the help text says %rej.
00666   double_VAR_H(tessedit_reject_doc_percent, 65.00,
00667                "%rej allowed before rej whole doc");
00668   double_VAR_H(tessedit_reject_block_percent, 45.00,
00669                "%rej allowed before rej whole block");
00670   double_VAR_H(tessedit_reject_row_percent, 40.00,
00671                "%rej allowed before rej whole row");
// Adjacent string literals are concatenated by the compiler with nothing in
// between, so the continuation literal starts with the separating space.
00672   double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
00673                "Number of row rejects in whole word rejects"
00674                " which prevents whole row rejection");
// Switches refining block and row rejection: whether fully accepted words
// survive a block/row rejection, whether good words are exempt, and the
// minimum word length that is preserved.
00675   BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
00676              "Only rej partially rejected words in block rejection");
00677   BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
00678              "Only rej partially rejected words in row rejection");
// The block and row variants below share one help string.
00679   BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
00680              "Use word segmentation quality metric");
00681   BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
00682              "Use word segmentation quality metric");
00683   INT_VAR_H(tessedit_preserve_min_wd_len, 2,
00684             "Only preserve wds longer than this");
00685   BOOL_VAR_H(tessedit_row_rej_good_docs, true,
00686              "Apply row rejection to good docs");
// NOTE(review): the value 1.1 is described as a fraction; a fraction above
// 1.0 presumably can never be exceeded, which would disable the test. Confirm.
00687   double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
00688                "rej good doc wd if more than this fraction rejected");
00689   BOOL_VAR_H(tessedit_reject_bad_qual_wds, true,
00690              "Reject all bad quality wds");
00691   BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
00692   BOOL_VAR_H(tessedit_debug_quality_metrics, false,
00693              "Output data to debug file");
// Unrejection switch described as applying no checks.
00694   BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
// NOTE(review): this help text is the same as the one on quality_char_pc and
// does not mention rows; possibly copy-pasted, confirm the intended wording.
00695   double_VAR_H(quality_rowrej_pc, 1.1,
00696                "good_quality_doc gte good char limit");
// Word crunching thresholds. Three tiers are visible from the names and help
// text: terrible/poor garbage, potential (pot) crunch, and delete (del).
// In the help strings lt/gt read as less/greater than and xht as x-height.
00697   BOOL_VAR_H(unlv_tilde_crunching, true,
00698              "Mark v.bad words for tilde crunch");
00699   BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
00700   BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
00701   double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
00702   BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
00703   double_VAR_H(crunch_poor_garbage_cert, -9.0,
00704                "crunch garbage cert lt this");
00705   double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
00706   double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
00707   double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
00708   BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
// NOTE(review): the two crunch_del_* help strings repeat the POTENTIAL
// wording of the crunch_pot_* entries; possibly copy-pasted, confirm.
00709   double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
00710   double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
// Geometric deletion limits, each expressed as a multiple of the x-height.
00711   double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
00712   double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
00713   double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
00714   double_VAR_H(crunch_del_high_word, 1.5,
00715                "Del if word gt xht x this above bl");
00716   double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
00717   double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
// Crunch rules based on the content of the word string: how many potential
// indicators are required, and which kinds of strings are left alone.
00718   INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
00719   INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
00720   BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
00721   BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
00722   BOOL_VAR_H(crunch_leave_accept_strings, false,
00723              "Dont pot crunch sensible strings");
00724   BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
// Paired limits: lc covers lower-case runs and uc covers upper-case runs.
// Each help string names its own case so the two are distinguishable.
00725   INT_VAR_H(crunch_leave_lc_strings, 4,
00726             "Dont crunch words with long lower case strings");
00727   INT_VAR_H(crunch_leave_uc_strings, 4,
00728             "Dont crunch words with long upper case strings");
00729   INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
00730   INT_VAR_H(crunch_debug, 0, "As it says");
// Space-fixing (fixsp_*) parameters, followed by numeric punctuation and
// x-height tolerances.
00731   INT_VAR_H(fixsp_non_noise_limit, 1,
00732             "How many non-noise blbs either side?");
00733   double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
00734   BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
00735   INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
00736   INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
// Punctuation characters that may appear inside a number.
00737   STRING_VAR_H(numeric_punctuation, ".,",
00738                "Punct. chs expected WITHIN numbers");
// NOTE(review): both x_ht_* values are presumably in pixels; confirm.
00739   INT_VAR_H(x_ht_acceptance_tolerance, 8,
00740             "Max allowed deviation of blob top outside of font data");
00741   INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
// Output-format switches (block separators, repetition codes, .unlv and
// hOCR output) and the character written for unidentified blobs.
00742   BOOL_VAR_H(tessedit_write_block_separators, false,
00743              "Write block separators in output");
00744   BOOL_VAR_H(tessedit_write_rep_codes, false,
00745              "Write repetition char code");
00746   BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
00747   BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
00748   STRING_VAR_H(unrecognised_char, "|",
00749                "Output char for unidentified blobs");
// Suspect-marking levels and rating limits.
// NOTE(review): the 999.9 and -999.9 values presumably act as sentinels that
// switch the corresponding rating test off; confirm in the users.
00750   INT_VAR_H(suspect_level, 99, "Suspect marker level");
00751   INT_VAR_H(suspect_space_level, 100,
00752             "Min suspect level for rejecting spaces");
00753   INT_VAR_H(suspect_short_words, 2,
00754             "Dont Suspect dict wds longer than this");
00755   BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
00756   double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
00757   double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
// Rejection policy switches, from minimal rejection to no rejection at all,
// followed by the rejection and acceptance algorithm selectors.
00758   BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
00759   BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
00760   BOOL_VAR_H(tessedit_word_for_word, false,
00761              "Make output have exactly one word per WERD");
00762   BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
00763              "Dont reject ANYTHING AT ALL");
00764   BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
00765   INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
00766   INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
// NOTE(review): the name says rejection debug but the help text says
// Adaption debug; possibly copy-pasted, confirm which is intended.
00767   BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
// Contextual flip switches: 0/O flipping and the lower and upper aspect
// ratio limits used by the dot/hyphen test.
00768   BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
00769   double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
00770                "Aspect ratio dot/hyphen test");
00771   double_VAR_H(tessedit_upper_flip_hyphen, 1.8,
00772                "Aspect ratio dot/hyphen test");
// rej_* switches controlling individual rejection tests, including the
// handling of the visually similar characters 1, I and l.
00773   BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
00774   BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
00775   BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
00776   BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
00777   BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
00778   BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
00779   BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
00780   BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
00781   double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
// Miscellaneous parameters: image border rejection, character sets, box-file
// output, page selection, interactive mode and debug levels.
00782   INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
// \075 is an octal escape for the equals sign, so the set is - ? * =
00783   STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075",
00784                "Allow NN to unrej");
00785   STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
00786   INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
00787   BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
// -1 selects all pages; any other value selects a single page.
00788   INT_VAR_H(tessedit_page_number, -1,
00789             "-1 -> All pages, else specific page to process");
00790   BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
00791   BOOL_VAR_H(interactive_mode, false, "Run interactively?");
00792   STRING_VAR_H(file_type, ".tif", "Filename extension");
00793   BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
00794   INT_VAR_H(tessdata_manager_debug_level, 0,
00795             "Debug level for TessdataManager functions.");
00796   // Min acceptable orientation margin (difference in scores between top and 2nd
00797   // choice in OSResults::orientations) to believe the page orientation.
00798   double_VAR_H(min_orientation_margin, 12.0,
00799                "Min acceptable orientation margin");
00800 
// Takes a file name and returns a FILE pointer.
// NOTE(review): presumably opens the output file that the recog-training
// functions declared next write to, leaving the caller to close it; confirm
// in the implementation, which is not visible in this header.
00802   FILE *init_recog_training(const STRING &fname);
// Declared with a file name, the page results, a progress monitor and an
// output file.
// NOTE(review): presumably processes the already segmented words in page_res
// and writes training output to output_file; confirm in the implementation.
00803   void recog_training_segmented(const STRING &fname,
00804                                 PAGE_RES *page_res,
00805                                 volatile ETEXT_DESC *monitor,
00806                                 FILE *output_file);
// Declared with one word result, the row and block results it belongs to,
// a text label and an output file.
// NOTE(review): presumably classifies the word and writes the result next to
// the label for ambiguity training; confirm in the implementation.
00807   void ambigs_classify_and_output(WERD_RES *werd_res,
00808                                   ROW_RES *row_res,
00809                                   BLOCK_RES *block_res,
00810                                   const char *label,
00811                                   FILE *output_file);
00812 
// Accessor that returns the stored cube_cntxt_ pointer unchanged; no null
// check is performed here.
00813   inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; }
00814 
00815  private:
00816   // The filename of a backup config file. If not null, a temporary debug
00817   // config file is currently loaded; backup_config_file_ will be loaded
00818   // and reset to null when debugging is complete.
00819   const char* backup_config_file_;
00820   // The filename of a config file to read when processing a debug word.
00821   STRING word_config_;
00822   Pix* pix_binary_;  // NOTE(review): presumably the binarized page image; confirm.
00823   Pix* pix_grey_;  // NOTE(review): presumably the grey-level page image; confirm.
00824   // The shiro-rekha splitter object, used to split top-lines in
00825   // Devanagari words to provide a better word and grapheme segmentation.
00826   ShiroRekhaSplitter splitter_;
00827   // True if the currently set pix_binary_ has been modified by any
00828   // processing, which may hurt the recognition phase of Cube.
00829   //
00830   bool orig_image_changed_;
00831   // Page segmentation/layout
00832   Textord textord_;
00833   // True if the primary language uses right_to_left reading order.
00834   bool right_to_left_;
00835   FCOORD deskew_;  // NOTE(review): presumably the deskew vector; confirm.
00836   FCOORD reskew_;  // NOTE(review): presumably the inverse of deskew_; confirm.
00837   TesseractStats stats_;
00838   // Cube objects. cube_cntxt_ is what GetCubeRecoContext() returns.
00839   CubeRecoContext* cube_cntxt_;
00840   TesseractCubeCombiner *tess_cube_combiner_;
00841 };
00842 
00843 }  // namespace tesseract
00844 
00845 
00846 #endif  // TESSERACT_CCMAIN_TESSERACTCLASS_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines