Tesseract 3.01
/data/source/tesseract-ocr/wordrec/wordrec.h
Go to the documentation of this file.
00001 
00002 // File:        wordrec.h
00003 // Description: wordrec class.
00004 // Author:      Samuel Charron
00005 //
00006 // (C) Copyright 2006, Google Inc.
00007 // Licensed under the Apache License, Version 2.0 (the "License");
00008 // you may not use this file except in compliance with the License.
00009 // You may obtain a copy of the License at
00010 // http://www.apache.org/licenses/LICENSE-2.0
00011 // Unless required by applicable law or agreed to in writing, software
00012 // distributed under the License is distributed on an "AS IS" BASIS,
00013 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 // See the License for the specific language governing permissions and
00015 // limitations under the License.
00016 //
00018 
00019 #ifndef TESSERACT_WORDREC_WORDREC_H__
00020 #define TESSERACT_WORDREC_WORDREC_H__
00021 
00022 #include "associate.h"
00023 #include "classify.h"
00024 #include "dict.h"
00025 #include "language_model.h"
00026 #include "ratngs.h"
00027 #include "matrix.h"
00028 #include "matchtab.h"
00029 #include "oldheap.h"
00030 #include "gradechop.h"
00031 #include "seam.h"
00032 #include "findseam.h"
00033 #include "callcpp.h"
00034 #include "associate.h"
00035 #include "pieces.h"
00036 #include "ratngs.h"
00037 #include "tally.h"
00038 
00039 struct CHUNKS_RECORD;
00040 struct SEARCH_RECORD;
00041 class WERD_RES;
00042 
00043 // A struct for storing child/parent pairs of the BLOB_CHOICE_LISTs
00044 // to be processed by the segmentation search.
00045 struct SEG_SEARCH_PENDING : public ELIST_LINK {
00046   SEG_SEARCH_PENDING(int child_row_arg,
00047                      BLOB_CHOICE_LIST *parent_arg,
00048                      tesseract::LanguageModelFlagsType changed_arg) :
00049     child_row(child_row_arg), parent(parent_arg), changed(changed_arg) {}
00050 
00051   // Comparator function for add_sorted().
00052   static int compare(const void *p1, const void *p2) {
00053     const SEG_SEARCH_PENDING *e1 = *reinterpret_cast<
00054       const SEG_SEARCH_PENDING * const *>(p1);
00055     const SEG_SEARCH_PENDING *e2 = *reinterpret_cast<
00056       const SEG_SEARCH_PENDING * const *>(p2);
00057     if (e1->child_row == e2->child_row &&
00058         e1->parent == e2->parent) return 0;
00059     return (e1->child_row < e2->child_row) ? -1 : 1;
00060   }
00061 
00062   int child_row;  // row of the child in the ratings matrix
00063   BLOB_CHOICE_LIST *parent;  // pointer to the parent BLOB_CHOICE_LIST
00064   // Flags that indicate which language model components are still active
00065   // on the parent path (i.e. recorded some changes to the language model
00066   // state) and need to be invoked for this pending entry.
00067   // This field is used as an argument to LanguageModel::UpdateState()
00068   // in Wordrec::UpdateSegSearchNodes().
00069   tesseract::LanguageModelFlagsType changed;
00070 };
00071 
00072 ELISTIZEH(SEG_SEARCH_PENDING);
00073 
00074 
00075 namespace tesseract {
00076 
00077 /* ccmain/tstruct.cpp *********************************************************/
00078 class FRAGMENT:public ELIST_LINK
00079 {
00080   public:
00081     FRAGMENT() {  //constructor
00082     }
00083     FRAGMENT(EDGEPT *head_pt,   //start
00084              EDGEPT *tail_pt);  //end
00085 
00086     ICOORD head;                 //coords of start
00087     ICOORD tail;                 //coords of end
00088     EDGEPT *headpt;              //start point
00089     EDGEPT *tailpt;              //end point
00090 };
00091 ELISTIZEH(FRAGMENT)
00092 
00093 
00094 class Wordrec : public Classify {
00095  public:
00096   // config parameters *******************************************************
00097   BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
00098   BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
00099   BOOL_VAR_H(force_word_assoc, FALSE,
00100              "force associator to run regardless of what enable_assoc is."
00101              "This is used for CJK where component grouping is necessary.");
00102   INT_VAR_H(wordrec_num_seg_states, 30, "Segmentation states");
00103   double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
00104   BOOL_VAR_H(fragments_guide_chopper, FALSE,
00105              "Use information from fragments to guide chopping process");
00106   INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
00107   double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
00108   INT_VAR_H(chop_debug, 0, "Chop debug");
00109   BOOL_VAR_H(chop_enable, 1, "Chop enable");
00110   BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
00111   INT_VAR_H(chop_split_length, 10000, "Split Length");
00112   INT_VAR_H(chop_same_distance, 2, "Same distance");
00113   INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
00114   INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
00115   INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
00116   double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
00117   double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
00118   double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
00119   double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
00120   double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
00121   double_VAR_H(chop_ok_split, 100.0, "OK split limit");
00122   double_VAR_H(chop_good_split, 50.0, "Good split limit");
00123   INT_VAR_H(chop_x_y_weight, 3, "X / Y  length weight");
00124   INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
00125   BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
00126              "include fixed-pitch heuristics in char segmentation");
00127   BOOL_VAR_H(use_new_state_cost, FALSE,
00128              "use new state cost heuristics for segmentation state evaluation");
00129   double_VAR_H(heuristic_segcost_rating_base, 1.25,
00130                "base factor for adding segmentation cost into word rating."
00131                "It's a multiplying factor, the larger the value above 1, "
00132                "the bigger the effect of segmentation cost.");
00133   double_VAR_H(heuristic_weight_rating, 1,
00134                "weight associated with char rating in combined cost of state");
00135   double_VAR_H(heuristic_weight_width, 0,
00136                "weight associated with width evidence in combined cost of state");
00137   double_VAR_H(heuristic_weight_seamcut, 0,
00138                "weight associated with seam cut in combined cost of state");
00139   double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
00140                "max char width-to-height ratio allowed in segmentation");
00141   INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
00142   BOOL_VAR_H(enable_new_segsearch, false,
00143              "Enable new segmentation search path.");
00144   INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
00145   INT_VAR_H(segsearch_max_pain_points, 2000,
00146             "Maximum number of pain points stored in the queue");
00147   INT_VAR_H(segsearch_max_futile_classifications, 10,
00148             "Maximum number of pain point classifications per word.");
00149   double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
00150                "Maximum character width-to-height ratio");
00151   double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
00152                "Maximum character width-to-height ratio for"
00153                "fixed pitch fonts");
00154 
00155   // methods from wordrec/*.cpp ***********************************************
00156   Wordrec();
00157   virtual ~Wordrec();
00158 
00159   void CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from,
00160                        BLOB_CHOICE_LIST_VECTOR *to);
00161 
00162   // tface.cpp
00163   void program_editup(const char *textbase,
00164                       bool init_classifier,
00165                       bool init_permute);
00166   BLOB_CHOICE_LIST_VECTOR *cc_recog(WERD_RES *word);
00167   void program_editdown(inT32 elasped_time);
00168   void set_pass1();
00169   void set_pass2();
00170   int end_recog();
00171   BLOB_CHOICE_LIST *call_matcher(TBLOB* blob);
00172   int dict_word(const WERD_CHOICE &word);
00173   // wordclass.cpp
00174   BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
00175                                   const char *string,
00176                                   C_COL color);
00177   BLOB_CHOICE_LIST *fake_classify_blob(UNICHAR_ID class_id,
00178                                        float rating, float certainty);
00179   void update_blob_classifications(TWERD *word,
00180                                    const BLOB_CHOICE_LIST_VECTOR &choices);
00181 
00182   // bestfirst.cpp
00183   BLOB_CHOICE_LIST_VECTOR *evaluate_chunks(CHUNKS_RECORD *chunks_record,
00184                                            SEARCH_STATE search_state);
00185   void update_ratings(const BLOB_CHOICE_LIST_VECTOR &new_choices,
00186                       const CHUNKS_RECORD *chunks_record,
00187                       const SEARCH_STATE search_state);
00188   inT16 evaluate_state(CHUNKS_RECORD *chunks_record,
00189                        SEARCH_RECORD *the_search,
00190                        DANGERR *fixpt);
00191   SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
00192                             int num_joints,
00193                             BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00194                             WERD_CHOICE *best_choice,
00195                             WERD_CHOICE *raw_choice,
00196                             STATE *state);
00197   void best_first_search(CHUNKS_RECORD *chunks_record,
00198                          BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00199                          WERD_RES *word,
00200                          STATE *state,
00201                          DANGERR *fixpt,
00202                          STATE *best_state);
00203   void delete_search(SEARCH_RECORD *the_search);
00204   void expand_node(FLOAT32 worst_priority,
00205                    CHUNKS_RECORD *chunks_record,
00206                    SEARCH_RECORD *the_search);
00207   void replace_char_widths(CHUNKS_RECORD *chunks_record,
00208                            SEARCH_STATE state);
00209   // Transfers the given state to the word's output fields: rebuild_word,
00210   // best_state, box_word, and returns the corresponding blob choices.
00211   BLOB_CHOICE_LIST_VECTOR *rebuild_current_state(
00212       WERD_RES *word,
00213       STATE *state,
00214       BLOB_CHOICE_LIST_VECTOR *char_choices,
00215       MATRIX *ratings);
00216   // Creates a fake blob choice from the combination of the given fragments.
00217   // unichar is the class to be made from the combination,
00218   // expanded_fragment_lengths[choice_index] is the number of fragments to use.
00219   // old_choices[choice_index] has the classifier output for each fragment.
00220   // choice index initially indexes the last fragment and should be decremented
00221   // expanded_fragment_lengths[choice_index] times to get the earlier fragments.
00222   // Guarantees to return something non-null, or abort!
00223   BLOB_CHOICE* rebuild_fragments(
00224       const char* unichar,
00225       const char* expanded_fragment_lengths,
00226       int choice_index,
00227       BLOB_CHOICE_LIST_VECTOR *old_choices);
00228   // Creates a joined copy of the blobs between x and y (inclusive) and
00229   // insert into the rebuild_word in word.
00230   // Returns a deep copy of the classifier results for the blob.
00231   BLOB_CHOICE_LIST *join_blobs_and_classify(
00232       WERD_RES* word, int x, int y, int choice_index, MATRIX *ratings,
00233       BLOB_CHOICE_LIST_VECTOR *old_choices);
00234   STATE *pop_queue(HEAP *queue);
00235   void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
00236                   FLOAT32 priority, bool debug);
00237 
00238   // segsearch.cpp
00239   // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
00240   // Each entry in the matrix represents the classification choice
00241   // for a chunk, i.e. an entry in row 2, column 1 represents the list
00242   // of ratings for the chunks 1 and 2 classified as a single blob.
00243   // The entries on the diagonal of the matrix are classifier choice lists
00244   // for a single chunk from the maximal segmentation.
00245   //
00246   // The ratings matrix given to SegSearch represents the segmentation
00247   // graph / trellis for the current word. The nodes in the graph are the
00248   // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
00249   // matrix. The children of each node (nodes connected by outgoing links)
00250   // are the entries in the column that is equal to node's row+1. The parents
00251   // (nodes connected by the incoming links) are the entries in the row that
00252   // is equal to the node's column-1. Here is an example ratings matrix:
00253   //
00254   //    0    1    2   3   4
00255   //  -------------------------
00256   // 0| c,(                   |
00257   // 1| d    l,1              |
00258   // 2|           o           |
00259   // 3|              c,(      |
00260   // 4|              g,y  l,1 |
00261   //  -------------------------
00262   //
00263   // In the example above node "o" has children (outgoing connection to nodes)
00264   // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
00265   //
00266   // The objective of the search is to find the least cost path, where the cost
00267   // is determined by the language model components and the properties of the
00268   // cut between the blobs on the path. SegSearch starts by populating the
00269   // matrix with the all the entries that were classified by the chopper and
00270   // finding the initial best path. Based on the classifier ratings, language
00271   // model scores and the properties of each cut, a list of "pain points" is
00272   // constructed - those are the points on the path where the choices do not
00273   // look consistent with the neighboring choices, the cuts look particularly
00274   // problematic, or the certainties of the blobs are low. The most troublesome
00275   // "pain point" is picked from the list and the new entry in the ratings
00276   // matrix corresponding to this "pain point" is filled in. Then the language
00277   // model state is updated to reflect the new classification and the new
00278   // "pain points" are added to the list and the next most troublesome
00279   // "pain point" is determined. This continues until either the word choice
00280   // composed from the best paths in the segmentation graph is "good enough"
00281   // (e.g. above a certain certainty threshold, is an unambiguous dictionary
00282   // word, etc) or there are no more "pain points" to explore.
00283   void SegSearch(CHUNKS_RECORD *chunks_record,
00284                  WERD_CHOICE *best_choice,
00285                  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00286                  WERD_CHOICE *raw_choice,
00287                  STATE *output_best_state);
00288 
00289   // chop.cpp
00290   PRIORITY point_priority(EDGEPT *point);
00291   void add_point_to_list(POINT_GROUP point_list, EDGEPT *point);
00292   int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
00293   int is_little_chunk(EDGEPT *point1, EDGEPT *point2);
00294   int is_small_area(EDGEPT *point1, EDGEPT *point2);
00295   EDGEPT *pick_close_point(EDGEPT *critical_point,
00296                            EDGEPT *vertical_point,
00297                            int *best_dist);
00298   void prioritize_points(TESSLINE *outline, POINT_GROUP points);
00299   void new_min_point(EDGEPT *local_min, POINT_GROUP points);
00300   void new_max_point(EDGEPT *local_max, POINT_GROUP points);
00301   void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
00302                                  EDGEPT** best_point);
00303 
00304   // chopper.cpp
00305   SEAM *attempt_blob_chop(TWERD *word, inT32 blob_number, bool italic_blob,
00306                           SEAMS seam_list);
00307   bool improve_one_blob(TWERD *word,
00308                         BLOB_CHOICE_LIST_VECTOR *char_choices,
00309                         inT32 *blob_number,
00310                         SEAMS *seam_list,
00311                         DANGERR *fixpt,
00312                         bool split_next_to_fragment);
00313   void modify_blob_choice(BLOB_CHOICE_LIST *answer,
00314                           int chop_index);
00315   bool chop_one_blob(TWERD *word,
00316                      BLOB_CHOICE_LIST_VECTOR *char_choices,
00317                      inT32 *blob_number,
00318                      SEAMS *seam_list,
00319                      int *right_chop_index);
00320   BLOB_CHOICE_LIST_VECTOR *chop_word_main(WERD_RES *word);
00321   void improve_by_chopping(WERD_RES *word,
00322                            BLOB_CHOICE_LIST_VECTOR *char_choices,
00323                            STATE *best_state,
00324                            BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00325                            DANGERR *fixpt,
00326                            bool *updated_best_choice);
00327   MATRIX *word_associator(WERD_RES *word,
00328                           STATE *state,
00329                           BLOB_CHOICE_LIST_VECTOR *best_char_choices,
00330                           DANGERR *fixpt,
00331                           STATE *best_state);
00332   inT16 select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00333                              float rating_ceiling,
00334                              bool split_next_to_fragment);
00335 
00336   // findseam.cpp
00337   void junk_worst_seam(SEAM_QUEUE seams, SEAM *new_seam, float new_priority);
00338   void choose_best_seam(SEAM_QUEUE seam_queue,
00339                         SEAM_PILE *seam_pile,
00340                         SPLIT *split,
00341                         PRIORITY priority,
00342                         SEAM **seam_result,
00343                         TBLOB *blob);
00344   void combine_seam(SEAM_QUEUE seam_queue, SEAM_PILE seam_pile, SEAM *seam);
00345   inT16 constrained_split(SPLIT *split, TBLOB *blob);
00346   void delete_seam_pile(SEAM_PILE seam_pile);
00347   SEAM *pick_good_seam(TBLOB *blob);
00348   PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax);
00349   void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS],
00350                         inT16 num_points,
00351                         SEAM_QUEUE seam_queue,
00352                         SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00353   void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS],
00354                            inT16 num_points,
00355                            SEAM_QUEUE seam_queue,
00356                            SEAM_PILE * seam_pile, SEAM ** seam, TBLOB * blob);
00357 
00358   // gradechop.cpp
00359   PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax);
00360   PRIORITY grade_center_of_blob(register BOUNDS_RECT rect);
00361   PRIORITY grade_overlap(register BOUNDS_RECT rect);
00362   PRIORITY grade_split_length(register SPLIT *split);
00363   PRIORITY grade_sharpness(register SPLIT *split);
00364   PRIORITY grade_width_change(register BOUNDS_RECT rect);
00365   void set_outline_bounds(register EDGEPT *point1,
00366                           register EDGEPT *point2,
00367                           BOUNDS_RECT rect);
00368 
00369   // outlines.cpp
00370   int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline);
00371   int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1);
00372   int is_same_edgept(EDGEPT *p1, EDGEPT *p2);
00373   EDGEPT *near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1);
00374   void reverse_outline(EDGEPT *outline);
00375 
00376   // pieces.cpp
00377   virtual BLOB_CHOICE_LIST *classify_piece(TBLOB *pieces,
00378                                            SEAMS seams,
00379                                            inT16 start,
00380                                            inT16 end);
00381   BLOB_CHOICE_LIST *get_piece_rating(MATRIX *ratings,
00382                                      TBLOB *blobs,
00383                                      SEAMS seams,
00384                                      inT16 start,
00385                                      inT16 end);
00386   BOUNDS_LIST record_blob_bounds(TBLOB *blobs);
00387   MATRIX *record_piece_ratings(TBLOB *blobs);
00388 
00389   // heuristic.cpp
00390   WIDTH_RECORD* state_char_widths(WIDTH_RECORD *chunk_widths,
00391                                   STATE *state,
00392                                   int num_joints);
00393   FLOAT32 get_width_variance(WIDTH_RECORD *wrec, float norm_height);
00394   FLOAT32 get_gap_variance(WIDTH_RECORD *wrec, float norm_height);
00395   FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
00396                            SEARCH_RECORD *the_search);
00397   FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
00398                          STATE *state,
00399                          int num_joints);
00400   FLOAT32 seamcut_priority(SEAMS seams,
00401                            STATE *state,
00402                            int num_joints);
00403   FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
00404                           STATE *state,
00405                           int num_joints);
00406 
00407   // Member variables.
00408 
00409   LanguageModel *language_model_;
00410   PRIORITY pass2_ok_split;
00411   int pass2_seg_states;
00412   int num_joints;
00413   int num_pushed;
00414   int num_popped;
00415   TALLY states_before_best;
00416   TALLY best_certainties[2];
00417   TALLY character_widths;          /* Width histogram */
00418   BlobMatchTable blob_match_table;
00419   EVALUATION_ARRAY last_segmentation;
00420   // Stores the best choice for the previous word in the paragraph.
00421   // This variable is modified by PAGE_RES_IT when iterating over
00422   // words to OCR on the page.
00423   WERD_CHOICE *prev_word_best_choice_;
00424 
00425  protected:
00426   // Updates the language model state recorded for the child entries specified
00427   // in pending[starting_col]. Enqueues the children of the updated entries
00428   // into pending and proceedes to update (and remove from pending) all the
00429   // remaining entries in pending[col] (col >= starting_col). Upon termination
00430   // of this function all the pending[col] lists will be empty.
00431   //
00432   // The arguments:
00433   //
00434   // starting_col: index of the column in chunks_record->ratings from
00435   // which the update should be started
00436   //
00437   // pending: list of entries listing chunks_record->ratings entries
00438   // that should be updated
00439   //
00440   // pain_points: priority heap listing the pain points generated by
00441   // the language model
00442   //
00443   // temp_pain_points: temporary storage for tentative pain points generated
00444   // by the language model after a single call to LanguageModel::UpdateState()
00445   // (the agrument is passed in rather than created before each
00446   // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
00447   //
00448   // best_choice_bundle: a collection of variables that should be updated
00449   // if a new best choice is found
00450   //
00451   void UpdateSegSearchNodes(int starting_col,
00452                             SEG_SEARCH_PENDING_LIST *pending[],
00453                             BestPathByColumn *best_path_by_column[],
00454                             CHUNKS_RECORD *chunks_record,
00455                             HEAP *pain_points,
00456                             BestChoiceBundle *best_choice_bundle);
00457 };
00458 
00459 
00460 }  // namespace tesseract
00461 
00462 #endif  // TESSERACT_WORDREC_WORDREC_H__
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines