00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00020
00021 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
00022 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
00023
00024 #include "varable.h"
00025 #include "wordrec.h"
00026 #include "ocrclass.h"
00027 #include "control.h"
00028 #include "docqual.h"
00029
// Forward declarations. The Tesseract class declared further down in this
// header only uses these types through pointers or references, so the
// declarations here are enough for its member signatures.
00030 class CHAR_SAMPLES_LIST;
00031 class CHAR_SAMPLE_LIST;
00032 class PAGE_RES;
00033 class PAGE_RES_IT;
00034 class BLOCK_LIST;
00035 class TO_BLOCK_LIST;
00036 class IMAGE;
00037 class WERD_RES;
00038 class ROW;
00039 class TBOX;
00040 class SVMenuNode;
// Pix is forward-declared as a struct, unlike the other types here.
00041 struct Pix;
00042 class WERD_CHOICE;
00043 class WERD;
00044 class BLOB_CHOICE_LIST_CLIST;
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
00072 namespace tesseract {
00073
// Tesseract is publicly derived from Wordrec. Apart from three small inline
// accessors, every member function is only declared in this header; the
// definitions are not visible here.
// NOTE(review): the embedded listing numbers skip values in several places
// (for example 00101 and 00163) - presumably comment lines that were stripped
// from this listing; confirm against the original header.
00074 class Tesseract : public Wordrec {
00075 public:
00076 Tesseract();
00077 ~Tesseract();
00078
// Declared without a body here. mutable_pix_binary() below calls it before
// exposing pix_binary_.
00079 void Clear();
00080
00081
// Returns a const reference to the reskew_ member.
00082 const FCOORD& reskew() const {
00083 return reskew_;
00084 }
00085
// Calls Clear() and then returns the address of the pix_binary_ member, i.e.
// a writable slot for the Pix pointer.
// NOTE(review): what Clear() does to an existing pix is not visible in this
// header - confirm before relying on the previous image after this call.
00086 Pix** mutable_pix_binary() {
00087 Clear();
00088 return &pix_binary_;
00089 }
// Returns the current value of the pix_binary_ pointer without modifying it.
00090 Pix* pix_binary() const {
00091 return pix_binary_;
00092 }
00093
00094 void SetBlackAndWhitelist();
// Both functions below return int; the meaning of the return value is not
// visible in this header.
00095 int SegmentPage(const STRING* input_file,
00096 IMAGE* image, BLOCK_LIST* blocks);
00097 int AutoPageSeg(int width, int height, int resolution,
00098 bool single_column, IMAGE* image,
00099 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
00100
// target_word_box defaults to 0L (a null pointer) and dopasses defaults to 0,
// so callers may pass only page_res and monitor.
00102 void recog_all_words(
00103 PAGE_RES *page_res,
00104
00105 volatile ETEXT_DESC *monitor,
00106 TBOX *target_word_box=0L,
00107 inT16 dopasses=0
00108 );
00109 void classify_word_pass1(
00110 WERD_RES *word,
00111 ROW *row,
00112 BLOCK* block,
00113 BOOL8 cluster_adapt,
00114 CHAR_SAMPLES_LIST *char_clusters,
00115 CHAR_SAMPLE_LIST *chars_waiting);
00116 void recog_pseudo_word(
00117 BLOCK_LIST *block_list,
00118 TBOX &selection_box);
00119
00120
00121
00122 C_BLOB_LIST* get_blobs_from_blocks(BLOCK_LIST* blocks
00123 );
00124
00125
00126
00127 void train_word_level_with_boxes(
00128 const STRING& box_file,
00129 const STRING& out_file,
00130 BLOCK_LIST* blocks
00131 );
00132 void fix_rep_char(WERD_RES *word);
00133 void fix_quotes(
00134 WERD_CHOICE *choice,
00135 WERD *word,
00136 BLOB_CHOICE_LIST_CLIST *blob_choices);
00137 ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,
00138 const char *lengths);
00139 void match_word_pass2(
00140 WERD_RES *word,
00141 ROW *row,
00142 BLOCK* block,
00143 float x_height);
// Note the parameter order: block comes before row here, whereas
// classify_word_pass1 and match_word_pass2 above take row before block.
00144 void classify_word_pass2(
00145 WERD_RES *word,
00146 BLOCK* block,
00147 ROW *row);
// The signature (BLOCK*, ROW*, WERD*) returning BOOL8 matches the
// word_processor member-function pointer taken by process_selected_words.
00148 BOOL8 recog_interactive(
00149 BLOCK *block,
00150 ROW *row,
00151 WERD *word
00152 );
00153 void fix_hyphens(
00154 WERD_CHOICE *choice,
00155 WERD *word,
00156 BLOB_CHOICE_LIST_CLIST *blob_choices);
00157 void set_word_fonts(
00158 WERD_RES *word,
00159 BLOB_CHOICE_LIST_CLIST *blob_choices);
00160 void font_recognition_pass(
00161 PAGE_RES_IT &page_res_it);
00162
00164
00165 void output_pass(
00166 PAGE_RES_IT &page_res_it,
00167 BOOL8 write_to_shm,
00168 TBOX *target_word_box);
// Returns a FILE*; who closes it is not visible in this header.
00169 FILE *open_outfile(
00170 const char *extension);
00171 void write_results(
00172 PAGE_RES_IT &page_res_it,
00173 char newline_type,
00174 BOOL8 force_eol,
00175 BOOL8 write_to_shm
00176 );
00177 void set_unlv_suspects(WERD_RES *word);
00178 UNICHAR_ID get_rep_char(WERD_RES *word);
00179 BOOL8 acceptable_number_string(const char *s,
00180 const char *lengths);
// count_alphanums is overloaded: this version takes a const WERD_CHOICE&,
// a second one further down takes a WERD_RES*.
00181 inT16 count_alphanums(const WERD_CHOICE &word);
00182 inT16 count_alphas(const WERD_CHOICE &word);
00184 void read_config_file(const char *filename, bool global_only);
// init_tesseract, init_tesseract_classifier and init_tesseract_lang_data all
// take the same six parameters (arg0, textbase, language, configs,
// configs_size, configs_global_only). init_tesseract_lm takes only the first
// three. The first three functions return int, init_tesseract_lang_data
// returns bool; the meaning of the return values is not visible here.
00185 int init_tesseract(const char *arg0,
00186 const char *textbase,
00187 const char *language,
00188 char **configs,
00189 int configs_size,
00190 bool configs_global_only);
00191
00192 int init_tesseract_lm(const char *arg0,
00193 const char *textbase,
00194 const char *language);
00195
00196
00197 int init_tesseract_classifier(const char *arg0,
00198 const char *textbase,
00199 const char *language,
00200 char **configs,
00201 int configs_size,
00202 bool configs_global_only);
00203
// Takes the image name by non-const reference.
00204 void recognize_page(STRING& image_name);
00205 void end_tesseract();
00206
00207 bool init_tesseract_lang_data(const char *arg0,
00208 const char *textbase,
00209 const char *language,
00210 char **configs,
00211 int configs_size,
00212 bool configs_global_only);
00213
// Returns an SVMenuNode*; ownership of the returned node is not visible here.
00215 SVMenuNode *build_menu_new();
00216 void pgeditor_main(BLOCK_LIST *blocks);
00217 void process_image_event(
00218 const SVEvent &event);
00219 void pgeditor_read_file(
00220 STRING &filename,
00221 BLOCK_LIST *blocks
00222 );
00223 void do_new_source(
00224 );
00225 BOOL8 process_cmd_win_event(
00226 inT32 cmd_event,
00227 char *new_value
00228 );
00230 const char *char_ambiguities(char c);
00231 void make_reject_map(
00232 WERD_RES *word,
00233 BLOB_CHOICE_LIST_CLIST *blob_choices,
00234 ROW *row,
00235 inT16 pass
00236 );
00237 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
// The next four helpers each take a word string together with a parallel
// word_lengths string.
// NOTE(review): presumably word_lengths holds the byte length of each
// character in word - confirm in the definitions.
00238 inT16 first_alphanum_index(const char *word,
00239 const char *word_lengths);
00240 inT16 first_alphanum_offset(const char *word,
00241 const char *word_lengths);
00242 inT16 alpha_count(const char *word,
00243 const char *word_lengths);
00244 BOOL8 word_contains_non_1_digit(const char *word,
00245 const char *word_lengths);
00246 void dont_allow_1Il(WERD_RES *word);
// Second overload of count_alphanums, taking a WERD_RES* instead of a
// const WERD_CHOICE&.
00247 inT16 count_alphanums(
00248 WERD_RES *word);
00249 BOOL8 repeated_ch_string(const char *rep_ch_str,
00250 const char *lengths);
00251 void flip_0O(WERD_RES *word);
00252 BOOL8 non_0_digit(UNICHAR_ID unichar_id);
00253 BOOL8 non_O_upper(UNICHAR_ID unichar_id);
00254 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
00255 void nn_match_word(
00256 WERD_RES *word,
00257 ROW *row);
00258 void nn_recover_rejects(WERD_RES *word, ROW *row);
00259 BOOL8 test_ambig_word(
00260 WERD_RES *word);
00261 void set_done(
00262 WERD_RES *word,
00263 inT16 pass);
00264 inT16 safe_dict_word(const WERD_CHOICE &word);
00265 void flip_hyphens(WERD_RES *word);
// Most of the functions in the following group share the char_clusters /
// chars_waiting pair of list pointers that classify_word_pass1 also takes.
00267 void adapt_to_good_ems(WERD_RES *word,
00268 CHAR_SAMPLES_LIST *char_clusters,
00269 CHAR_SAMPLE_LIST *chars_waiting);
00270 void adapt_to_good_samples(WERD_RES *word,
00271 CHAR_SAMPLES_LIST *char_clusters,
00272 CHAR_SAMPLE_LIST *chars_waiting);
00273 BOOL8 word_adaptable(
00274 WERD_RES *word,
00275 uinT16 mode);
00276 void reject_suspect_ems(WERD_RES *word);
00277 void collect_ems_for_adaption(WERD_RES *word,
00278 CHAR_SAMPLES_LIST *char_clusters,
00279 CHAR_SAMPLE_LIST *chars_waiting);
00280 void collect_characters_for_adaption(WERD_RES *word,
00281 CHAR_SAMPLES_LIST *char_clusters,
00282 CHAR_SAMPLE_LIST *chars_waiting);
// CHAR_SAMPLE and CHAR_SAMPLES (used below) are not among the forward
// declarations at the top of this header, so they must come from one of the
// included headers.
00283 void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
00284 CHAR_SAMPLE *sample,
00285 CHAR_SAMPLES *best_cluster);
00286 void cluster_sample(CHAR_SAMPLE *sample,
00287 CHAR_SAMPLES_LIST *char_clusters,
00288 CHAR_SAMPLE_LIST *chars_waiting);
00289 void complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
00290 CHAR_SAMPLE_LIST *chars_waiting);
00291
// recog_word_recursive, recog_word and split_and_recog_word share the same
// nine-parameter list and all return a WERD_CHOICE*. raw_choice and outword
// are references to pointers, so the callee is able to replace the caller's
// pointers. Ownership of the returned and replaced objects is not visible in
// this header.
00293 WERD_CHOICE *recog_word_recursive(
00294 WERD *word,
00295 DENORM *denorm,
00296
00297 POLY_MATCHER matcher,
00298
00299 POLY_TESTER tester,
00300
00301 POLY_TESTER trainer,
00302 BOOL8 testing,
00303
00304 WERD_CHOICE *&raw_choice,
00305
00306 BLOB_CHOICE_LIST_CLIST *blob_choices,
00307 WERD *&outword
00308 );
00309 WERD_CHOICE *recog_word(
00310 WERD *word,
00311 DENORM *denorm,
00312 POLY_MATCHER matcher,
00313 POLY_TESTER tester,
00314 POLY_TESTER trainer,
00315 BOOL8 testing,
00316 WERD_CHOICE *&raw_choice,
00317
00318 BLOB_CHOICE_LIST_CLIST *blob_choices,
00319 WERD *&outword
00320 );
00321 WERD_CHOICE *split_and_recog_word(
00322 WERD *word,
00323 DENORM *denorm,
00324
00325 POLY_MATCHER matcher,
00326
00327 POLY_TESTER tester,
00328
00329 POLY_TESTER trainer,
00330 BOOL8 testing,
00331
00332 WERD_CHOICE *&raw_choice,
00333
00334 BLOB_CHOICE_LIST_CLIST *blob_choices,
00335 WERD *&outword
00336 );
00338 BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
00339 inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
00340 void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
00341 inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
00342 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
00343 void fix_fuzzy_space_list(
00344 WERD_RES_LIST &best_perm,
00345 ROW *row,
00346 BLOCK* block);
00347 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
00348 void fix_fuzzy_spaces(
00349 volatile ETEXT_DESC *monitor,
00350 inT32 word_count,
00351 PAGE_RES *page_res);
// garbage_word returns a GARBAGE_LEVEL, which potential_word_crunch accepts
// as its garbage_level parameter.
00353 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
00354 BOOL8 potential_word_crunch(WERD_RES *word,
00355 GARBAGE_LEVEL garbage_level,
00356 BOOL8 ok_dict_word);
00357 void tilde_crunch(PAGE_RES_IT &page_res_it);
00358 void unrej_good_quality_words(
00359 PAGE_RES_IT &page_res_it);
00360 void doc_and_block_rejection(
00361 PAGE_RES_IT &page_res_it,
00362 BOOL8 good_quality_doc);
00363 void quality_based_rejection(PAGE_RES_IT &page_res_it,
00364 BOOL8 good_quality_doc);
00365 void convert_bad_unlv_chs(WERD_RES *word_res);
00366 void merge_tess_fails(WERD_RES *word_res);
00367 void tilde_delete(PAGE_RES_IT &page_res_it);
00368 void insert_rej_cblobs(WERD_RES *word);
// word_processor is a pointer to a Tesseract member function taking
// (BLOCK*, ROW*, WERD*) and returning BOOL8; recog_interactive, declared
// earlier in this class, has that signature.
00370 void
00371 process_selected_words (
00372 BLOCK_LIST * block_list,
00373
00374 TBOX & selection_box,
00375 BOOL8 (tesseract::Tesseract::*word_processor) (
00376 BLOCK *,
00377 ROW *,
00378 WERD *));
00380 void tess_add_doc_word(
00381 WERD_CHOICE *word_choice
00382 );
00383 void tess_adapter(
00384 WERD *word,
00385 DENORM *denorm,
00386 const WERD_CHOICE& choice,
00387 const WERD_CHOICE& raw_choice,
00388 const char *rejmap
00389 );
// The four *_segment_pass* functions below share the parameters word, denorm,
// matcher, raw_choice, blob_choices and outword. test_segment_pass2 and
// correct_segment_pass2 additionally take a POLY_TESTER; tess_segment_pass1
// and tess_segment_pass2 do not.
00390 WERD_CHOICE *test_segment_pass2(
00391 WERD *word,
00392 DENORM *denorm,
00393 POLY_MATCHER matcher,
00394 POLY_TESTER tester,
00395
00396 WERD_CHOICE *&raw_choice,
00397
00398 BLOB_CHOICE_LIST_CLIST *blob_choices,
00399 WERD *&outword
00400 );
00401 WERD_CHOICE *tess_segment_pass1(
00402 WERD *word,
00403 DENORM *denorm,
00404 POLY_MATCHER matcher,
00405
00406 WERD_CHOICE *&raw_choice,
00407
00408 BLOB_CHOICE_LIST_CLIST *blob_choices,
00409 WERD *&outword
00410 );
00411 WERD_CHOICE *tess_segment_pass2(
00412 WERD *word,
00413 DENORM *denorm,
00414 POLY_MATCHER matcher,
00415
00416 WERD_CHOICE *&raw_choice,
00417
00418 BLOB_CHOICE_LIST_CLIST *blob_choices,
00419 WERD *&outword
00420 );
00421 WERD_CHOICE *correct_segment_pass2(
00422 WERD *word,
00423 DENORM *denorm,
00424 POLY_MATCHER matcher,
00425 POLY_TESTER tester,
00426
00427 WERD_CHOICE *&raw_choice,
00428
00429 BLOB_CHOICE_LIST_CLIST *blob_choices,
00430 WERD *&outword
00431 );
// The three matchers below share the first six parameters (pblob, blob,
// nblob, word, denorm, ratings) and differ only in the trailing one:
// tess_default_matcher adds a script string, tess_bn_matcher adds nothing,
// and tess_cn_matcher adds CLASS_PRUNER_RESULTS.
00432 void tess_default_matcher(
00433 PBLOB *pblob,
00434 PBLOB *blob,
00435 PBLOB *nblob,
00436 WERD *word,
00437 DENORM *denorm,
00438 BLOB_CHOICE_LIST *ratings,
00439 const char* script
00440 );
00441 void tess_bn_matcher(
00442 PBLOB *pblob,
00443 PBLOB *blob,
00444 PBLOB *nblob,
00445 WERD *word,
00446 DENORM *denorm,
00447 BLOB_CHOICE_LIST *ratings
00448 );
00449 void tess_cn_matcher(
00450 PBLOB *pblob,
00451 PBLOB *blob,
00452 PBLOB *nblob,
00453 WERD *word,
00454 DENORM *denorm,
00455 BLOB_CHOICE_LIST *ratings,
00456
00457 CLASS_PRUNER_RESULTS cpresults
00458 );
00459 BOOL8 tess_adaptable_word(
00460 WERD *word,
00461 WERD_CHOICE *word_choice,
00462 WERD_CHOICE *raw_choice
00463 );
00464 BOOL8 tess_acceptable_word(
00465 WERD_CHOICE *word_choice,
00466 WERD_CHOICE *raw_choice
00467 );
00469 void apply_box_testing(BLOCK_LIST *block_list);
00470 void apply_boxes(const STRING& fname,
00471 BLOCK_LIST *block_list
00472 );
00473
// boxes is passed together with box_cnt.
// NOTE(review): presumably boxes points at an array of box_cnt TBOX entries -
// confirm in the definition.
00474 int Boxes2BlockList(int box_cnt, TBOX *boxes, BLOCK_LIST *block_list,
00475 bool right2left);
// Three blob comparison overload-like variants returning float: TBLOB with
// TEXTROW, PBLOB with DENORM, and PBLOB with ROW.
00477 float compare_tess_blobs(TBLOB *blob1,
00478 TEXTROW *row1,
00479 TBLOB *blob2,
00480 TEXTROW *row2);
00482 float compare_bln_blobs(
00483 PBLOB *blob1,
00484 DENORM *denorm1,
00485 PBLOB *blob2,
00486 DENORM *denorm2);
00487 float compare_blobs(
00488 PBLOB *blob1,
00489 ROW *row1,
00490 PBLOB *blob2,
00491 ROW *row2);
// The BLOCK* and WERD* parameters are left unnamed in this declaration.
00492 BOOL8 compare_blob_pairs(
00493 BLOCK *,
00494 ROW *row,
00495 WERD *,
00496 PBLOB *blob
00497 );
00499 void check_block_occ(WERD_RES *word_res);
00500
// Configuration variables declared through the BOOL_VAR_H / INT_VAR_H /
// STRING_VAR_H macros. The macros are not defined in this header (presumably
// they come from varable.h - confirm). Each invocation passes a variable
// name, a value and a description string; the value looks like the default.
00502 BOOL_VAR_H(tessedit_resegment_from_boxes, false,
00503 "Take segmentation and labeling from box file");
00504 BOOL_VAR_H(tessedit_train_from_boxes, false,
00505 "Generate training data from boxed chars");
// NOTE(review): "itermediate" in the description below looks like a typo for
// "intermediate"; it is left untouched here because it is a string literal.
00506 BOOL_VAR_H(tessedit_dump_pageseg_images, false,
00507 "Dump itermediate images made during page segmentation");
// NOTE(review): the mode list in the description goes 0-4 and then 6, with no
// entry for 5 - confirm against the PageSegMode enum the string refers to.
00508 INT_VAR_H(tessedit_pageseg_mode, 2,
00509 "Page seg mode: 0=auto, 1=col, 2=block, 3=line, 4=word, 6=char"
00510 " (Values from PageSegMode enum in baseapi.h)");
00511 INT_VAR_H(tessedit_accuracyvspeed, 0,
00512 "Accuracy V Speed tradeoff: 0 fastest, 100 most accurate"
00513 " (Values from AccuracyVSpeed enum in baseapi.h)");
00514 BOOL_VAR_H(tessedit_train_from_boxes_word_level, false,
00515 "Generate training data from boxed chars at word level.");
00516 STRING_VAR_H(tessedit_char_blacklist, "",
00517 "Blacklist of chars not to recognize");
00518 STRING_VAR_H(tessedit_char_whitelist, "",
00519 "Whitelist of chars to recognize");
00520 BOOL_VAR_H(global_tessedit_ambigs_training, false,
00521 "Perform training for ambiguities");
// init_ambigs_training returns a FILE*, and the two functions after it accept
// a FILE* output_file.
// NOTE(review): presumably the returned handle is the one passed on as
// output_file - confirm at the call sites.
00523 FILE *init_ambigs_training(const STRING &fname);
00524 void ambigs_training_segmented(const STRING &fname,
00525 PAGE_RES *page_res,
00526 volatile ETEXT_DESC *monitor,
00527 FILE *output_file);
00528 void ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
00529 const char *label,
00530 FILE *output_file);
00531 private:
// Exposed through pix_binary() (read) and mutable_pix_binary() (write slot).
00532 Pix* pix_binary_;
// No accessor for deskew_ is declared in this class.
00533 FCOORD deskew_;
// Exposed read-only through reskew().
00534 FCOORD reskew_;
// No accessor for hindi_image_ is declared in this class.
00535 bool hindi_image_;
00536 };
00537
00538 } // namespace tesseract
00539
00540
00541 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__