Tesseract 3.01
|
#include <tesseractclass.h>
Public Member Functions | ||||||||||
Tesseract () | ||||||||||
~Tesseract () | ||||||||||
void | Clear () | |||||||||
const FCOORD & | reskew () const | |||||||||
Pix ** | mutable_pix_binary () | |||||||||
Pix * | pix_binary () const | |||||||||
Pix * | pix_grey () const | |||||||||
void | set_pix_grey (Pix *grey_pix) | |||||||||
int | ImageWidth () const | |||||||||
int | ImageHeight () const | |||||||||
const ShiroRekhaSplitter & | splitter () const | |||||||||
ShiroRekhaSplitter * | mutable_splitter () | |||||||||
const Textord & | textord () const | |||||||||
Textord * | mutable_textord () | |||||||||
bool | right_to_left () const | |||||||||
void | SetBlackAndWhitelist () | |||||||||
void | PrepareForPageseg () | |||||||||
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | PrepareForCubeOCR () | |||||||||
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | SetupWordScripts (BLOCK_LIST *blocks) | |||||||||
int | AutoPageSeg (int resolution, bool single_column, bool osd, bool only_osd, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) | |||||||||
void | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) | |||||||||
void | classify_word_pass1 (WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) | |||||||||
void | fix_rep_char (PAGE_RES_IT *page_res_it) | |||||||||
void | ExplodeRepeatedWord (BLOB_CHOICE *best_choice, PAGE_RES_IT *page_res_it) | |||||||||
UNICHAR_ID | BothQuotes (UNICHAR_ID id1, UNICHAR_ID id2) | |||||||||
void | fix_quotes (WERD_RES *word_res, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const char *s, const char *lengths) | |||||||||
void | match_word_pass2 (WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | classify_word_pass2 (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) | |||||||||
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
BOOL8 | recog_interactive (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
UNICHAR_ID | BothHyphens (UNICHAR_ID id1, UNICHAR_ID id2) | |||||||||
bool | HyphenBoxesOverlap (const TBOX &box1, const TBOX &box2) | |||||||||
void | fix_hyphens (WERD_RES *word_res, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
void | set_word_fonts (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
void | font_recognition_pass (PAGE_RES_IT &page_res_it) | |||||||||
BOOL8 | check_debug_pt (WERD_RES *word, int location) | |||||||||
bool | init_cube_objects (bool load_combiner, TessdataManager *tessdata_manager) | |||||||||
void | run_cube (PAGE_RES *page_res) | |||||||||
void | cube_recognize (CubeObject *cube_obj, PAGE_RES_IT *page_res_it) | |||||||||
void | fill_werd_res (const BoxWord &cube_box_word, WERD_CHOICE *cube_werd_choice, const char *cube_best_str, PAGE_RES_IT *page_res_it) | |||||||||
bool | extract_cube_state (CubeObject *cube_obj, int *num_chars, Boxa **char_boxes, CharSamp ***char_samples) | |||||||||
bool | create_cube_box_word (Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord *box_word) | |||||||||
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) | |||||||||
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol) | |||||||||
void | set_unlv_suspects (WERD_RES *word) | |||||||||
UNICHAR_ID | get_rep_char (WERD_RES *word) | |||||||||
BOOL8 | acceptable_number_string (const char *s, const char *lengths) | |||||||||
inT16 | count_alphanums (const WERD_CHOICE &word) | |||||||||
inT16 | count_alphas (const WERD_CHOICE &word) | |||||||||
void | read_config_file (const char *filename, bool init_only) | |||||||||
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params) | |||||||||
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) | |||||||||
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language) | |||||||||
void | recognize_page (STRING &image_name) | |||||||||
void | end_tesseract () | |||||||||
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params) | |||||||||
SVMenuNode * | build_menu_new () | |||||||||
void | pgeditor_main (int width, int height, PAGE_RES *page_res) | |||||||||
void | process_image_event (const SVEvent &event) | |||||||||
BOOL8 | process_cmd_win_event (inT32 cmd_event, char *new_value) | |||||||||
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(BLOCK *block, ROW *row, WERD_RES *word_res)) | |||||||||
BOOL8 | word_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_bln_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_blank_and_set_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_set_display (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
BOOL8 | word_dumper (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
void | make_reject_map (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices, ROW *row, inT16 pass) | |||||||||
BOOL8 | one_ell_conflict (WERD_RES *word_res, BOOL8 update_map) | |||||||||
inT16 | first_alphanum_index (const char *word, const char *word_lengths) | |||||||||
inT16 | first_alphanum_offset (const char *word, const char *word_lengths) | |||||||||
inT16 | alpha_count (const char *word, const char *word_lengths) | |||||||||
BOOL8 | word_contains_non_1_digit (const char *word, const char *word_lengths) | |||||||||
void | dont_allow_1Il (WERD_RES *word) | |||||||||
inT16 | count_alphanums (WERD_RES *word) | |||||||||
void | flip_0O (WERD_RES *word) | |||||||||
BOOL8 | non_0_digit (UNICHAR_ID unichar_id) | |||||||||
BOOL8 | non_O_upper (UNICHAR_ID unichar_id) | |||||||||
BOOL8 | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) | |||||||||
void | nn_match_word (WERD_RES *word, ROW *row) | |||||||||
void | nn_recover_rejects (WERD_RES *word, ROW *row) | |||||||||
BOOL8 | test_ambig_word (WERD_RES *word) | |||||||||
void | set_done (WERD_RES *word, inT16 pass) | |||||||||
inT16 | safe_dict_word (const WERD_CHOICE &word) | |||||||||
void | flip_hyphens (WERD_RES *word) | |||||||||
void | reject_I_1_L (WERD_RES *word) | |||||||||
void | reject_edge_blobs (WERD_RES *word) | |||||||||
void | reject_mostly_rejects (WERD_RES *word) | |||||||||
BOOL8 | word_adaptable (WERD_RES *word, uinT16 mode) | |||||||||
void | recog_word_recursive (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
void | recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
void | split_and_recog_word (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) | |||||||||
inT16 | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
void | dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) | |||||||||
GARBAGE_LEVEL | garbage_word (WERD_RES *word, BOOL8 ok_dict_word) | |||||||||
BOOL8 | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word) | |||||||||
void | tilde_crunch (PAGE_RES_IT &page_res_it) | |||||||||
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) | |||||||||
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | convert_bad_unlv_chs (WERD_RES *word_res) | |||||||||
UNICHAR_ID | BothSpaces (UNICHAR_ID id1, UNICHAR_ID id2) | |||||||||
void | merge_tess_fails (WERD_RES *word_res) | |||||||||
void | tilde_delete (PAGE_RES_IT &page_res_it) | |||||||||
inT16 | word_blob_quality (WERD_RES *word, ROW *row) | |||||||||
void | word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count) | |||||||||
void | unrej_good_chs (WERD_RES *word, ROW *row) | |||||||||
inT16 | count_outline_errs (char c, inT16 outline_count) | |||||||||
inT16 | word_outline_errs (WERD_RES *word) | |||||||||
BOOL8 | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) | |||||||||
CRUNCH_MODE | word_deletable (WERD_RES *word, inT16 &delete_mode) | |||||||||
inT16 | failure_count (WERD_RES *word) | |||||||||
BOOL8 | noise_outlines (TWERD *word) | |||||||||
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(BLOCK *block, ROW *row, WERD_RES *word_res)) | |||||||||
void | tess_segment_pass1 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) | |||||||||
PAGE_RES * | SetupApplyBoxes (BLOCK_LIST *block_list) | |||||||||
void | MaximallyChopWord (BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
void | ReSegmentByClassification (PAGE_RES *page_res) | |||||||||
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) | |||||||||
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) | |||||||||
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) | |||||||||
void | TidyUp (PAGE_RES *page_res) | |||||||||
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) | |||||||||
void | CorrectClassifyWords (PAGE_RES *page_res) | |||||||||
void | ApplyBoxTraining (const STRING &filename, PAGE_RES *page_res) | |||||||||
int | CountMisfitTops (WERD_RES *word_res) | |||||||||
float | ComputeCompatibleXheight (WERD_RES *word_res) | |||||||||
FILE * | init_recog_training (const STRING &fname) | |||||||||
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) | |||||||||
void | ambigs_classify_and_output (WERD_RES *werd_res, ROW_RES *row_res, BLOCK_RES *block_res, const char *label, FILE *output_file) | |||||||||
CubeRecoContext * | GetCubeRecoContext () | |||||||||
eval_word_spacing() | ||||||||||
The basic measure is the number of characters in contextually confirmed words. (I.e. the word is done.) If all words are contextually confirmed the evaluation is deemed perfect. Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred. The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space. Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined. The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | ||||||||||
BOOL8 | digit_or_numeric_punct (WERD_RES *word, int char_position) | |||||||||
inT16 | eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
fix_sp_fp_word() | ||||||||||
Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words. | ||||||||||
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) | |||||||||
inT16 | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) | |||||||||
float | blob_noise_score (TBLOB *blob) | |||||||||
void | break_noisiest_blob_word (WERD_RES_LIST &words) | |||||||||
fix_fuzzy_spaces() | ||||||||||
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
| ||||||||||
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res) | |||||||||
uniformly_spaced() | ||||||||||
Return true if one of the following is true:
| ||||||||||
BOOL8 | uniformly_spaced (WERD_RES *word) | |||||||||
BOOL8 | fixspace_thinks_word_done (WERD_RES *word) | |||||||||
tess_add_doc_word | ||||||||||
Add the given word to the document dictionary | ||||||||||
void | tess_add_doc_word (WERD_CHOICE *word_choice) | |||||||||
tess_segment_pass2 | ||||||||||
Segment a word using the pass2 conditions of the tess segmenter.
| ||||||||||
void | tess_segment_pass2 (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) | |||||||||
tess_acceptable_word | ||||||||||
| ||||||||||
BOOL8 | tess_acceptable_word (WERD_CHOICE *word_choice, WERD_CHOICE *raw_choice) | |||||||||
Public Attributes | ||||||||||
bool | tessedit_resegment_from_boxes = false | |||||||||
bool | tessedit_resegment_from_line_boxes = false | |||||||||
bool | tessedit_train_from_boxes = false | |||||||||
bool | tessedit_make_boxes_from_boxes = false | |||||||||
bool | tessedit_dump_pageseg_images = false | |||||||||
int | tessedit_pageseg_mode = PSM_SINGLE_BLOCK | |||||||||
int | tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY | |||||||||
char * | tessedit_char_blacklist = "" | |||||||||
char * | tessedit_char_whitelist = "" | |||||||||
bool | tessedit_ambigs_training = false | |||||||||
int | pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT | |||||||||
int | ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT | |||||||||
char * | tessedit_write_params_to_file = "" | |||||||||
bool | tessedit_adapt_to_char_fragments = true | |||||||||
bool | tessedit_adaption_debug = false | |||||||||
int | applybox_debug = 1 | |||||||||
int | applybox_page = 0 | |||||||||
char * | applybox_exposure_pattern = ".exp" | |||||||||
bool | applybox_learn_chars_and_char_frags_mode = false | |||||||||
bool | applybox_learn_ngrams_mode = false | |||||||||
bool | tessedit_draw_outwords = false | |||||||||
bool | tessedit_training_tess = false | |||||||||
bool | tessedit_dump_choices = false | |||||||||
bool | tessedit_fix_fuzzy_spaces = true | |||||||||
bool | tessedit_unrej_any_wd = false | |||||||||
bool | tessedit_fix_hyphens = true | |||||||||
bool | tessedit_redo_xheight = true | |||||||||
bool | tessedit_enable_doc_dict = true | |||||||||
bool | tessedit_debug_fonts = false | |||||||||
bool | tessedit_debug_block_rejection = false | |||||||||
int | debug_x_ht_level = 0 | |||||||||
bool | debug_acceptable_wds = false | |||||||||
char * | chs_leading_punct = "('`\"" | |||||||||
char * | chs_trailing_punct1 = ").,;:?!" | |||||||||
char * | chs_trailing_punct2 = ")'`\"" | |||||||||
double | quality_rej_pc = 0.08 | |||||||||
double | quality_blob_pc = 0.0 | |||||||||
double | quality_outline_pc = 1.0 | |||||||||
double | quality_char_pc = 0.95 | |||||||||
int | quality_min_initial_alphas_reqd = 2 | |||||||||
bool | tessedit_tess_adapt_to_rejmap = false | |||||||||
int | tessedit_tess_adaption_mode = 0x27 | |||||||||
bool | tessedit_minimal_rej_pass1 = false | |||||||||
bool | tessedit_test_adaption = false | |||||||||
bool | tessedit_matcher_log = false | |||||||||
int | tessedit_test_adaption_mode = 3 | |||||||||
bool | save_best_choices = false | |||||||||
bool | test_pt = false | |||||||||
double | test_pt_x = 99999.99 | |||||||||
double | test_pt_y = 99999.99 | |||||||||
int | cube_debug_level = 1 | |||||||||
char * | outlines_odd = "%| " | |||||||||
char * | outlines_2 = "ij!?%\":;" | |||||||||
bool | docqual_excuse_outline_errs = false | |||||||||
bool | tessedit_good_quality_unrej = true | |||||||||
bool | tessedit_use_reject_spaces = true | |||||||||
double | tessedit_reject_doc_percent = 65.00 | |||||||||
double | tessedit_reject_block_percent = 45.00 | |||||||||
double | tessedit_reject_row_percent = 40.00 | |||||||||
double | tessedit_whole_wd_rej_row_percent = 70.00 | |||||||||
bool | tessedit_preserve_blk_rej_perfect_wds = true | |||||||||
bool | tessedit_preserve_row_rej_perfect_wds = true | |||||||||
bool | tessedit_dont_blkrej_good_wds = false | |||||||||
bool | tessedit_dont_rowrej_good_wds = false | |||||||||
int | tessedit_preserve_min_wd_len = 2 | |||||||||
bool | tessedit_row_rej_good_docs = true | |||||||||
double | tessedit_good_doc_still_rowrej_wd = 1.1 | |||||||||
bool | tessedit_reject_bad_qual_wds = true | |||||||||
bool | tessedit_debug_doc_rejection = false | |||||||||
bool | tessedit_debug_quality_metrics = false | |||||||||
bool | bland_unrej = false | |||||||||
double | quality_rowrej_pc = 1.1 | |||||||||
bool | unlv_tilde_crunching = true | |||||||||
bool | crunch_early_merge_tess_fails = true | |||||||||
bool | crunch_early_convert_bad_unlv_chs = false | |||||||||
double | crunch_terrible_rating = 80.0 | |||||||||
bool | crunch_terrible_garbage = true | |||||||||
double | crunch_poor_garbage_cert = -9.0 | |||||||||
double | crunch_poor_garbage_rate = 60 | |||||||||
double | crunch_pot_poor_rate = 40 | |||||||||
double | crunch_pot_poor_cert = -8.0 | |||||||||
bool | crunch_pot_garbage = true | |||||||||
double | crunch_del_rating = 60 | |||||||||
double | crunch_del_cert = -10.0 | |||||||||
double | crunch_del_min_ht = 0.7 | |||||||||
double | crunch_del_max_ht = 3.0 | |||||||||
double | crunch_del_min_width = 3.0 | |||||||||
double | crunch_del_high_word = 1.5 | |||||||||
double | crunch_del_low_word = 0.5 | |||||||||
double | crunch_small_outlines_size = 0.6 | |||||||||
int | crunch_rating_max = 10 | |||||||||
int | crunch_pot_indicators = 1 | |||||||||
bool | crunch_leave_ok_strings = true | |||||||||
bool | crunch_accept_ok = true | |||||||||
bool | crunch_leave_accept_strings = false | |||||||||
bool | crunch_include_numerals = false | |||||||||
int | crunch_leave_lc_strings = 4 | |||||||||
int | crunch_leave_uc_strings = 4 | |||||||||
int | crunch_long_repetitions = 3 | |||||||||
int | crunch_debug = 0 | |||||||||
int | fixsp_non_noise_limit = 1 | |||||||||
double | fixsp_small_outlines_size = 0.28 | |||||||||
bool | tessedit_prefer_joined_punct = false | |||||||||
int | fixsp_done_mode = 1 | |||||||||
int | debug_fix_space_level = 0 | |||||||||
char * | numeric_punctuation = ".," | |||||||||
int | x_ht_acceptance_tolerance = 8 | |||||||||
int | x_ht_min_change = 8 | |||||||||
bool | tessedit_write_block_separators = false | |||||||||
bool | tessedit_write_rep_codes = false | |||||||||
bool | tessedit_write_unlv = false | |||||||||
bool | tessedit_create_hocr = false | |||||||||
char * | unrecognised_char = "|" | |||||||||
int | suspect_level = 99 | |||||||||
int | suspect_space_level = 100 | |||||||||
int | suspect_short_words = 2 | |||||||||
bool | suspect_constrain_1Il = false | |||||||||
double | suspect_rating_per_ch = 999.9 | |||||||||
double | suspect_accept_rating = -999.9 | |||||||||
bool | tessedit_minimal_rejection = false | |||||||||
bool | tessedit_zero_rejection = false | |||||||||
bool | tessedit_word_for_word = false | |||||||||
bool | tessedit_zero_kelvin_rejection = false | |||||||||
bool | tessedit_consistent_reps = true | |||||||||
int | tessedit_reject_mode = 0 | |||||||||
int | tessedit_ok_mode = 5 | |||||||||
bool | tessedit_rejection_debug = false | |||||||||
bool | tessedit_flip_0O = true | |||||||||
double | tessedit_lower_flip_hyphen = 1.5 | |||||||||
double | tessedit_upper_flip_hyphen = 1.8 | |||||||||
bool | rej_trust_doc_dawg = false | |||||||||
bool | rej_1Il_use_dict_word = false | |||||||||
bool | rej_1Il_trust_permuter_type = true | |||||||||
bool | rej_use_tess_accepted = true | |||||||||
bool | rej_use_tess_blanks = true | |||||||||
bool | rej_use_good_perm = true | |||||||||
bool | rej_use_sensible_wd = false | |||||||||
bool | rej_alphas_in_number_perm = false | |||||||||
double | rej_whole_of_mostly_reject_word_fract = 0.85 | |||||||||
int | tessedit_image_border = 2 | |||||||||
char * | ok_repeated_ch_non_alphanum_wds = "-?*\075" | |||||||||
char * | conflict_set_I_l_1 = "Il1[]" | |||||||||
int | min_sane_x_ht_pixels = 8 | |||||||||
bool | tessedit_create_boxfile = false | |||||||||
int | tessedit_page_number = -1 | |||||||||
bool | tessedit_write_images = false | |||||||||
bool | interactive_mode = false | |||||||||
char * | file_type = ".tif" | |||||||||
bool | tessedit_override_permuter = true | |||||||||
int | tessdata_manager_debug_level = 0 | |||||||||
double | min_orientation_margin = 12.0 |
tesseract::Tesseract::Tesseract | ( | ) |
tesseract::Tesseract::~Tesseract | ( | ) |
BOOL8 tesseract::Tesseract::acceptable_number_string | ( | const char * | s, |
const char * | lengths | ||
) |
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string | ( | const char * | s, |
const char * | lengths | ||
) |
inT16 tesseract::Tesseract::alpha_count | ( | const char * | word, |
const char * | word_lengths | ||
) |
void tesseract::Tesseract::ambigs_classify_and_output | ( | WERD_RES * | werd_res, |
ROW_RES * | row_res, | ||
BLOCK_RES * | block_res, | ||
const char * | label, | ||
FILE * | output_file | ||
) |
PAGE_RES * tesseract::Tesseract::ApplyBoxes | ( | const STRING & | fname, |
bool | find_segmentation, | ||
BLOCK_LIST * | block_list | ||
) |
int tesseract::Tesseract::AutoPageSeg | ( | int | resolution, |
bool | single_column, | ||
bool | osd, | ||
bool | only_osd, | ||
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If single_column is true, then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If osd is true, then orientation and script detection is performed as well. If only_osd is true, then only orientation and script detection is performed. If osr is desired, the osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr.
float tesseract::Tesseract::blob_noise_score | ( | TBLOB * | blob | ) |
UNICHAR_ID tesseract::Tesseract::BothHyphens | ( | UNICHAR_ID | id1, |
UNICHAR_ID | id2 | ||
) |
UNICHAR_ID tesseract::Tesseract::BothQuotes | ( | UNICHAR_ID | id1, |
UNICHAR_ID | id2 | ||
) |
UNICHAR_ID tesseract::Tesseract::BothSpaces | ( | UNICHAR_ID | id1, |
UNICHAR_ID | id2 | ||
) |
void tesseract::Tesseract::break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
SVMenuNode * tesseract::Tesseract::build_menu_new | ( | ) |
classify_word_pass1
Baseline normalize the word and pass it to Tess.
classify_word_pass2
Control what to do with the word in pass 2
void tesseract::Tesseract::Clear | ( | ) |
float tesseract::Tesseract::ComputeCompatibleXheight | ( | WERD_RES * | word_res | ) |
void tesseract::Tesseract::convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
bool tesseract::Tesseract::ConvertStringToUnichars | ( | const char * | utf8, |
GenericVector< UNICHAR_ID > * | class_ids | ||
) |
void tesseract::Tesseract::CorrectClassifyWords | ( | PAGE_RES * | page_res | ) |
inT16 tesseract::Tesseract::count_alphanums | ( | const WERD_CHOICE & | word | ) |
inT16 tesseract::Tesseract::count_alphas | ( | const WERD_CHOICE & | word | ) |
int tesseract::Tesseract::CountMisfitTops | ( | WERD_RES * | word_res | ) |
bool tesseract::Tesseract::create_cube_box_word | ( | Boxa * | char_boxes, |
int | num_chars, | ||
TBOX | word_box, | ||
BoxWord * | box_word | ||
) |
void tesseract::Tesseract::cube_recognize | ( | CubeObject * | cube_obj, |
PAGE_RES_IT * | page_res_it | ||
) |
debug_word
Process the whole image, but load word_config_ for the selected word(s).
void tesseract::Tesseract::do_re_display | ( | BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res) | word_painter | ) |
Redisplay page
void tesseract::Tesseract::doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
void tesseract::Tesseract::dont_allow_1Il | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::dump_words | ( | WERD_RES_LIST & | perm, |
inT16 | score, | ||
inT16 | mode, | ||
BOOL8 | improved | ||
) |
void tesseract::Tesseract::end_tesseract | ( | ) |
inT16 tesseract::Tesseract::eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
void tesseract::Tesseract::ExplodeRepeatedWord | ( | BLOB_CHOICE * | best_choice, |
PAGE_RES_IT * | page_res_it | ||
) |
bool tesseract::Tesseract::extract_cube_state | ( | CubeObject * | cube_obj, |
int * | num_chars, | ||
Boxa ** | char_boxes, | ||
CharSamp *** | char_samples | ||
) |
void tesseract::Tesseract::fill_werd_res | ( | const BoxWord & | cube_box_word, |
WERD_CHOICE * | cube_werd_choice, | ||
const char * | cube_best_str, | ||
PAGE_RES_IT * | page_res_it | ||
) |
bool tesseract::Tesseract::FindSegmentation | ( | const GenericVector< UNICHAR_ID > & | target_text, |
WERD_RES * | word_res | ||
) |
inT16 tesseract::Tesseract::first_alphanum_index | ( | const char * | word, |
const char * | word_lengths | ||
) |
inT16 tesseract::Tesseract::first_alphanum_offset | ( | const char * | word, |
const char * | word_lengths | ||
) |
void tesseract::Tesseract::fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
void tesseract::Tesseract::fix_fuzzy_spaces | ( | ETEXT_DESC * | monitor, |
inT32 | word_count, | ||
PAGE_RES * | page_res | ||
) |
void tesseract::Tesseract::fix_hyphens | ( | WERD_RES * | word_res, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
fix_hyphens
Change pairs of hyphens to a single hyphen if the bounding boxes touch. Typically a long dash which has been segmented.
void tesseract::Tesseract::fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
void tesseract::Tesseract::fix_quotes | ( | WERD_RES * | word_res, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
fix_quotes
Change pairs of quotes to double quotes.
void tesseract::Tesseract::fix_rep_char | ( | PAGE_RES_IT * | page_res_it | ) |
fix_rep_char() The word is a repeated char. (Leader.) Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.
void tesseract::Tesseract::flip_0O | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::flip_hyphens | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::font_recognition_pass | ( | PAGE_RES_IT & | page_res_it | ) |
font_recognition_pass
Smooth the fonts for the document.
inT16 tesseract::Tesseract::fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
GARBAGE_LEVEL tesseract::Tesseract::garbage_word | ( | WERD_RES * | word, |
BOOL8 | ok_dict_word | ||
) |
UNICHAR_ID tesseract::Tesseract::get_rep_char | ( | WERD_RES * | word | ) |
CubeRecoContext* tesseract::Tesseract::GetCubeRecoContext | ( | ) | [inline] |
int tesseract::Tesseract::ImageHeight | ( | ) | const [inline] |
int tesseract::Tesseract::ImageWidth | ( | ) | const [inline] |
bool tesseract::Tesseract::init_cube_objects | ( | bool | load_combiner, |
TessdataManager * | tessdata_manager | ||
) |
FILE * tesseract::Tesseract::init_recog_training | ( | const STRING & | fname | ) |
int tesseract::Tesseract::init_tesseract | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params | ||
) |
int tesseract::Tesseract::init_tesseract | ( | const char * | datapath, |
const char * | language, | ||
OcrEngineMode | oem | ||
) | [inline] |
bool tesseract::Tesseract::init_tesseract_lang_data | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params | ||
) |
int tesseract::Tesseract::init_tesseract_lm | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language | ||
) |
void tesseract::Tesseract::make_reject_map | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices, | ||
ROW * | row, | ||
inT16 | pass | ||
) |
match_word_pass2
Baseline normalize the word and pass it to Tess.
void tesseract::Tesseract::merge_tess_fails | ( | WERD_RES * | word_res | ) |
Pix** tesseract::Tesseract::mutable_pix_binary | ( | ) | [inline] |
ShiroRekhaSplitter* tesseract::Tesseract::mutable_splitter | ( | ) | [inline] |
Textord* tesseract::Tesseract::mutable_textord | ( | ) | [inline] |
BOOL8 tesseract::Tesseract::non_0_digit | ( | UNICHAR_ID | unichar_id | ) |
BOOL8 tesseract::Tesseract::non_O_upper | ( | UNICHAR_ID | unichar_id | ) |
void tesseract::Tesseract::output_pass | ( | PAGE_RES_IT & | page_res_it, |
const TBOX * | target_word_box | ||
) |
void tesseract::Tesseract::pgeditor_main | ( | int | width, |
int | height, | ||
PAGE_RES * | page_res | ||
) |
Top level editor operation: Set up a new window and a corresponding event handler
Pix* tesseract::Tesseract::pix_binary | ( | ) | const [inline] |
Pix* tesseract::Tesseract::pix_grey | ( | ) | const [inline] |
BOOL8 tesseract::Tesseract::potential_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level, | ||
BOOL8 | ok_dict_word | ||
) |
void tesseract::Tesseract::PrepareForCubeOCR | ( | ) |
void tesseract::Tesseract::PrepareForPageseg | ( | ) |
void tesseract::Tesseract::PrepareForTessOCR | ( | BLOCK_LIST * | block_list, |
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
void tesseract::Tesseract::process_image_event | ( | const SVEvent & | event | ) |
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
void tesseract::Tesseract::process_selected_words | ( | PAGE_RES * | page_res, |
TBOX & | selection_box, | ||
BOOL8(tesseract::Tesseract::*)(BLOCK *block, ROW *row, WERD_RES *word_res) | word_processor | ||
) |
bool tesseract::Tesseract::ProcessTargetWord | ( | const TBOX & | word_box, |
const TBOX & | target_word_box, | ||
const char * | word_config, | ||
int | pass | ||
) |
void tesseract::Tesseract::quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
void tesseract::Tesseract::read_config_file | ( | const char * | filename, |
bool | init_only | ||
) |
void tesseract::Tesseract::recog_all_words | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
int | dopasses | ||
) |
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification).
page_res | page structure |
monitor | progress monitor |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
recog_interactive
Recognize a single word in interactive mode.
block | block |
row | row of word |
word | word to recognise |
void tesseract::Tesseract::recog_training_segmented | ( | const STRING & | fname, |
PAGE_RES * | page_res, | ||
volatile ETEXT_DESC * | monitor, | ||
FILE * | output_file | ||
) |
void tesseract::Tesseract::recog_word | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
void tesseract::Tesseract::recog_word_recursive | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
void tesseract::Tesseract::recognize_page | ( | STRING & | image_name | ) |
void tesseract::Tesseract::reject_edge_blobs | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::reject_I_1_L | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::reject_mostly_rejects | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::ReportFailedBox | ( | int | boxfile_lineno, |
TBOX | box, | ||
const char * | box_ch, | ||
const char * | err_msg | ||
) |
void tesseract::Tesseract::ReportXhtFixResult | ( | bool | accept_new_word, |
float | new_x_ht, | ||
WERD_RES * | word, | ||
WERD_RES * | new_word | ||
) |
void tesseract::Tesseract::ReSegmentByClassification | ( | PAGE_RES * | page_res | ) |
bool tesseract::Tesseract::ResegmentCharBox | ( | PAGE_RES * | page_res, |
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
bool tesseract::Tesseract::ResegmentWordBox | ( | BLOCK_LIST * | block_list, |
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
const FCOORD& tesseract::Tesseract::reskew | ( | ) | const [inline] |
bool tesseract::Tesseract::right_to_left | ( | ) | const [inline] |
void tesseract::Tesseract::run_cube | ( | PAGE_RES * | page_res | ) |
inT16 tesseract::Tesseract::safe_dict_word | ( | const WERD_CHOICE & | word | ) |
void tesseract::Tesseract::SearchForText | ( | const GenericVector< BLOB_CHOICE_LIST * > * | choices, |
int | choices_pos, | ||
int | choices_length, | ||
const GenericVector< UNICHAR_ID > & | target_text, | ||
int | text_index, | ||
float | rating, | ||
GenericVector< int > * | segmentation, | ||
float * | best_rating, | ||
GenericVector< int > * | best_segmentation | ||
) |
int tesseract::Tesseract::SegmentPage | ( | const STRING * | input_file, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.
void tesseract::Tesseract::set_pix_grey | ( | Pix * | grey_pix | ) | [inline] |
void tesseract::Tesseract::set_unlv_suspects | ( | WERD_RES * | word | ) |
void tesseract::Tesseract::set_word_fonts | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
set_word_fonts
Get the fonts for the word.
void tesseract::Tesseract::SetBlackAndWhitelist | ( | ) |
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes | ( | BLOCK_LIST * | block_list | ) |
void tesseract::Tesseract::SetupWordScripts | ( | BLOCK_LIST * | blocks | ) |
void tesseract::Tesseract::split_and_recog_word | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
const ShiroRekhaSplitter& tesseract::Tesseract::splitter | ( | ) | const [inline] |
BOOL8 tesseract::Tesseract::terrible_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level | ||
) |
BOOL8 tesseract::Tesseract::tess_acceptable_word | ( | WERD_CHOICE * | word_choice, |
WERD_CHOICE * | raw_choice | ||
) |
void tesseract::Tesseract::tess_add_doc_word | ( | WERD_CHOICE * | word_choice | ) |
void tesseract::Tesseract::tess_segment_pass1 | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
void tesseract::Tesseract::tess_segment_pass2 | ( | WERD_RES * | word, |
BLOB_CHOICE_LIST_CLIST * | blob_choices | ||
) |
const Textord& tesseract::Tesseract::textord | ( | ) | const [inline] |
void tesseract::Tesseract::TidyUp | ( | PAGE_RES * | page_res | ) |
void tesseract::Tesseract::tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
void tesseract::Tesseract::tilde_delete | ( | PAGE_RES_IT & | page_res_it | ) |
void tesseract::Tesseract::unrej_good_quality_words | ( | PAGE_RES_IT & | page_res_it | ) |
BOOL8 tesseract::Tesseract::word_blank_and_set_display | ( | BLOCK * | block, |
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Normalize word and display in word window
void tesseract::Tesseract::word_char_quality | ( | WERD_RES * | word, |
ROW * | row, | ||
inT16 * | match_count, | ||
inT16 * | accepted_match_count | ||
) |
BOOL8 tesseract::Tesseract::word_contains_non_1_digit | ( | const char * | word, |
const char * | word_lengths | ||
) |
CRUNCH_MODE tesseract::Tesseract::word_deletable | ( | WERD_RES * | word, |
inT16 & | delete_mode | ||
) |
word_display() Word Processor
Display a word according to its display modes
Dump members to the debug window
word_set_display() Word processor
Display word according to current display mode settings
void tesseract::Tesseract::write_results | ( | PAGE_RES_IT & | page_res_it, |
char | newline_type, | ||
BOOL8 | force_eol | ||
) |
"Debug level"
char* tesseract::Tesseract::applybox_exposure_pattern = ".exp" |
"Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"
"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."
bool tesseract::Tesseract::applybox_learn_ngrams_mode = false |
"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."
"Page number to apply boxes from"
bool tesseract::Tesseract::bland_unrej = false |
"unrej potential with no checks"
char* tesseract::Tesseract::chs_leading_punct = "('`\"" |
"Leading punctuation"
char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!" |
"1st Trailing punctuation"
char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\"" |
"2nd Trailing punctuation"
char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]" |
"Il1 conflict set"
bool tesseract::Tesseract::crunch_accept_ok = true |
"Use acceptability in okstring"
"As it says"
double tesseract::Tesseract::crunch_del_cert = -10.0 |
"POTENTIAL crunch cert lt this"
double tesseract::Tesseract::crunch_del_high_word = 1.5 |
"Del if word gt xht x this above bl"
double tesseract::Tesseract::crunch_del_low_word = 0.5 |
"Del if word gt xht x this below bl"
double tesseract::Tesseract::crunch_del_max_ht = 3.0 |
"Del if word ht gt xht x this"
double tesseract::Tesseract::crunch_del_min_ht = 0.7 |
"Del if word ht lt xht x this"
double tesseract::Tesseract::crunch_del_min_width = 3.0 |
"Del if word width lt xht x this"
double tesseract::Tesseract::crunch_del_rating = 60 |
"POTENTIAL crunch rating lt this"
"Take out ~^ early?"
"Before word crunch?"
bool tesseract::Tesseract::crunch_include_numerals = false |
"Fiddle alpha figures"
bool tesseract::Tesseract::crunch_leave_accept_strings = false |
"Dont pot crunch sensible strings"
"Dont crunch words with long lower case strings"
bool tesseract::Tesseract::crunch_leave_ok_strings = true |
"Dont touch sensible strings"
"Dont crunch words with long lower case strings"
"Crunch words with long repetitions"
double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0 |
"crunch garbage cert lt this"
double tesseract::Tesseract::crunch_poor_garbage_rate = 60 |
"crunch garbage rating lt this"
bool tesseract::Tesseract::crunch_pot_garbage = true |
"POTENTIAL crunch garbage"
"How many potential indicators needed"
double tesseract::Tesseract::crunch_pot_poor_cert = -8.0 |
"POTENTIAL crunch cert lt this"
double tesseract::Tesseract::crunch_pot_poor_rate = 40 |
"POTENTIAL crunch rating lt this"
"For adj length in rating per ch"
double tesseract::Tesseract::crunch_small_outlines_size = 0.6 |
"Small if lt xht x this"
bool tesseract::Tesseract::crunch_terrible_garbage = true |
"As it says"
double tesseract::Tesseract::crunch_terrible_rating = 80.0 |
"crunch rating lt this"
"Print cube debug info."
bool tesseract::Tesseract::debug_acceptable_wds = false |
"Dump word pass/fail chk"
"Contextual fixspace debug"
"Reestimate debug"
bool tesseract::Tesseract::docqual_excuse_outline_errs = false |
"Allow outline errs in unrejection?"
char* tesseract::Tesseract::file_type = ".tif" |
"Filename extension"
"What constitutes done for spacing"
"How many non-noise blbs either side?"
double tesseract::Tesseract::fixsp_small_outlines_size = 0.28 |
"Small if lt xht x this"
bool tesseract::Tesseract::interactive_mode = false |
"Run interactively?"
double tesseract::Tesseract::min_orientation_margin = 12.0 |
"Min acceptable orientation margin"
"Reject any x-ht lt or eq than this"
char* tesseract::Tesseract::numeric_punctuation = ".," |
"Punct. chs expected WITHIN numbers"
int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT |
"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."
char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075" |
"Allow NN to unrej"
char* tesseract::Tesseract::outlines_2 = "ij!?%\":;" |
"Non standard number of outlines"
char* tesseract::Tesseract::outlines_odd = "%| " |
"Non standard number of outlines"
int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT |
"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."
double tesseract::Tesseract::quality_blob_pc = 0.0 |
"good_quality_doc gte good blobs limit"
double tesseract::Tesseract::quality_char_pc = 0.95 |
"good_quality_doc gte good char limit"
"alphas in a good word"
double tesseract::Tesseract::quality_outline_pc = 1.0 |
"good_quality_doc lte outline error limit"
double tesseract::Tesseract::quality_rej_pc = 0.08 |
"good_quality_doc lte rejection limit"
double tesseract::Tesseract::quality_rowrej_pc = 1.1 |
"good_quality_doc gte good char limit"
"Dont double check"
bool tesseract::Tesseract::rej_1Il_use_dict_word = false |
"Use dictword test"
bool tesseract::Tesseract::rej_alphas_in_number_perm = false |
"Extend permuter check"
bool tesseract::Tesseract::rej_trust_doc_dawg = false |
"Use DOC dawg in 11l conf. detector"
bool tesseract::Tesseract::rej_use_good_perm = true |
"Individual rejection control"
bool tesseract::Tesseract::rej_use_sensible_wd = false |
"Extend permuter check"
bool tesseract::Tesseract::rej_use_tess_accepted = true |
"Individual rejection control"
bool tesseract::Tesseract::rej_use_tess_blanks = true |
"Individual rejection control"
"if >this fract"
bool tesseract::Tesseract::save_best_choices = false |
"Save the results of the recognition step" " (blob_choices) within the corresponding WERD_CHOICE"
double tesseract::Tesseract::suspect_accept_rating = -999.9 |
"Accept good rating limit"
bool tesseract::Tesseract::suspect_constrain_1Il = false |
"UNLV keep 1Il chars rejected"
int tesseract::Tesseract::suspect_level = 99 |
"Suspect marker level"
double tesseract::Tesseract::suspect_rating_per_ch = 999.9 |
"Dont touch bad rating limit"
"Dont Suspect dict wds longer than this"
"Min suspect level for rejecting spaces"
"Debug level for TessdataManager functions."
"Adapt to words that contain " " a character composed from fragments"
bool tesseract::Tesseract::tessedit_adaption_debug = false |
"Generate and print debug information for adaption"
bool tesseract::Tesseract::tessedit_ambigs_training = false |
"Perform training for ambiguities"
"Blacklist of chars not to recognize"
"Whitelist of chars to recognize"
bool tesseract::Tesseract::tessedit_consistent_reps = true |
"Force all rep chars the same"
bool tesseract::Tesseract::tessedit_create_boxfile = false |
"Output text with boxes"
bool tesseract::Tesseract::tessedit_create_hocr = false |
"Write .html hOCR output file"
bool tesseract::Tesseract::tessedit_debug_block_rejection = false |
"Block and Row stats"
bool tesseract::Tesseract::tessedit_debug_doc_rejection = false |
"Page stats"
bool tesseract::Tesseract::tessedit_debug_fonts = false |
"Output font info per char"
bool tesseract::Tesseract::tessedit_debug_quality_metrics = false |
"Output data to debug file"
bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false |
"Use word segmentation quality metric"
bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false |
"Use word segmentation quality metric"
bool tesseract::Tesseract::tessedit_draw_outwords = false |
"Draw output words"
bool tesseract::Tesseract::tessedit_dump_choices = false |
"Dump char choices"
bool tesseract::Tesseract::tessedit_dump_pageseg_images = false |
"Dump intermediate images made during page segmentation"
bool tesseract::Tesseract::tessedit_enable_doc_dict = true |
"Add words to the document dictionary"
bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true |
"Try to improve fuzzy spaces"
bool tesseract::Tesseract::tessedit_fix_hyphens = true |
"Crunch double hyphens?"
bool tesseract::Tesseract::tessedit_flip_0O = true |
"Contextual 0O O0 flips"
"rej good doc wd if more than this fraction rejected"
"Reduce rejection on good docs"
"Rej blbs near image edge limit"
double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5 |
"Aspect ratio dot/hyphen test"
bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false |
"Generate more boxes from boxed chars"
bool tesseract::Tesseract::tessedit_matcher_log = false |
"Log matcher activity"
bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false |
"Do minimal rejection on pass 1 output"
bool tesseract::Tesseract::tessedit_minimal_rejection = false |
"Only reject tess failures"
int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_TESSERACT_ONLY |
"Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" " to loading and running only Tesseract (no Cube, no combiner)." " (Values from OcrEngineMode enum in tesseractclass.h)"
"Acceptance decision algorithm"
bool tesseract::Tesseract::tessedit_override_permuter = true |
"According to dict_word"
"-1 -> All pages, else specific page to process"
int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK |
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"
bool tesseract::Tesseract::tessedit_prefer_joined_punct = false |
"Reward punctuation joins"
"Only rej partially rejected words in block rejection"
"Only preserve wds longer than this"
"Only rej partially rejected words in row rejection"
bool tesseract::Tesseract::tessedit_redo_xheight = true |
"Check/Correct x-height"
"Reject all bad quality wds"
double tesseract::Tesseract::tessedit_reject_block_percent = 45.00 |
"%rej allowed before rej whole block"
double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00 |
"%rej allowed before rej whole doc"
"Rejection algorithm"
double tesseract::Tesseract::tessedit_reject_row_percent = 40.00 |
"%rej allowed before rej whole row"
bool tesseract::Tesseract::tessedit_rejection_debug = false |
"Adaption debug"
bool tesseract::Tesseract::tessedit_resegment_from_boxes = false |
"Take segmentation and labeling from box file"
"Conversion of word/line box file to char box file"
bool tesseract::Tesseract::tessedit_row_rej_good_docs = true |
"Apply row rejection to good docs"
bool tesseract::Tesseract::tessedit_tess_adapt_to_rejmap = false |
"Use reject map to control Tesseract adaption"
"Adaptation decision algorithm for tess"
bool tesseract::Tesseract::tessedit_test_adaption = false |
"Test adaption criteria"
"Adaptation decision algorithm for tess"
bool tesseract::Tesseract::tessedit_train_from_boxes = false |
"Generate training data from boxed chars"
bool tesseract::Tesseract::tessedit_training_tess = false |
"Call Tess to learn blobs"
bool tesseract::Tesseract::tessedit_unrej_any_wd = false |
"Dont bother with word plausibility"
double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8 |
"Aspect ratio dot/hyphen test"
bool tesseract::Tesseract::tessedit_use_reject_spaces = true |
"Reject spaces?"
double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00 |
"Number of row rejects in whole word rejects" " which prevents whole row rejection"
bool tesseract::Tesseract::tessedit_word_for_word = false |
"Make output have exactly one word per WERD"
"Write block separators in output"
bool tesseract::Tesseract::tessedit_write_images = false |
"Capture the image from the IPE"
"Write all parameters to the given file."
bool tesseract::Tesseract::tessedit_write_rep_codes = false |
"Write repetition char code"
bool tesseract::Tesseract::tessedit_write_unlv = false |
"Write .unlv output file"
bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false |
"Dont reject ANYTHING AT ALL"
bool tesseract::Tesseract::tessedit_zero_rejection = false |
"Dont reject ANYTHING"
bool tesseract::Tesseract::test_pt = false |
"Test for point"
double tesseract::Tesseract::test_pt_x = 99999.99 |
"xcoord"
double tesseract::Tesseract::test_pt_y = 99999.99 |
"ycoord"
bool tesseract::Tesseract::unlv_tilde_crunching = true |
"Mark v.bad words for tilde crunch"
char* tesseract::Tesseract::unrecognised_char = "|" |
"Output char for unidentified blobs"
"Max allowed deviation of blob top outside of font data"
"Min change in xht before actually trying it"