44 #define PERFECT_WERDS 999 55 static int c_blob_comparator(
59 const C_BLOB *blob1 = *
reinterpret_cast<const C_BLOB* const*
>(blob1p);
60 const C_BLOB *blob2 = *
reinterpret_cast<const C_BLOB* const*
>(blob2p);
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
83 WERD_RES_LIST fuzzy_space_words;
85 bool prevent_null_wd_fixsp;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91 block_res_it.forward()) {
92 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94 row_res_it.forward()) {
95 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96 while (!word_res_it_from.at_last()) {
97 word_res = word_res_it_from.data();
98 while (!word_res_it_from.at_last() &&
100 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
101 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
103 block_res_it.data()->block);
104 word_res = word_res_it_from.forward();
106 if (monitor !=
nullptr) {
108 monitor->
progress = 90 + 5 * word_index / word_count;
110 (monitor->
cancel !=
nullptr &&
116 if (!word_res_it_from.at_last()) {
117 word_res_it_to = word_res_it_from;
118 prevent_null_wd_fixsp =
122 word_res_it_to.forward();
124 if (monitor !=
nullptr) {
126 monitor->
progress = 90 + 5 * word_index / word_count;
128 (monitor->
cancel !=
nullptr &&
132 while (!word_res_it_to.at_last () &&
133 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
134 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
138 prevent_null_wd_fixsp =
true;
139 word_res = word_res_it_to.forward();
144 prevent_null_wd_fixsp =
true;
145 if (prevent_null_wd_fixsp) {
146 word_res_it_from = word_res_it_to;
148 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
151 row_res_it.data()->row,
152 block_res_it.data()->block);
153 new_length = fuzzy_space_words.length();
154 word_res_it_from.add_list_before(&fuzzy_space_words);
156 !word_res_it_from.at_last() && new_length > 0;
158 word_res_it_from.forward();
165 block_res_it.data()->block);
176 WERD_RES_LIST current_perm;
177 int16_t current_score;
178 bool improved =
false;
181 dump_words(best_perm, best_score, 1, improved);
186 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
189 dump_words(current_perm, current_score, 2, improved);
190 if (current_score > best_score) {
193 best_score = current_score;
199 dump_words(best_perm, best_score, 3, improved);
205 WERD_RES_IT src_it(&src_list);
206 WERD_RES_IT new_it(&new_list);
210 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
211 src_wd = src_it.data();
216 new_it.add_after_then_move(new_wd);
225 WERD_RES_IT word_it(&words);
230 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231 word = word_it.data();
233 WordData word_data(block, row, word);
267 WERD_RES_IT word_res_it(&word_res_list);
268 int16_t total_score = 0;
269 int16_t word_count = 0;
270 int16_t done_word_count = 0;
275 int16_t prev_word_score = 0;
276 bool prev_word_done =
false;
277 bool prev_char_1 =
false;
278 bool prev_char_digit =
false;
279 bool current_char_1 =
false;
280 bool current_word_ok_so_far;
281 STRING punct_chars =
"!\"`',.:;";
282 bool prev_char_punct =
false;
283 bool current_char_punct =
false;
284 bool word_done =
false;
287 word = word_res_it.data();
291 total_score += prev_word_score;
296 prev_char_digit =
false;
297 prev_word_done =
false;
305 current_word_ok_so_far =
false;
307 (prev_char_digit && (
313 total_score += prev_word_score;
316 current_word_ok_so_far = word_done;
319 if (current_word_ok_so_far) {
320 prev_word_done =
true;
321 prev_word_score = word_len;
323 prev_word_done =
false;
329 for (i = 0, prev_char_1 =
false; i < word_len; i++) {
331 if (prev_char_1 || (current_char_1 && (i > 0)))
333 prev_char_1 = current_char_1;
339 for (i = 0, offset = 0, prev_char_punct =
false; i < word_len;
343 if (prev_char_punct || (current_char_punct && i > 0))
345 prev_char_punct = current_char_punct;
349 for (i = 0, offset = 0; i < word_len - 1;
358 word_res_it.forward();
359 }
while (word_res_it.data()->part_of_combo);
360 }
while (!word_res_it.at_first());
361 total_score += prev_word_score;
364 if (done_word_count == word_count)
374 for (i = 0, offset = 0; i < char_position;
400 WERD_RES_IT word_it(&words);
401 WERD_RES_IT prev_word_it(&words);
406 int16_t prev_right = -INT16_MAX;
409 int16_t min_gap = INT16_MAX;
411 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
412 word = word_it.data();
415 if (prev_right > -INT16_MAX) {
416 gap = box.
left() - prev_right;
420 prev_right = box.
right();
423 if (min_gap < INT16_MAX) {
424 prev_right = -INT16_MAX;
425 word_it.set_to_list(&words);
427 for (; (prev_right == -INT16_MAX) || !word_it.at_first();
429 word = word_it.data();
432 if (prev_right > -INT16_MAX) {
433 gap = box.
left() - prev_right;
434 if (gap <= min_gap) {
435 prev_word = prev_word_it.data();
441 copy_word =
new WERD;
442 *copy_word = *(prev_word->
word);
448 prev_word_it.add_before_then_move(combo);
455 delete word_it.extract();
464 prev_word_it = word_it;
467 prev_right = box.
right();
477 int16_t mode,
bool improved) {
478 WERD_RES_IT word_res_it(&perm);
483 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484 word_res_it.forward()) {
485 if (!word_res_it.data()->part_of_combo) {
487 word_res_it.data()->best_choice->unichar_string();
496 tprintf(
"EXTRACTED (%d): \"", score);
499 tprintf(
"TESTED (%d): \"", score);
502 tprintf(
"RETURNED (%d): \"", score);
506 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507 word_res_it.forward()) {
508 if (!word_res_it.data()->part_of_combo) {
510 word_res_it.data()->best_choice->unichar_string().string(),
511 static_cast<int>(word_res_it.data()->best_choice->permuter()));
515 }
else if (improved) {
517 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518 word_res_it.forward()) {
519 if (!word_res_it.data()->part_of_combo) {
521 word_res_it.data()->best_choice->unichar_string().string(),
522 static_cast<int>(word_res_it.data()->best_choice->permuter()));
565 WERD_RES_LIST sub_word_list;
566 WERD_RES_IT sub_word_list_it(&sub_word_list);
571 word_res = word_res_it.data();
583 tprintf(
"FP fixspace working on \"%s\"\n",
587 sub_word_list_it.add_after_stay_put(word_res_it.extract());
589 new_length = sub_word_list.length();
590 word_res_it.add_list_before(&sub_word_list);
591 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592 word_res_it.forward();
599 WERD_RES_IT best_perm_it(&best_perm);
600 WERD_RES_LIST current_perm;
601 WERD_RES_IT current_perm_it(¤t_perm);
603 int16_t current_score;
604 bool improved =
false;
608 dump_words(best_perm, best_score, 1, improved);
610 old_word_res = best_perm_it.data();
619 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
622 dump_words(current_perm, current_score, 2, improved);
623 if (current_score > best_score) {
626 best_score = current_score;
633 dump_words(best_perm, best_score, 3, improved);
643 WERD_RES_IT word_it(&words);
644 WERD_RES_IT worst_word_it;
645 float worst_noise_score = 9999;
646 int worst_blob_index = -1;
651 C_BLOB_IT rej_cblob_it;
652 C_BLOB_LIST new_blob_list;
653 C_BLOB_IT new_blob_it;
654 C_BLOB_IT new_rej_cblob_it;
656 int16_t start_of_noise_blob;
659 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
661 if (blob_index > -1 && worst_noise_score > noise_score) {
662 worst_noise_score = noise_score;
663 worst_blob_index = blob_index;
664 worst_word_it = word_it;
667 if (worst_blob_index < 0) {
674 word_res = worst_word_it.data();
678 new_blob_it.set_to_list(&new_blob_list);
680 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681 new_blob_it.add_after_then_move(blob_it.extract());
683 start_of_noise_blob = blob_it.data()->bounding_box().left();
684 delete blob_it.extract();
686 new_word =
new WERD(&new_blob_list, word_res->
word);
694 (!rej_cblob_it.empty() &&
695 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696 rej_cblob_it.forward()) {
697 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
700 auto* new_word_res =
new WERD_RES(new_word);
701 new_word_res->combination =
true;
702 worst_word_it.add_before_then_move(new_word_res);
708 float *worst_noise_score) {
709 float noise_score[512];
731 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
738 noise_score[i] = non_noise_limit;
743 tprintf(
"%1.1f ", noise_score[i]);
752 if (noise_score[i] >= non_noise_limit) {
764 if (noise_score[i] >= non_noise_limit) {
773 if (min_noise_blob > max_noise_blob)
776 *worst_noise_score = small_limit;
778 for (i = min_noise_blob; i <= max_noise_blob; i++) {
779 if (noise_score[i] < *worst_noise_score) {
781 *worst_noise_score = noise_score[i];
789 int16_t outline_count = 0;
790 int16_t max_dimension;
791 int16_t largest_outline_dimension = 0;
795 box = ol->bounding_box();
797 max_dimension = box.
height();
799 max_dimension = box.
width();
802 if (largest_outline_dimension < max_dimension)
803 largest_outline_dimension = max_dimension;
806 if (outline_count > 5) {
808 largest_outline_dimension *= 2;
815 largest_outline_dimension /= 2;
818 return largest_outline_dimension;
824 const bool show_map_detail =
false;
829 tprintf(
"Blob count: %d (word); %d/%d (rebuild word)\n",
835 if (show_map_detail) {
844 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
858 WERD_RES_IT word_it(&word_res_list);
864 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865 word = word_it.data();
char * conflict_set_I_l_1
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool flag(WERD_FLAGS mask) const
DLLSYM void tprintf(const char *format,...)
CANCEL_FUNC cancel
for errcode use
GenericVector< TBLOB * > blobs
const int kBlnBaselineOffset
int16_t progress
chars in this buffer(0)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
void copy_on(WERD_RES *word_res)
bool get_isdigit(UNICHAR_ID unichar_id) const
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool check_debug_pt(WERD_RES *word, int location)
bool contains(char c) const
int16_t safe_dict_word(const WERD_RES *werd_res)
const STRING & unichar_lengths() const
const char * string() const
void SetupWordPassN(int pass_n, WordData *word)
int fixsp_non_noise_limit
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
volatile int8_t ocr_alive
true if not last
bool deadline_exceeded() const
C_BLOB_LIST * cblob_list()
TBOX bounding_box() const
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
TBOX bounding_box() const
void set_flag(WERD_FLAGS mask, bool value)
C_BLOB_LIST * rej_cblob_list()
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
float blob_noise_score(TBLOB *blob)
bool tessedit_prefer_joined_punct
void join_on(WERD *other)
UNICHAR_ID unichar_id(int index) const
const STRING & unichar_string() const
WERD_CHOICE * prev_word_best_choice_
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
int debug_fix_space_level
void transform_to_next_perm(WERD_RES_LIST &words)
void break_noisiest_blob_word(WERD_RES_LIST &words)
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
void fixspace_dbg(WERD_RES *word)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
static WERD_RES * deep_copy(const WERD_RES *src)
void full_print(FILE *fp)
char * numeric_punctuation
double fixsp_small_outlines_size
void * cancel_this
monitor-aware progress callback
TBOX bounding_box() const
const UNICHARSET * uch_set
WERD_CHOICE * best_choice
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
tesseract::BoxWord * box_word
bool fixspace_thinks_word_done(WERD_RES *word)
void set_blanks(uint8_t new_blanks)
BLOCK_RES_LIST block_res_list
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)