tesseract
3.03
|
00001 /********************************************************************** 00002 * File: rejctmap.h (Formerly rejmap.h) 00003 * Description: REJ and REJMAP class functions. 00004 * Author: Phil Cheatle 00005 * Created: Thu Jun 9 13:46:38 BST 1994 00006 * 00007 * (C) Copyright 1994, Hewlett-Packard Ltd. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 * 00018 00019 This module may look unneccessarily verbose, but here's the philosophy... 00020 00021 ALL processing of the reject map is done in this module. There are lots of 00022 separate calls to set reject/accept flags. These have DELIBERATELY been kept 00023 distinct so that this module can decide what to do. 00024 00025 Basically, there is a flag for each sort of rejection or acceptance. This 00026 provides a history of what has happened to EACH character. 00027 00028 Determining whether a character is CURRENTLY rejected depends on implicit 00029 understanding of the SEQUENCE of possible calls. The flags are defined and 00030 grouped in the REJ_FLAGS enum. These groupings are used in determining a 00031 characters CURRENT rejection status. Basically, a character is ACCEPTED if 00032 00033 none of the permanent rej flags are set 00034 AND ( the character has never been rejected 00035 OR an accept flag is set which is LATER than the latest reject flag ) 00036 00037 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE 00038 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! 00039 **********************************************************************/ 00040 00041 #ifndef REJCTMAP_H 00042 #define REJCTMAP_H 00043 00044 #ifdef __UNIX__ 00045 #include <assert.h> 00046 #endif 00047 #include "memry.h" 00048 #include "bits16.h" 00049 #include "params.h" 00050 00051 enum REJ_FLAGS 00052 { 00053 /* Reject modes which are NEVER overridden */ 00054 R_TESS_FAILURE, // PERM Tess didnt classify 00055 R_SMALL_XHT, // PERM Xht too small 00056 R_EDGE_CHAR, // PERM Too close to edge of image 00057 R_1IL_CONFLICT, // PERM 1Il confusion 00058 R_POSTNN_1IL, // PERM 1Il unrejected by NN 00059 R_REJ_CBLOB, // PERM Odd blob 00060 R_MM_REJECT, // PERM Matrix match rejection (m's) 00061 R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend 00062 00063 /* Initial reject modes (pre NN_ACCEPT) */ 00064 R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) 00065 R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD 00066 R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD 00067 R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD 00068 00069 /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ 00070 R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop 00071 R_DUBIOUS, // TEMP Post NN dodgy chars 00072 R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN 00073 R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest 00074 R_XHT_FIXUP, // TEMP Xht tests unsure 00075 00076 /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ 00077 R_BAD_QUALITY, // TEMP Quality metrics bad for WERD 00078 00079 /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ 00080 R_DOC_REJ, // TEMP Document rejection 00081 R_BLOCK_REJ, // TEMP Block rejection 00082 R_ROW_REJ, // TEMP Row rejection 00083 R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space 00084 00085 /* Accept modes which occur inbetween the above rejection groups */ 00086 R_NN_ACCEPT, //NN acceptance 00087 R_HYPHEN_ACCEPT, //Hyphen acceptance 00088 R_MM_ACCEPT, //Matrix match acceptance 00089 R_QUALITY_ACCEPT, //Accept word in good quality doc 00090 R_MINIMAL_REJ_ACCEPT //Accept EVERYTHING except tess failures 00091 }; 00092 00093 /* REJECT MAP VALUES */ 00094 00095 #define MAP_ACCEPT '1' 00096 #define MAP_REJECT_PERM '0' 00097 #define MAP_REJECT_TEMP '2' 00098 #define MAP_REJECT_POTENTIAL '3' 00099 00100 class REJ 00101 { 00102 BITS16 flags1; 00103 BITS16 flags2; 00104 00105 void set_flag(REJ_FLAGS rej_flag) { 00106 if (rej_flag < 16) 00107 flags1.turn_on_bit (rej_flag); 00108 else 00109 flags2.turn_on_bit (rej_flag - 16); 00110 } 00111 00112 BOOL8 rej_before_nn_accept(); 00113 BOOL8 rej_between_nn_and_mm(); 00114 BOOL8 rej_between_mm_and_quality_accept(); 00115 BOOL8 rej_between_quality_and_minimal_rej_accept(); 00116 BOOL8 rej_before_mm_accept(); 00117 BOOL8 rej_before_quality_accept(); 00118 00119 public: 00120 REJ() { //constructor 00121 } 00122 00123 REJ( //classwise copy 00124 const REJ &source) { 00125 flags1 = source.flags1; 00126 flags2 = source.flags2; 00127 } 00128 00129 REJ & operator= ( //assign REJ 00130 const REJ & source) { //from this 00131 flags1 = source.flags1; 00132 flags2 = source.flags2; 00133 return *this; 00134 } 00135 00136 BOOL8 flag(REJ_FLAGS rej_flag) { 00137 if (rej_flag < 16) 00138 return flags1.bit (rej_flag); 00139 else 00140 return flags2.bit (rej_flag - 16); 00141 } 00142 00143 char display_char() { 00144 if (perm_rejected ()) 00145 return MAP_REJECT_PERM; 00146 else if (accept_if_good_quality ()) 00147 return MAP_REJECT_POTENTIAL; 00148 else if (rejected ()) 00149 return MAP_REJECT_TEMP; 00150 else 00151 return MAP_ACCEPT; 00152 } 00153 00154 BOOL8 perm_rejected(); //Is char perm reject? 00155 00156 BOOL8 rejected(); //Is char rejected? 00157 00158 BOOL8 accepted() { //Is char accepted? 00159 return !rejected (); 00160 } 00161 00162 //potential rej? 00163 BOOL8 accept_if_good_quality(); 00164 00165 BOOL8 recoverable() { 00166 return (rejected () && !perm_rejected ()); 00167 } 00168 00169 void setrej_tess_failure(); //Tess generated blank 00170 void setrej_small_xht(); //Small xht char/wd 00171 void setrej_edge_char(); //Close to image edge 00172 void setrej_1Il_conflict(); //Initial reject map 00173 void setrej_postNN_1Il(); //1Il after NN 00174 void setrej_rej_cblob(); //Insert duff blob 00175 void setrej_mm_reject(); //Matrix matcher 00176 //Odd repeated char 00177 void setrej_bad_repetition(); 00178 void setrej_poor_match(); //Failed Rays heuristic 00179 //TEMP reject_word 00180 void setrej_not_tess_accepted(); 00181 //TEMP reject_word 00182 void setrej_contains_blanks(); 00183 void setrej_bad_permuter(); //POTENTIAL reject_word 00184 void setrej_hyphen(); //PostNN dubious hyph or . 00185 void setrej_dubious(); //PostNN dubious limit 00186 void setrej_no_alphanums(); //TEMP reject_word 00187 void setrej_mostly_rej(); //TEMP reject_word 00188 void setrej_xht_fixup(); //xht fixup 00189 void setrej_bad_quality(); //TEMP reject_word 00190 void setrej_doc_rej(); //TEMP reject_word 00191 void setrej_block_rej(); //TEMP reject_word 00192 void setrej_row_rej(); //TEMP reject_word 00193 void setrej_unlv_rej(); //TEMP reject_word 00194 void setrej_nn_accept(); //NN Flipped a char 00195 void setrej_hyphen_accept(); //Good aspect ratio 00196 void setrej_mm_accept(); //Matrix matcher 00197 //Quality flip a char 00198 void setrej_quality_accept(); 00199 //Accept all except blank 00200 void setrej_minimal_rej_accept(); 00201 00202 void full_print(FILE *fp); 00203 }; 00204 00205 class REJMAP 00206 { 00207 REJ *ptr; //ptr to the chars 00208 inT16 len; //Number of chars 00209 00210 public: 00211 REJMAP() { //constructor 00212 ptr = NULL; 00213 len = 0; 00214 } 00215 00216 REJMAP( //classwise copy 00217 const REJMAP &rejmap); 00218 00219 REJMAP & operator= ( //assign REJMAP 00220 const REJMAP & source); //from this 00221 00222 ~REJMAP () { //destructor 00223 if (ptr != NULL) 00224 free_struct (ptr, len * sizeof (REJ), "REJ"); 00225 } 00226 00227 void initialise( //Redefine map 00228 inT16 length); 00229 00230 REJ & operator[]( //access function 00231 inT16 index) const //map index 00232 { 00233 ASSERT_HOST (index < len); 00234 return ptr[index]; //no bounds checks 00235 } 00236 00237 inT32 length() const { //map length 00238 return len; 00239 } 00240 00241 inT16 accept_count(); //How many accepted? 00242 00243 inT16 reject_count() { //How many rejects? 00244 return len - accept_count (); 00245 } 00246 00247 void remove_pos( //Cut out an element 00248 inT16 pos); //element to remove 00249 00250 void print(FILE *fp); 00251 00252 void full_print(FILE *fp); 00253 00254 BOOL8 recoverable_rejects(); //Any non perm rejs? 00255 00256 BOOL8 quality_recoverable_rejects(); 00257 //Any potential rejs? 00258 00259 void rej_word_small_xht(); //Reject whole word 00260 //Reject whole word 00261 void rej_word_tess_failure(); 00262 void rej_word_not_tess_accepted(); 00263 //Reject whole word 00264 //Reject whole word 00265 void rej_word_contains_blanks(); 00266 //Reject whole word 00267 void rej_word_bad_permuter(); 00268 void rej_word_xht_fixup(); //Reject whole word 00269 //Reject whole word 00270 void rej_word_no_alphanums(); 00271 void rej_word_mostly_rej(); //Reject whole word 00272 void rej_word_bad_quality(); //Reject whole word 00273 void rej_word_doc_rej(); //Reject whole word 00274 void rej_word_block_rej(); //Reject whole word 00275 void rej_word_row_rej(); //Reject whole word 00276 }; 00277 #endif