tesseract  3.03
/usr/local/google/home/jbreiden/tesseract-ocr-read-only/ccstruct/rejctmap.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        rejctmap.h  (Formerly rejmap.h)
00003  * Description: REJ and REJMAP class functions.
00004  * Author:              Phil Cheatle
00005  * Created:             Thu Jun  9 13:46:38 BST 1994
00006  *
00007  * (C) Copyright 1994, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018 
00019 This module may look unnecessarily verbose, but here's the philosophy...
00020 
00021 ALL processing of the reject map is done in this module. There are lots of
00022 separate calls to set reject/accept flags. These have DELIBERATELY been kept
00023 distinct so that this module can decide what to do.
00024 
00025 Basically, there is a flag for each sort of rejection or acceptance. This
00026 provides a history of what has happened to EACH character.
00027 
00028 Determining whether a character is CURRENTLY rejected depends on implicit
00029 understanding of the SEQUENCE of possible calls. The flags are defined and
00030 grouped in the REJ_FLAGS enum. These groupings are used in determining a
00031 character's CURRENT rejection status. Basically, a character is ACCEPTED if
00032 
00033     none of the permanent rej flags are set
00034   AND (    the character has never been rejected
00035       OR an accept flag is set which is LATER than the latest reject flag )
00036 
00037 IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
00038 OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
00039 **********************************************************************/
00040 
00041 #ifndef           REJCTMAP_H
00042 #define           REJCTMAP_H
00043 
00044 #ifdef __UNIX__
00045 #include          <assert.h>
00046 #endif
00047 #include          "memry.h"
00048 #include          "bits16.h"
00049 #include                   "params.h"
00050 
/**
 * One flag per kind of rejection or acceptance that can be recorded against
 * a character.
 *
 * The numeric value of each enumerator is the bit position used by
 * REJ::set_flag() / REJ::flag(): values 0..15 are kept in REJ::flags1 and
 * values 16 and above in REJ::flags2 (at bit value - 16). The values are
 * written out explicitly so that this split is visible; they are the same
 * values the compiler would assign implicitly.
 *
 * The enumerators are grouped by the stage at which they are generated;
 * the group order is significant (see the file header).
 */
enum REJ_FLAGS
{
  /* Reject modes which are NEVER overridden */
  R_TESS_FAILURE = 0,            // PERM Tess didnt classify
  R_SMALL_XHT = 1,               // PERM Xht too small
  R_EDGE_CHAR = 2,               // PERM Too close to edge of image
  R_1IL_CONFLICT = 3,            // PERM 1Il confusion
  R_POSTNN_1IL = 4,              // PERM 1Il unrejected by NN
  R_REJ_CBLOB = 5,               // PERM Odd blob
  R_MM_REJECT = 6,               // PERM Matrix match rejection (m's)
  // NOTE(review): labelled TEMP although it sits in the never-overridden
  // group - confirm which is intended against REJ::perm_rejected().
  R_BAD_REPETITION = 7,          // TEMP Repeated char which doesn't match trend

  /* Initial reject modes (pre NN_ACCEPT) */
  R_POOR_MATCH = 8,              // TEMP Ray's original heuristic (Not used)
  R_NOT_TESS_ACCEPTED = 9,       // TEMP Tess didnt accept WERD
  R_CONTAINS_BLANKS = 10,        // TEMP Tess failed on other chs in WERD
  R_BAD_PERMUTER = 11,           // POTENTIAL Bad permuter for WERD

  /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */
  R_HYPHEN = 12,                 // TEMP Post NN dodgy hyphen or full stop
  R_DUBIOUS = 13,                // TEMP Post NN dodgy chars
  R_NO_ALPHANUMS = 14,           // TEMP No alphanumerics in word after NN
  R_MOSTLY_REJ = 15,             // TEMP Most of word rejected so rej the rest
  R_XHT_FIXUP = 16,              // TEMP Xht tests unsure (first bit of flags2)

  /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */
  R_BAD_QUALITY = 17,            // TEMP Quality metrics bad for WERD

  /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accept */
  R_DOC_REJ = 18,                // TEMP Document rejection
  R_BLOCK_REJ = 19,              // TEMP Block rejection
  R_ROW_REJ = 20,                // TEMP Row rejection
  R_UNLV_REJ = 21,               // TEMP ~ turned to - or ^ turned to space

  /* Accept modes which occur in between the above rejection groups */
  R_NN_ACCEPT = 22,              // NN acceptance
  R_HYPHEN_ACCEPT = 23,          // Hyphen acceptance
  R_MM_ACCEPT = 24,              // Matrix match acceptance
  R_QUALITY_ACCEPT = 25,         // Accept word in good quality doc
  R_MINIMAL_REJ_ACCEPT = 26      // Accept EVERYTHING except tess failures
};
00092 
/* REJECT MAP VALUES */
/* Single-character codes returned by REJ::display_char(). */

#define           MAP_ACCEPT '1'            /* none of the reject tests hold */
#define           MAP_REJECT_PERM '0'       /* perm_rejected() is true */
#define           MAP_REJECT_TEMP '2'       /* rejected() is true */
#define           MAP_REJECT_POTENTIAL '3'  /* accept_if_good_quality() is true */
00099 
00100 class REJ
00101 {
00102   BITS16 flags1;
00103   BITS16 flags2;
00104 
00105   void set_flag(REJ_FLAGS rej_flag) {
00106     if (rej_flag < 16)
00107       flags1.turn_on_bit (rej_flag);
00108     else
00109       flags2.turn_on_bit (rej_flag - 16);
00110   }
00111 
00112   BOOL8 rej_before_nn_accept();
00113   BOOL8 rej_between_nn_and_mm();
00114   BOOL8 rej_between_mm_and_quality_accept();
00115   BOOL8 rej_between_quality_and_minimal_rej_accept();
00116   BOOL8 rej_before_mm_accept();
00117   BOOL8 rej_before_quality_accept();
00118 
00119   public:
00120     REJ() {  //constructor
00121     }
00122 
00123     REJ(  //classwise copy
00124         const REJ &source) {
00125       flags1 = source.flags1;
00126       flags2 = source.flags2;
00127     }
00128 
00129     REJ & operator= (            //assign REJ
00130     const REJ & source) {        //from this
00131       flags1 = source.flags1;
00132       flags2 = source.flags2;
00133       return *this;
00134     }
00135 
00136     BOOL8 flag(REJ_FLAGS rej_flag) {
00137       if (rej_flag < 16)
00138         return flags1.bit (rej_flag);
00139       else
00140         return flags2.bit (rej_flag - 16);
00141     }
00142 
00143     char display_char() {
00144       if (perm_rejected ())
00145         return MAP_REJECT_PERM;
00146       else if (accept_if_good_quality ())
00147         return MAP_REJECT_POTENTIAL;
00148       else if (rejected ())
00149         return MAP_REJECT_TEMP;
00150       else
00151         return MAP_ACCEPT;
00152     }
00153 
00154     BOOL8 perm_rejected();  //Is char perm reject?
00155 
00156     BOOL8 rejected();  //Is char rejected?
00157 
00158     BOOL8 accepted() {  //Is char accepted?
00159       return !rejected ();
00160     }
00161 
00162                                  //potential rej?
00163     BOOL8 accept_if_good_quality();
00164 
00165     BOOL8 recoverable() {
00166       return (rejected () && !perm_rejected ());
00167     }
00168 
00169     void setrej_tess_failure();  //Tess generated blank
00170     void setrej_small_xht();  //Small xht char/wd
00171     void setrej_edge_char();  //Close to image edge
00172     void setrej_1Il_conflict();  //Initial reject map
00173     void setrej_postNN_1Il();  //1Il after NN
00174     void setrej_rej_cblob();  //Insert duff blob
00175     void setrej_mm_reject();  //Matrix matcher
00176                                  //Odd repeated char
00177     void setrej_bad_repetition();
00178     void setrej_poor_match();  //Failed Rays heuristic
00179                                  //TEMP reject_word
00180     void setrej_not_tess_accepted();
00181                                  //TEMP reject_word
00182     void setrej_contains_blanks();
00183     void setrej_bad_permuter();  //POTENTIAL reject_word
00184     void setrej_hyphen();  //PostNN dubious hyph or .
00185     void setrej_dubious();  //PostNN dubious limit
00186     void setrej_no_alphanums();  //TEMP reject_word
00187     void setrej_mostly_rej();  //TEMP reject_word
00188     void setrej_xht_fixup();  //xht fixup
00189     void setrej_bad_quality();  //TEMP reject_word
00190     void setrej_doc_rej();  //TEMP reject_word
00191     void setrej_block_rej();  //TEMP reject_word
00192     void setrej_row_rej();  //TEMP reject_word
00193     void setrej_unlv_rej();  //TEMP reject_word
00194     void setrej_nn_accept();  //NN Flipped a char
00195     void setrej_hyphen_accept();  //Good aspect ratio
00196     void setrej_mm_accept();  //Matrix matcher
00197                                  //Quality flip a char
00198     void setrej_quality_accept();
00199                                  //Accept all except blank
00200     void setrej_minimal_rej_accept();
00201 
00202     void full_print(FILE *fp);
00203 };
00204 
00205 class REJMAP
00206 {
00207   REJ *ptr;                      //ptr to the chars
00208   inT16 len;                     //Number of chars
00209 
00210   public:
00211     REJMAP() {  //constructor
00212       ptr = NULL;
00213       len = 0;
00214     }
00215 
00216     REJMAP(  //classwise copy
00217            const REJMAP &rejmap);
00218 
00219     REJMAP & operator= (         //assign REJMAP
00220       const REJMAP & source);    //from this
00221 
00222     ~REJMAP () {                 //destructor
00223       if (ptr != NULL)
00224         free_struct (ptr, len * sizeof (REJ), "REJ");
00225     }
00226 
00227     void initialise(  //Redefine map
00228                     inT16 length);
00229 
00230     REJ & operator[](            //access function
00231       inT16 index) const         //map index
00232     {
00233       ASSERT_HOST (index < len);
00234       return ptr[index];         //no bounds checks
00235     }
00236 
00237     inT32 length() const {  //map length
00238       return len;
00239     }
00240 
00241     inT16 accept_count();  //How many accepted?
00242 
00243     inT16 reject_count() {  //How many rejects?
00244       return len - accept_count ();
00245     }
00246 
00247     void remove_pos(             //Cut out an element
00248                     inT16 pos);  //element to remove
00249 
00250     void print(FILE *fp);
00251 
00252     void full_print(FILE *fp);
00253 
00254     BOOL8 recoverable_rejects();  //Any non perm rejs?
00255 
00256     BOOL8 quality_recoverable_rejects();
00257     //Any potential rejs?
00258 
00259     void rej_word_small_xht();  //Reject whole word
00260                                  //Reject whole word
00261     void rej_word_tess_failure();
00262     void rej_word_not_tess_accepted();
00263     //Reject whole word
00264                                  //Reject whole word
00265     void rej_word_contains_blanks();
00266                                  //Reject whole word
00267     void rej_word_bad_permuter();
00268     void rej_word_xht_fixup();  //Reject whole word
00269                                  //Reject whole word
00270     void rej_word_no_alphanums();
00271     void rej_word_mostly_rej();  //Reject whole word
00272     void rej_word_bad_quality();  //Reject whole word
00273     void rej_word_doc_rej();  //Reject whole word
00274     void rej_word_block_rej();  //Reject whole word
00275     void rej_word_row_rej();  //Reject whole word
00276 };
00277 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines