Tesseract 3.01
|
00001 00002 // File: pageiterator.h 00003 // Description: Iterator for tesseract page structure that avoids using 00004 // tesseract internal data structures. 00005 // Author: Ray Smith 00006 // Created: Fri Feb 26 11:01:06 PST 2010 00007 // 00008 // (C) Copyright 2010, Google Inc. 00009 // Licensed under the Apache License, Version 2.0 (the "License"); 00010 // you may not use this file except in compliance with the License. 00011 // You may obtain a copy of the License at 00012 // http://www.apache.org/licenses/LICENSE-2.0 00013 // Unless required by applicable law or agreed to in writing, software 00014 // distributed under the License is distributed on an "AS IS" BASIS, 00015 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00016 // See the License for the specific language governing permissions and 00017 // limitations under the License. 00018 // 00020 00021 #ifndef TESSERACT_API_PAGEITERATOR_H__ 00022 #define TESSERACT_API_PAGEITERATOR_H__ 00023 00024 #include "apitypes.h" 00025 00026 class C_BLOB_IT; 00027 class PBLOB_IT; 00028 class PAGE_RES; 00029 class PAGE_RES_IT; 00030 class WERD; 00031 struct Pix; 00032 00033 namespace tesseract { 00034 00035 class Tesseract; 00036 00037 // Class to iterate over tesseract page structure, providing access to all 00038 // levels of the page hierarchy, without including any tesseract headers or 00039 // having to handle any tesseract structures. 00040 // WARNING! This class points to data held within the TessBaseAPI class, and 00041 // therefore can only be used while the TessBaseAPI class still exists and 00042 // has not been subjected to a call of Init, SetImage, Recognize, Clear, End 00043 // DetectOS, or anything else that changes the internal PAGE_RES. 00044 // See apitypes.h for the definition of PageIteratorLevel. 00045 // See also ResultIterator, derived from PageIterator, which adds in the 00046 // ability to access OCR output with text-specific methods. 00047 00048 class PageIterator { 00049 public: 00050 // page_res and tesseract come directly from the BaseAPI. 00051 // The rectangle parameters are copied indirectly from the Thresholder, 00052 // via the BaseAPI. They represent the coordinates of some rectangle in an 00053 // original image (in top-left-origin coordinates) and therefore the top-left 00054 // needs to be added to any output boxes in order to specify coordinates 00055 // in the original image. See TessBaseAPI::SetRectangle. 00056 // The scale and scaled_yres are in case the Thresholder scaled the image 00057 // rectangle prior to thresholding. Any coordinates in tesseract's image 00058 // must be divided by scale before adding (rect_left, rect_top). 00059 // The scaled_yres indicates the effective resolution of the binary image 00060 // that tesseract has been given by the Thresholder. 00061 // After the constructor, Begin has already been called. 00062 PageIterator(PAGE_RES* page_res, Tesseract* tesseract, 00063 int scale, int scaled_yres, 00064 int rect_left, int rect_top, 00065 int rect_width, int rect_height); 00066 virtual ~PageIterator(); 00067 00068 // Page/ResultIterators may be copied! This makes it possible to iterate over 00069 // all the objects at a lower level, while maintaining an iterator to 00070 // objects at a higher level. These constructors DO NOT CALL Begin, so 00071 // iterations will continue from the location of src. 00072 PageIterator(const PageIterator& src); 00073 const PageIterator& operator=(const PageIterator& src); 00074 00075 // ============= Moving around within the page ============. 00076 00077 // Moves the iterator to point to the start of the page to begin an iteration. 00078 void Begin(); 00079 00080 // Moves to the start of the next object at the given level in the 00081 // page hierarchy, and returns false if the end of the page was reached. 00082 // NOTE that RIL_SYMBOL will skip non-text blocks, but all other 00083 // PageIteratorLevel level values will visit each non-text block once. 00084 // Think of non text blocks as containing a single para, with a single line, 00085 // with a single imaginary word. 00086 // Calls to Next with different levels may be freely intermixed. 00087 // This function iterates words in right-to-left scripts correctly, if 00088 // the appropriate language has been loaded into Tesseract. 00089 bool Next(PageIteratorLevel level); 00090 00091 // Returns true if the iterator is at the start of an object at the given 00092 // level. Possible uses include determining if a call to Next(RIL_WORD) 00093 // moved to the start of a RIL_PARA. 00094 bool IsAtBeginningOf(PageIteratorLevel level) const; 00095 00096 // Returns whether the iterator is positioned at the last element in a 00097 // given level. (e.g. the last word in a line, the last line in a block) 00098 bool IsAtFinalElement(PageIteratorLevel level, 00099 PageIteratorLevel element) const; 00100 00101 // ============= Accessing data ==============. 00102 // Coordinate system: 00103 // Integer coordinates are at the cracks between the pixels. 00104 // The top-left corner of the top-left pixel in the image is at (0,0). 00105 // The bottom-right corner of the bottom-right pixel in the image is at 00106 // (width, height). 00107 // Every bounding box goes from the top-left of the top-left contained 00108 // pixel to the bottom-right of the bottom-right contained pixel, so 00109 // the bounding box of the single top-left pixel in the image is: 00110 // (0,0)->(1,1). 00111 // If an image rectangle has been set in the API, then returned coordinates 00112 // relate to the original (full) image, rather than the rectangle. 00113 00114 // Returns the bounding rectangle of the current object at the given level. 00115 // See comment on coordinate system above. 00116 // Returns false if there is no such object at the current position. 00117 // The returned bounding box is guaranteed to match the size and position 00118 // of the image returned by GetBinaryImage, but may clip foreground pixels 00119 // from a grey image. The padding argument to GetImage can be used to expand 00120 // the image to include more foreground pixels. See GetImage below. 00121 bool BoundingBox(PageIteratorLevel level, 00122 int* left, int* top, int* right, int* bottom) const; 00123 00124 // Returns the type of the current block. See apitypes.h for PolyBlockType. 00125 PolyBlockType BlockType() const; 00126 00127 // Returns a binary image of the current object at the given level. 00128 // The position and size match the return from BoundingBox. 00129 // Use pixDestroy to delete the image after use. 00130 Pix* GetBinaryImage(PageIteratorLevel level) const; 00131 00132 // Returns an image of the current object at the given level in greyscale 00133 // if available in the input. To guarantee a binary image use BinaryImage. 00134 // NOTE that in order to give the best possible image, the bounds are 00135 // expanded slightly over the binary connected component, by the supplied 00136 // padding, so the top-left position of the returned image is returned 00137 // in (left,top). These will most likely not match the coordinates 00138 // returned by BoundingBox. 00139 // Use pixDestroy to delete the image after use. 00140 Pix* GetImage(PageIteratorLevel level, int padding, 00141 int* left, int* top) const; 00142 00143 // Returns the baseline of the current object at the given level. 00144 // The baseline is the line that passes through (x1, y1) and (x2, y2). 00145 // WARNING: with vertical text, baselines may be vertical! 00146 // Returns false if there is no baseline at the current position. 00147 bool Baseline(PageIteratorLevel level, 00148 int* x1, int* y1, int* x2, int* y2) const; 00149 00150 // Returns orientation for the block the iterator points to. 00151 // orientation, writing_direction, textline_order: see publictypes.h 00152 // deskew_angle: after rotating the block so the text orientation is 00153 // upright, how many radians does one have to rotate the 00154 // block anti-clockwise for it to be level? 00155 // -Pi/4 <= deskew_angle <= Pi/4 00156 void Orientation(tesseract::Orientation *orientation, 00157 tesseract::WritingDirection *writing_direction, 00158 tesseract::TextlineOrder *textline_order, 00159 float *deskew_angle); 00160 00161 protected: 00162 // Sets up the internal data for iterating the blobs of a new word, then 00163 // moves the iterator to the given offset. 00164 void BeginWord(int offset); 00165 00166 // Pointer to the page_res owned by the API. 00167 PAGE_RES* page_res_; 00168 // Pointer to the Tesseract object owned by the API. 00169 Tesseract* tesseract_; 00170 // The iterator to the page_res_. Owned by this ResultIterator. 00171 // A pointer just to avoid dragging in Tesseract includes. 00172 PAGE_RES_IT* it_; 00173 // The current input WERD being iterated. If there is an output from OCR, 00174 // then word_ is NULL. Owned by the API. 00175 WERD* word_; 00176 // The length of the current word_. 00177 int word_length_; 00178 // The current blob index within the word. 00179 int blob_index_; 00180 // Iterator to the blobs within the word. If NULL, then we are iterating 00181 // OCR results in the box_word. 00182 // Owned by this ResultIterator. 00183 C_BLOB_IT* cblob_it_; 00184 // Parameters saved from the Thresholder. Needed to rebuild coordinates. 00185 int scale_; 00186 int scaled_yres_; 00187 int rect_left_; 00188 int rect_top_; 00189 int rect_width_; 00190 int rect_height_; 00191 }; 00192 00193 } // namespace tesseract. 00194 00195 #endif // TESSERACT_API_PAGEITERATOR_H__