Tesseract 3.01
/data/source/tesseract-ocr/ccstruct/statistc.h
Go to the documentation of this file.
00001 /**********************************************************************
00002  * File:        statistc.h  (Formerly stats.h)
00003  * Description: Class description for STATS class.
00004  * Author:                                      Ray Smith
00005  * Created:                                     Mon Feb 04 16:19:07 GMT 1991
00006  *
00007  * (C) Copyright 1991, Hewlett-Packard Ltd.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  *
00018  **********************************************************************/
00019 
00020 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
00021 #define TESSERACT_CCSTRUCT_STATISTC_H_
00022 
00023 #include <stdio.h>
00024 #include "host.h"
00025 #include "scrollview.h"
00026 
00027 // Simple histogram-based statistics for integer values in a known
00028 // range, such that the range is small compared to the number of samples.
00029 class STATS {
00030  public:
00031   // The histogram buckets are in the range
00032   // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
00033   // [min_bucket_value, max_bucket_value].
00034   // Any data under min_bucket value is silently mapped to min_bucket_value,
00035   // and likewise, any data over max_bucket_value is silently mapped to
00036   // max_bucket_value.
00037   // In the internal array, min_bucket_value maps to 0 and
00038   // max_bucket_value_plus_1 - min_bucket_value to the array size.
00039   // TODO(rays) This is ugly. Convert the second argument to
00040   // max_bucket_value and all the code that uses it.
00041   STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00042   STATS();  // empty for arrays
00043 
00044   ~STATS();
00045 
00046   // (Re)Sets the range and clears the counts.
00047   // See the constructor for info on max and min values.
00048   bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
00049 
00050   void clear();  // empty buckets
00051 
00052   void add(inT32 value, inT32 count);
00053 
00054   // "Accessors" return various statistics on the data.
00055   inT32 mode() const;  // get mode of samples
00056   double mean() const;  // get mean of samples
00057   double sd() const;  // standard deviation
00058   // Returns the fractile value such that frac fraction (in [0,1]) of samples
00059   // has a value less than the return value.
00060   double ile(double frac) const;
00061   // Returns the minimum used entry in the histogram (ie the minimum of the
00062   // data, NOT the minimum of the supplied range, nor is it an index.)
00063   // Would normally be called min(), but that is a reserved word in VC++.
00064   inT32 min_bucket() const;  // Find min
00065   // Returns the maximum used entry in the histogram (ie the maximum of the
00066   // data, NOT the maximum of the supplied range, nor is it an index.)
00067   inT32 max_bucket() const;  // Find max
00068   // Finds a more useful estimate of median than ile(0.5).
00069   // Overcomes a problem with ile() - if the samples are, for example,
00070   // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
00071   // between 6 and 13 = 9.5
00072   double median() const;  // get median of samples
00073   // Returns the count of the given value.
00074   inT32 pile_count(inT32 value ) const {
00075     if (value <= rangemin_)
00076       return buckets_[0];
00077     if (value >= rangemax_ - 1)
00078       return buckets_[rangemax_ - rangemin_ - 1];
00079     return buckets_[value - rangemin_];
00080   }
00081   // Returns the total count of all buckets.
00082   inT32 get_total() const {
00083     return total_count_;        // total of all piles
00084   }
00085   // Returns true if x is a local min.
00086   bool local_min(inT32 x) const;
00087 
00088   // Apply a triangular smoothing filter to the stats.
00089   // This makes the modes a bit more useful.
00090   // The factor gives the height of the triangle, i.e. the weight of the
00091   // centre.
00092   void smooth(inT32 factor);
00093 
00094   // Cluster the samples into max_cluster clusters.
00095   // Each call runs one iteration. The array of clusters must be
00096   // max_clusters+1 in size as cluster 0 is used to indicate which samples
00097   // have been used.
00098   // The return value is the current number of clusters.
00099   inT32 cluster(float lower,         // thresholds
00100                 float upper,
00101                 float multiple,      // distance threshold
00102                 inT32 max_clusters,  // max no to make
00103                 STATS *clusters);    // array of clusters
00104 
00105 
00106   // Prints a summary and table of the histogram.
00107   void print() const;
00108   // Prints summary stats only of the histogram.
00109   void print_summary() const;
00110 
00111   // Draws the histogram as a series of rectangles.
00112   void plot(ScrollView* window,   // window to draw in
00113             float xorigin,   // origin of histo
00114             float yorigin,   // gram
00115             float xscale,    // size of one unit
00116             float yscale,    // size of one uint
00117             ScrollView::Color colour) const;  // colour to draw in
00118 
00119   // Draws a line graph of the histogram.
00120   void plotline(ScrollView* window,   // window to draw in
00121                 float xorigin,   // origin of histo
00122                 float yorigin,   // gram
00123                 float xscale,    // size of one unit
00124                 float yscale,    // size of one uint
00125                 ScrollView::Color colour) const;  // colour to draw in
00126  private:
00127   inT32 rangemin_;                // min of range
00128   // rangemax_ is not well named as it is really one past the max.
00129   inT32 rangemax_;                // max of range
00130   inT32 total_count_;             // no of samples
00131   inT32* buckets_;                // array of cells
00132 };
00133 
00134 // Returns the nth ordered item from the array, as if they were
00135 // ordered, but without ordering them, in linear time.
00136 // The array does get shuffled!
00137 inT32 choose_nth_item(inT32 index,   // index to choose
00138                       float *array,  // array of items
00139                       inT32 count);  // no of items
00140 // Generic version uses a defined comparator (with qsort semantics).
00141 inT32 choose_nth_item(inT32 index,   // index to choose
00142                       void *array,   // array of items
00143                       inT32 count,   // no of items
00144                       size_t size,   // element size
00145                       int (*compar)(const void*, const void*));  // comparator
00146 // Swaps 2 entries in an array in-place.
00147 void swap_entries(void *array,   // array of entries
00148                   size_t size,   // size of entry
00149                   inT32 index1,  // entries to swap
00150                   inT32 index2);
00151 
00152 #endif  // TESSERACT_CCSTRUCT_STATISTC_H_
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines