Tesseract 3.01
/data/source/tesseract-ocr/classify/cluster.h
Go to the documentation of this file.
00001 /******************************************************************************
00002  **     Filename:       cluster.h
00003  **     Purpose:        Definition of feature space clustering routines
00004  **     Author:         Dan Johnson
00005  **     History:        5/29/89, DSJ, Created.
00006  **
00007  **     (c) Copyright Hewlett-Packard Company, 1988.
00008  ** Licensed under the Apache License, Version 2.0 (the "License");
00009  ** you may not use this file except in compliance with the License.
00010  ** You may obtain a copy of the License at
00011  ** http://www.apache.org/licenses/LICENSE-2.0
00012  ** Unless required by applicable law or agreed to in writing, software
00013  ** distributed under the License is distributed on an "AS IS" BASIS,
00014  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  ** See the License for the specific language governing permissions and
00016  ** limitations under the License.
00017  ******************************************************************************/
00018 #ifndef   CLUSTER_H
00019 #define   CLUSTER_H
00020 
00021 #include "kdtree.h"
00022 #include "oldlist.h"
00023 
00024 /*----------------------------------------------------------------------
00025           Types
00026 ----------------------------------------------------------------------*/
00027 typedef struct sample {          // binary cluster-tree node (Left/Right children); bit-field widths below total 32
00028   unsigned Clustered:1;          // TRUE if included in a higher cluster
00029   unsigned Prototype:1;          // TRUE if cluster represented by a proto
00030   unsigned SampleCount:30;       // number of samples in this cluster (30-bit field)
00031   struct sample *Left;           // ptr to left sub-cluster
00032   struct sample *Right;          // ptr to right sub-cluster
00033   inT32 CharID;                  // identifier of char sample came from
00034   FLOAT32 Mean[1];               // mean of cluster - SampleSize floats (declared [1]; presumably over-allocated - TODO confirm in allocator)
00035 } CLUSTER;
00036 
00037 typedef CLUSTER SAMPLE;          // alias of CLUSTER: the same struct can be referred to as either sample or cluster
00038 
00039 typedef enum {                   // kinds of prototype that can be requested (see CLUSTERCONFIG.ProtoStyle)
00040   spherical, elliptical, mixed, automatic   // implicit values 0, 1, 2, 3
00041 } PROTOSTYLE;
00042 
00043 typedef struct {                 // parameters to control clustering; passed by pointer to ClusterSamples()
00044   PROTOSTYLE ProtoStyle;         // specifies types of protos to be made
00045   FLOAT32 MinSamples;            // min # of samples per proto - % of total (NOTE(review): 0-1 or 0-100 scale not shown here - confirm)
00046   FLOAT32 MaxIllegal;            // max percentage of samples in a cluster which have
00047                                  // more than 1 feature in that cluster
00048   FLOAT32 Independence;          // desired independence between dimensions
00049   FLOAT64 Confidence;            // desired confidence in prototypes created
00050   int MagicSamples;              // Ideal number of samples in a cluster.
00051 } CLUSTERCONFIG;
00052 
00053 typedef enum {                   // distribution kinds; PROTOTYPE.Distrib stores one per dimension
00054   normal, uniform, D_random      // implicit values 0, 1, 2
00055 } DISTRIBUTION;
00056 
00057 typedef union {                  // a single float or a pointer to floats sharing storage; the union itself does not record which is active
00058   FLOAT32 Spherical;             // single value - presumably used when the proto is spherical (TODO confirm)
00059   FLOAT32 *Elliptical;           // pointer to values - presumably one per dimension for elliptical protos (TODO confirm)
00060 } FLOATUNION;
00061 
00062 typedef struct {                 // prototype made from a cluster (see Cluster member); bit-field widths below total 32
00063   unsigned Significant:1;        // TRUE if prototype is significant
00064   unsigned Merged:1;             // Merged after clustering so do not output
00065                                  // but kept for display purposes. If it has no
00066                                  // samples then it was actually merged.
00067                                  // Otherwise it matched an already significant
00068                                  // cluster.
00069   unsigned Style:2;              // spherical, elliptical, or mixed (2-bit field; presumably holds a PROTOSTYLE value - confirm)
00070   unsigned NumSamples:28;        // number of samples in the cluster (28-bit field)
00071   CLUSTER *Cluster;              // ptr to cluster which made prototype
00072   DISTRIBUTION *Distrib;         // different distribution for each dimension
00073   FLOAT32 *Mean;                 // prototype mean
00074   FLOAT32 TotalMagnitude;        // total magnitude over all dimensions
00075   FLOAT32 LogMagnitude;          // log base e of TotalMagnitude
00076   FLOATUNION Variance;           // prototype variance (scalar or per-dimension pointer, see FLOATUNION)
00077   FLOATUNION Magnitude;          // magnitude of density function
00078   FLOATUNION Weight;             // weight of density function
00079 } PROTOTYPE;
00080 
00081 typedef struct {                 // clustering state: samples, search tree, cluster tree and prototypes (see MakeClusterer/FreeClusterer)
00082   inT16 SampleSize;              // number of parameters per sample
00083   PARAM_DESC *ParamDesc;         // description of each parameter
00084   inT32 NumberOfSamples;         // total number of samples being clustered
00085   KDTREE *KDTree;                // for optimal nearest neighbor searching
00086   CLUSTER *Root;                 // ptr to root cluster of cluster tree
00087   LIST ProtoList;                // list of prototypes
00088   inT32 NumChar;                 // # of characters represented by samples
00089   LIST bucket_cache[3];  // cache of reusable histograms by distribution type (3 = number of DISTRIBUTION values; presumably indexed by it - confirm)
00090 } CLUSTERER;
00091 
00092 typedef struct {                 // counted list of sample pointers with a recorded capacity
00093   inT32 NumSamples;              // number of samples in list
00094   inT32 MaxNumSamples;           // maximum size of list
00095   SAMPLE *Sample[1];             // array of ptrs to sample data structures (declared [1]; presumably over-allocated to MaxNumSamples - TODO confirm)
00096 } SAMPLELIST;
00097 
00098 // low level cluster tree analysis routines. InitSampleSearch assigns S = NIL_LIST when C is NULL, otherwise S = push(NIL_LIST, C); C can be evaluated twice and S is expanded unparenthesized, so pass a plain lvalue for S and a side-effect-free expression for C.
00099 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
00100 
00101 /*--------------------------------------------------------------------------
00102         Public Function Prototypes
00103 --------------------------------------------------------------------------*/
00104 CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]);  // returns a CLUSTERER pointer; ParamDesc is not modified (const)
00105 
00106 SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID);  // returns a SAMPLE pointer; presumably Feature holds SampleSize floats - confirm
00107 
00108 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);  // returns a LIST; presumably the list of PROTOTYPEs - confirm in cluster.cpp
00109 
00110 void FreeClusterer(CLUSTERER *Clusterer);  // definition not visible here; ownership of contained lists/trees to be confirmed
00111 
00112 void FreeProtoList(LIST *ProtoList);  // takes the list by address (LIST *)
00113 
00114 void FreePrototype(void *arg);  // PROTOTYPE *Prototype); arg is declared void*, expected to point to a PROTOTYPE
00115 
00116 CLUSTER *NextSample(LIST *SearchState);  // SearchState is presumably the list set up by InitSampleSearch - confirm
00117 
00118 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension);  // Dimension is unsigned; valid range not shown in this header
00119 
00120 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension);  // Dimension is unsigned; valid range not shown in this header
00121 
00122 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2,  // NOTE(review): ParamDesc is non-const here but const in MakeClusterer
00123                     FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);  // presumably m receives the merge of m1 and m2 - confirm
00124 
00125 //--------------Global Data Definitions and Declarations---------------------------
00126 // define errors that can be trapped
00127 #define ALREADYCLUSTERED  4000  // trappable error code (see comment above); where it is raised is not visible in this header
00128 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines