/* Tesseract 3.01 */
00001 /****************************************************************************** 00002 ** Filename: cluster.h 00003 ** Purpose: Definition of feature space clustering routines 00004 ** Author: Dan Johnson 00005 ** History: 5/29/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 #ifndef CLUSTER_H 00019 #define CLUSTER_H 00020 00021 #include "kdtree.h" 00022 #include "oldlist.h" 00023 00024 /*---------------------------------------------------------------------- 00025 Types 00026 ----------------------------------------------------------------------*/ 00027 typedef struct sample { 00028 unsigned Clustered:1; // TRUE if included in a higher cluster 00029 unsigned Prototype:1; // TRUE if cluster represented by a proto 00030 unsigned SampleCount:30; // number of samples in this cluster 00031 struct sample *Left; // ptr to left sub-cluster 00032 struct sample *Right; // ptr to right sub-cluster 00033 inT32 CharID; // identifier of char sample came from 00034 FLOAT32 Mean[1]; // mean of cluster - SampleSize floats 00035 } CLUSTER; 00036 00037 typedef CLUSTER SAMPLE; // can refer to as either sample or cluster 00038 00039 typedef enum { 00040 spherical, elliptical, mixed, automatic 00041 } PROTOSTYLE; 00042 00043 typedef struct { // parameters to control 
clustering 00044 PROTOSTYLE ProtoStyle; // specifies types of protos to be made 00045 FLOAT32 MinSamples; // min # of samples per proto - % of total 00046 FLOAT32 MaxIllegal; // max percentage of samples in a cluster which have 00047 // more than 1 feature in that cluster 00048 FLOAT32 Independence; // desired independence between dimensions 00049 FLOAT64 Confidence; // desired confidence in prototypes created 00050 int MagicSamples; // Ideal number of samples in a cluster. 00051 } CLUSTERCONFIG; 00052 00053 typedef enum { 00054 normal, uniform, D_random 00055 } DISTRIBUTION; 00056 00057 typedef union { 00058 FLOAT32 Spherical; 00059 FLOAT32 *Elliptical; 00060 } FLOATUNION; 00061 00062 typedef struct { 00063 unsigned Significant:1; // TRUE if prototype is significant 00064 unsigned Merged:1; // Merged after clustering so do not output 00065 // but kept for display purposes. If it has no 00066 // samples then it was actually merged. 00067 // Otherwise it matched an already significant 00068 // cluster. 
00069 unsigned Style:2; // spherical, elliptical, or mixed 00070 unsigned NumSamples:28; // number of samples in the cluster 00071 CLUSTER *Cluster; // ptr to cluster which made prototype 00072 DISTRIBUTION *Distrib; // different distribution for each dimension 00073 FLOAT32 *Mean; // prototype mean 00074 FLOAT32 TotalMagnitude; // total magnitude over all dimensions 00075 FLOAT32 LogMagnitude; // log base e of TotalMagnitude 00076 FLOATUNION Variance; // prototype variance 00077 FLOATUNION Magnitude; // magnitude of density function 00078 FLOATUNION Weight; // weight of density function 00079 } PROTOTYPE; 00080 00081 typedef struct { 00082 inT16 SampleSize; // number of parameters per sample 00083 PARAM_DESC *ParamDesc; // description of each parameter 00084 inT32 NumberOfSamples; // total number of samples being clustered 00085 KDTREE *KDTree; // for optimal nearest neighbor searching 00086 CLUSTER *Root; // ptr to root cluster of cluster tree 00087 LIST ProtoList; // list of prototypes 00088 inT32 NumChar; // # of characters represented by samples 00089 LIST bucket_cache[3]; // cache of reusable histograms by distribution type 00090 } CLUSTERER; 00091 00092 typedef struct { 00093 inT32 NumSamples; // number of samples in list 00094 inT32 MaxNumSamples; // maximum size of list 00095 SAMPLE *Sample[1]; // array of ptrs to sample data structures 00096 } SAMPLELIST; 00097 00098 // low level cluster tree analysis routines. 
00099 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C)))) 00100 00101 /*-------------------------------------------------------------------------- 00102 Public Function Prototypes 00103 --------------------------------------------------------------------------*/ 00104 CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]); 00105 00106 SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID); 00107 00108 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 00109 00110 void FreeClusterer(CLUSTERER *Clusterer); 00111 00112 void FreeProtoList(LIST *ProtoList); 00113 00114 void FreePrototype(void *arg); // PROTOTYPE *Prototype); 00115 00116 CLUSTER *NextSample(LIST *SearchState); 00117 00118 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension); 00119 00120 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension); 00121 00122 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, 00123 FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]); 00124 00125 //--------------Global Data Definitions and Declarations--------------------------- 00126 // define errors that can be trapped 00127 #define ALREADYCLUSTERED 4000 00128 #endif