00001 /****************************************************************************** 00002 ** Filename: cluster.h 00003 ** Purpose: Definition of feature space clustering routines 00004 ** Author: Dan Johnson 00005 ** History: 5/29/89, DSJ, Created. 00006 ** 00007 ** (c) Copyright Hewlett-Packard Company, 1988. 00008 ** Licensed under the Apache License, Version 2.0 (the "License"); 00009 ** you may not use this file except in compliance with the License. 00010 ** You may obtain a copy of the License at 00011 ** http://www.apache.org/licenses/LICENSE-2.0 00012 ** Unless required by applicable law or agreed to in writing, software 00013 ** distributed under the License is distributed on an "AS IS" BASIS, 00014 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 ** See the License for the specific language governing permissions and 00016 ** limitations under the License. 00017 ******************************************************************************/ 00018 #ifndef CLUSTER_H 00019 #define CLUSTER_H 00020 00021 #include "kdtree.h" 00022 #include "oldlist.h" 00023 00024 /*---------------------------------------------------------------------- 00025 Types 00026 ----------------------------------------------------------------------*/ 00027 typedef struct sample 00028 { 00029 unsigned Clustered:1; // TRUE if included in a higher cluster 00030 unsigned Prototype:1; // TRUE if cluster represented by a proto 00031 unsigned SampleCount:30; // number of samples in this cluster 00032 struct sample *Left; // ptr to left sub-cluster 00033 struct sample *Right; // ptr to right sub-cluster 00034 inT32 CharID; // identifier of char sample came from 00035 FLOAT32 Mean[1]; // mean of cluster - SampleSize floats 00036 } 00037 00038 00039 CLUSTER; 00040 00041 typedef CLUSTER SAMPLE; // can refer to as either sample or cluster 00042 00043 typedef enum { 00044 spherical, elliptical, mixed, automatic 00045 } 00046 00047 00048 PROTOSTYLE; 00049 00050 typedef struct // parameters to control clustering 00051 { 00052 PROTOSTYLE ProtoStyle; // specifies types of protos to be made 00053 FLOAT32 MinSamples; // min # of samples per proto - % of total 00054 FLOAT32 MaxIllegal; // max percentage of samples in a cluster which have 00055 // more than 1 feature in that cluster 00056 FLOAT32 Independence; // desired independence between dimensions 00057 FLOAT64 Confidence; // desired confidence in prototypes created 00058 int MagicSamples; // Ideal number of samples in a cluster. 00059 } 00060 00061 00062 CLUSTERCONFIG; 00063 00064 typedef enum { 00065 normal, uniform, D_random 00066 } 00067 00068 00069 DISTRIBUTION; 00070 00071 typedef union 00072 { 00073 FLOAT32 Spherical; 00074 FLOAT32 *Elliptical; 00075 00076 } 00077 00078 00079 FLOATUNION; 00080 00081 typedef struct 00082 { 00083 unsigned Significant:1; // TRUE if prototype is significant 00084 unsigned Merged:1; // Merged after clustering so do not output 00085 // but kept for display purposes. If it has no 00086 // samples then it was actually merged. 00087 // Otherwise it matched an already significant 00088 // cluster. 00089 unsigned Style:2; // spherical, elliptical, or mixed 00090 unsigned NumSamples:28; // number of samples in the cluster 00091 CLUSTER *Cluster; // ptr to cluster which made prototype 00092 DISTRIBUTION *Distrib; // different distribution for each dimension 00093 FLOAT32 *Mean; // prototype mean 00094 FLOAT32 TotalMagnitude; // total magnitude over all dimensions 00095 FLOAT32 LogMagnitude; // log base e of TotalMagnitude 00096 FLOATUNION Variance; // prototype variance 00097 FLOATUNION Magnitude; // magnitude of density function 00098 FLOATUNION Weight; // weight of density function 00099 } 00100 00101 00102 PROTOTYPE; 00103 00104 typedef struct 00105 { 00106 inT16 SampleSize; // number of parameters per sample 00107 PARAM_DESC *ParamDesc; // description of each parameter 00108 inT32 NumberOfSamples; // total number of samples being clustered 00109 KDTREE *KDTree; // for optimal nearest neighbor searching 00110 CLUSTER *Root; // ptr to root cluster of cluster tree 00111 LIST ProtoList; // list of prototypes 00112 inT32 NumChar; // # of characters represented by samples 00113 } 00114 00115 00116 CLUSTERER; 00117 00118 typedef struct 00119 { 00120 inT32 NumSamples; // number of samples in list 00121 inT32 MaxNumSamples; // maximum size of list 00122 SAMPLE *Sample[1]; // array of ptrs to sample data structures 00123 } 00124 00125 00126 SAMPLELIST; 00127 00128 // low level cluster tree analysis routines. 00129 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL):(S=push(NIL,(C)))) 00130 00131 /*-------------------------------------------------------------------------- 00132 Public Function Prototypes 00133 --------------------------------------------------------------------------*/ 00134 CLUSTERER *MakeClusterer (inT16 SampleSize, PARAM_DESC ParamDesc[]); 00135 00136 SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], inT32 CharID); 00137 00138 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 00139 00140 void FreeClusterer(CLUSTERER *Clusterer); 00141 00142 void FreeProtoList(LIST *ProtoList); 00143 00144 void FreePrototype(void *arg); //PROTOTYPE *Prototype); 00145 00146 CLUSTER *NextSample(LIST *SearchState); 00147 00148 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension); 00149 00150 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension); 00151 00152 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, 00153 FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]); 00154 00155 //--------------Global Data Definitions and Declarations--------------------------- 00156 // define errors that can be trapped 00157 #define ALREADYCLUSTERED 4000 00158 #endif