/***************************************************************************************** * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. * Permission is hereby granted, free of charge, to any person obtaining a copy of * this software and associated documentation files (the "Software"), to deal in * the Software without restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the * Software, and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *****************************************************************************************/ /************************************************************************ * FILE DESCR: Definitions of Agglomerative Hierarchical Clustering module * * CONTENTS: * cluster * getProximityMatrix * setOutputConfig * setHyperlinkMap * getClusterResult * computeProximityMatrix * computeDistances * clusterToFindNumClusters * getInterObjectDistance * findGroup * findInterClusterDistance * writeClustersAsHTML * determineNumOfClusters * determineKnee * findRMSE * computeAvgSil * * * AUTHOR: Bharath A * * DATE: February 22, 2005 * CHANGE HISTORY: * Author Date Description of change ************************************************************************/ #ifndef __LTKHIERARCHICALCLUSTERING_H #define __LTKHIERARCHICALCLUSTERING_H #ifndef _WIN32 //#include #endif #include "LTKInc.h" #include "LTKTypes.h" #include "LTKLoggerUtil.h" #include "LTKException.h" #include "LTKErrors.h" /*Enumerator for stopping criterion to be used*/ enum ELTKHCStoppingCriterion { LMETHOD, AVG_SIL }; /*Enumerator for methods in hierarchical clustering*/ enum ELTKHCMethod { SINGLE_LINKAGE, COMPLETE_LINKAGE, AVERAGE_LINKAGE }; #define OUTPUT_HTML_FILE_NAME "output.html" #define MIN_CUTOFF 20 /** * @class LTKHierarchicalClustering *

This class does agglomerative hierarchical clustering. The data objects which could be LTKTrace or LTKTraceGroup, are supplied as a vector. Function that defines the distance between two data objects needs to be supplied as a function pointer.One of the 3 methods (Single,Average or Complete linkage) needs to be selected to define the way inter-cluster has to be determined. In case number of clusters is not supplied, it is determined using the L-method (default stopping criterion)

*/ template class LTKHierarchicalClustering { private: //reference to the vector containing the data objects to be clustered const vector& m_data; //triangular matrix containing the pairwise distances between data //objects float2DVector m_proximityMatrix; //data structure that stores current (intermediate) state of the //clusters int2DVector m_intermediateCG; //vector mapping the data object id and path to the data (unipen) file stringVector m_hyperlinksVec; //contains the number of clusters required int m_numOfClusters; //output file handle to write the cluster results as html //with name of the file as OUTPUT_HTML_FILE_NAME ofstream m_output; //flag to indicate whether the output is required as html bool m_writeHTML; //flag to indicate whether to show all levels of the hierarchy in the //html file bool m_showAllLevels; //vector to hold merging distance for each number of clusters floatVector m_mergingDist; //flag for determining number of clusters bool m_determineClusters; //output result directory string m_outputDir; //extension of the image corresponding to each data object in //order to write in the html string m_imageFileExtn; //Method for defining inter-cluster distance ELTKHCMethod m_method; //number of clusters determined by Average Silhouette method int m_numBySil; //cached clustering result corresponding minimum average silhouette int2DVector m_cachedResult; //stopping criterion selected - LMethod or Average Silhouette ELTKHCStoppingCriterion m_stoppingCriterion; //pointer to the class that has the definition of distance function DistanceClass* m_distClassPtr; //function pointer type of the function that defines inter-object distance typedef int (DistanceClass::*FN_PTR_DISTANCE)(const ClusterObjType&, const ClusterObjType&, float&) ; //distance function pointer FN_PTR_DISTANCE m_distancePtr; public: /********************************************************************************** * AUTHOR : Bharath A * DATE : 22-FEB-2005 * NAME : LTKHierarchicalClustering * DESCRIPTION : Constructor to initialise the required parameters * ARGUMENTS : clusterObjects - vector of data objects which could be LKTrace or * LTKTraceGroup for HWR * noOfClusters - Number of clusters required * clusteringMethod - One of the 3 methods: * SINGLE_LINKAGE,AVERAGE_LINKAGE * or COMPLETE_LINKAGE * * RETURNS : * NOTES : * CHANGE HISTROY * Author Date Description of change *************************************************************************************/ LTKHierarchicalClustering(const vector& clusterObjects,int noOfClusters, ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE) : m_data(clusterObjects),m_method(clusteringMethod), m_numOfClusters(noOfClusters),m_writeHTML(false), m_showAllLevels(false), m_determineClusters(false) { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::LTKHierarchicalClustering" <<"(vector,int,ELTKHCMethod)"<=clusterObjects.size()) { LOG(LTKLogger::LTK_LOGLEVEL_ERR) <<"Number of clusters:"<,int,ELTKHCMethod)"<,int,ELTKHCMethod)"<& clusterObjects, ELTKHCMethod clusteringMethod=AVERAGE_LINKAGE, ELTKHCStoppingCriterion stoppingCriterion=LMETHOD) : m_data(clusterObjects), m_method(clusteringMethod), m_stoppingCriterion(stoppingCriterion), m_writeHTML(false), m_showAllLevels(false), m_determineClusters(true) { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::LTKHierarchicalClustering" <<"(vector,ELTKHCMethod,ELTKHCStoppingCriterion)"<,ELTKHCMethod,ELTKHCStoppingCriterion)"<,ELTKHCMethod,ELTKHCStoppingCriterion)"<,ELTKHCMethod,ELTKHCStoppingCriterion)"< tag in the output html is not created. * RETURNS : error code * NOTES : * CHANGE HISTROY * Author Date Description of change *************************************************************************************/ //int setOutputConfig(const string& outputDirectory, // bool displayAllLevels=false, // string imageFileExtension="") int setOutputConfig(const string& outputDirectory,bool displayAllLevels=false, const string& imageFileExtension="") { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::setOutputConfig()"< tag in the output is not generated m_imageFileExtn=imageFileExtension; LOG(LTKLogger::LTK_LOGLEVEL_DEBUG) <<"Image file extension:"<& hyperlinksVector) int setHyperlinkMap(const vector& hyperlinksVector) { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::setHyperlinkMap()"< >& outClusterResult) const { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::getClusterResult()"< eachRow((m_data.size()-i)-1);//added -1 at the end int c=0; for(int j=i+1;j*m_distancePtr)(m_data[i],m_data[j], eachRow[c]); if (errorCode != SUCCESS ) { LOG(LTKLogger::LTK_LOGLEVEL_ERR) <<"Error while calling distance function"<0) { m_intermediateCG=m_cachedResult; return SUCCESS; } } for(int i=0;i v; v.push_back(i); //To begin with, each data object is cluster by itself m_intermediateCG.push_back(v); } if(m_writeHTML) //If output is needed as html { string outputFilePath=m_outputDir+"/"+OUTPUT_HTML_FILE_NAME; m_output.open(outputFilePath.c_str()); //Cluster output file is created if(m_output.fail()) { LOG(LTKLogger::LTK_LOGLEVEL_ERR) <<"Unable to create file:" <\n"; m_output<<"\n"; m_output<<"\n"; m_output<<"\n"; for(int v=0;v"; for(int w=0;w0) { m_output<<""< "; } else { m_output<      "; } } } m_output<<""; m_output<<"\n"; } if(m_numOfClusters toCluster; //to find the clusters that need to be merged float interClusterDistance=findGroup(toCluster); currNumOfClusters=m_data.size()-it-1; if(m_stoppingCriterion==AVG_SIL) { float silDiff=computeAvgSil(toCluster[0],toCluster[1]); if(silDiff 2) { m_numBySil=currNumOfClusters+1; m_cachedResult=m_intermediateCG; } } } else if(m_stoppingCriterion==LMETHOD && m_determineClusters==true) { m_mergingDist[currNumOfClusters]=interClusterDistance; } //clusters are merged m_intermediateCG[toCluster[0]].insert(m_intermediateCG[toCluster[0]].end(), m_intermediateCG[toCluster[1]].begin(), m_intermediateCG[toCluster[1]].end()); //old cluster deleted m_intermediateCG.erase(m_intermediateCG.begin()+ toCluster[1]); if(m_writeHTML) { if(!m_showAllLevels) { if(currNumOfClusters==m_numOfClusters) { writeClustersAsHTML(interClusterDistance); } } else { writeClustersAsHTML(interClusterDistance); } } } } //closing the html tags if(m_writeHTML) { m_output<<"
"; m_output<<"Inter-cluster Dist"; m_output<<"
\n"; m_output<<"\n"; m_output<<""; m_output.close(); } LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: " <<"LTKHierarchicalClustering::clusterToFindNumClusters()"< pairToCombine) float findGroup(vector& pairToCombine) const { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::findGroup()"<& v1,const vector& v2) float findInterClusterDistance(const vector& cluster1, const vector& cluster2) const { LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Entering: " <<"LTKHierarchicalClustering::findInterClusterDistance()"< groupDistance) { groupDistance=temp; } } } } LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: " <<"LTKHierarchicalClustering::findInterClusterDistance()"<\n"; for(int v=0;v"; m_output<<"("<"; for(int w=0;w0) { m_output<<""< "; } else { m_output<      "; } } } m_output<<""; m_output<<"("<"<"; m_output<<""; m_output<<"\n"; LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: " <<"LTKHierarchicalClustering::writeClustersAsHTML()"<= lastKnee) { break; } } int temp=currentKnee*2; if(temp>cutOff) { --cutOff; trueCutOff=false; } else { cutOff=temp; trueCutOff=true; } if(cutOff < MIN_CUTOFF) { break; } } LOG(LTKLogger::LTK_LOGLEVEL_DEBUG) <<"Number of clusters determined by iterative refinement:" < EPS) { beta1Right = numer / denom; } else { beta1Right = 0; } beta0Right = avgYRight-(beta1Right*avgXRight); float errorSOS=0; for(i=2;i<=candidateKnee;i++) { float yCap = (beta0Left+(beta1Left*i)); errorSOS += ((m_mergingDist[i]-yCap)*(m_mergingDist[i]-yCap)); } outLRMSE=sqrt(errorSOS/(candidateKnee-2)); errorSOS=0; for(j=candidateKnee+1;j<=maxNumClusters;j++) { float yCap = beta0Right + (beta1Right*j); errorSOS += (m_mergingDist[j]-yCap)*(m_mergingDist[j]-yCap); } outRRMSE=sqrt(errorSOS/(maxNumClusters-candidateKnee-1)); LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: " <<"LTKHierarchicalClustering::findRMSE()"<& clust1=m_intermediateCG[clust1Index]; const vector& clust2=m_intermediateCG[clust2Index]; vector combinedClust; combinedClust.insert(combinedClust.end(),clust1.begin(),clust1.end()); combinedClust.insert(combinedClust.end(),clust2.begin(),clust2.end()); float clust1TotalSils=0.0f,clust2TotalSils=0.0f,combinedClustTotalSils=0.0f; for(int i=0;i1) { for(int j=0;j avgIntraDist && minInterDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/minInterDist; } else if(avgIntraDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist; } clust1TotalSils+=dataObjSil; } for(int ii=0;ii1) { for(int j=0;j avgIntraDist && minInterDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/minInterDist; } else if(avgIntraDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist; } clust2TotalSils+=dataObjSil; } for(int iii=0;iii1) { for(int j=0;javgIntraDist && minInterDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/minInterDist; } else if(avgIntraDist > EPS) { dataObjSil=(minInterDist-avgIntraDist)/avgIntraDist; } combinedClustTotalSils+=dataObjSil; } LOG(LTKLogger::LTK_LOGLEVEL_DEBUG)<<"Exiting: " <<"LTKHierarchicalClustering::computeAvgSil()"<