本文给出层次聚类算法(按簇质心间的欧氏距离逐步合并的凝聚式聚类)的 Python 实现,并用《数据挖掘导论》中的示例数据运行,代码如下:
from numpy import *
from math import *
from operator import *
def dist(a, b):
    """Return the Euclidean distance between two points.

    Each argument must represent a single point: a 1 x d ``matrix``, a 1-D
    or 1 x d ``ndarray``, or a flat sequence of numbers.  Both arguments must
    have the same shape.

    The previous ``(a - b) * (a - b).T`` form was only correct when ``*``
    meant matrix multiplication (``numpy.matrix``); for 2-D ndarrays it
    broadcast to an outer product and returned a wrong value.
    """
    # Flatten to 1-D so the result does not depend on whether the caller
    # passed a matrix, a row-vector array, or a plain list.
    diff = (asarray(a, dtype=float) - asarray(b, dtype=float)).ravel()
    return sqrt(dot(diff, diff))
def centroid(a):
    """Return the mean of ``a`` taken along axis 0.

    ``a`` is expected to be a matrix (or array) with one point per row, so
    the result is the per-column average, i.e. the centroid of the rows.
    """
    column_means = a.mean(axis=0)
    return column_means
def resolveList(List):
    """Flatten arbitrarily nested groups of points into one flat list.

    An element whose first item is not a ``list`` is treated as a point and
    kept as is; any other element is treated as a nested group and flattened
    recursively.  Example: ``[[1, 2], [[1, 2], [1, 2]]]`` becomes
    ``[[1, 2], [1, 2], [1, 2]]``.
    """
    flat = []
    for item in List:
        if type(item[0]) is list:
            # Nested group: splice its points in, preserving their order.
            flat.extend(resolveList(item))
        else:
            flat.append(item)
    return flat
def cluster(data, num):
    """Agglomerative clustering that merges the two closest centroids.

    Parameters:
        data: list of points, each point a list of coordinates.  The list is
            modified IN PLACE: on every step the two closest entries are
            removed and a flat list of their points is appended as one
            cluster.
        num: number of clusters wanted.  Merging stops as soon as ``data``
            has at most ``num`` entries, or when only one entry is left
            (so ``num < 1`` no longer ends in an IndexError).

    Returns:
        The same list object as ``data``, after the merges.

    The state of the list is printed after every merge.
    """
    dataList = data  # alias: merges are applied to the caller's list
    while len(dataList) > num and len(dataList) > 1:
        # One centroid per entry.  asmatrix turns a lone point [x, y] into a
        # 1 x d matrix and a cluster [[x, y], ...] into a k x d matrix, so
        # mean(0) yields a 1 x d centroid in both cases.
        centroids = [asmatrix(entry).mean(0) for entry in dataList]

        # Closest pair (m < n); the strict '<' keeps the first pair found
        # when several pairs are equally close.
        closest = None
        smallest = None
        for i in range(len(centroids) - 1):
            for j in range(i + 1, len(centroids)):
                d = dist(centroids[i], centroids[j])
                if smallest is None or d < smallest:
                    closest, smallest = (i, j), d
        m, n = closest

        merged = resolveList([dataList[m], dataList[n]])
        # Delete the higher index first so that index m stays valid.
        del dataList[n]
        del dataList[m]
        dataList.append(merged)
        print(dataList)  # show the clustering after every merge
    return dataList
# Six sample 2-D points (per the accompanying text, taken from
# "Introduction to Data Mining"), one [x, y] pair per row.
data = [
    [0.4005, 0.5306],
    [0.2148, 0.3854],
    [0.3457, 0.3156],
    [0.2652, 0.1875],
    [0.0789, 0.4139],
    [0.4548, 0.3022],
]
# Merge down to two clusters; cluster() prints the list after each merge.
cluster(data, 2)
结果如下:
[[0.4005, 0.5306], [0.2148, 0.3854], [0.2652, 0.1875], [0.0789, 0.4139], [[0.3457, 0.3156], [0.4548, 0.3022]]]
[[0.4005, 0.5306], [0.2652, 0.1875], [[0.3457, 0.3156], [0.4548, 0.3022]], [[0.2148, 0.3854], [0.0789, 0.4139]]]
[[0.4005, 0.5306], [[0.2148, 0.3854], [0.0789, 0.4139]], [[0.2652, 0.1875], [0.3457, 0.3156], [0.4548, 0.3022]]]
[[0.4005, 0.5306], [[0.2148, 0.3854], [0.0789, 0.4139], [0.2652, 0.1875], [0.3457, 0.3156], [0.4548, 0.3022]]]
程序可能写得比较粗糙,如有错误,欢迎指正~