别看层次聚类算法原理简单,实现起来在数据结构方面还需要思考一番,并不是那么轻而易举就能确定数据结构,实现过的人应该深有体会。
Python代码为本人自己实现,执行结果与数学手算结果完全一致,可以放心阅读代码。
完全人肉出品,代码详见如下:
1.层次聚类算法的计算过程原理
详见以前文章:https://blog.csdn.net/u012421852/article/details/80531184
2.Code
# -*- coding: utf-8 -*-
"""
@author: 蔚蓝的天空Tom
Talk is cheap, show me the code
Aim: 实现层次聚类算法Hierarchical Clustering Alg
"""
import numpy as np
class CHC(object):
    '''
    Hierarchical Clustering Algorithm (HCA) with average linkage.

    Given labelled 1-D samples of the form ``[[label], [value]]`` (see the
    ``__main__`` demo), repeatedly merges the two closest clusters until only
    two clusters remain.  The full merge history is kept in ``self.hc``: one
    partition (list of index-lists into ``samples``) per round.

    Fixes over the original:
    * ``np.shape`` was called on ragged nested lists (``self.hc`` holds
      generations of different lengths after the first merge), which raises
      ``ValueError`` on NumPy >= 1.24 — replaced with ``len``.
    * ``_work`` recursed once per merge round (RecursionError risk for large
      inputs) — rewritten as an equivalent loop, output unchanged.
    * ``em`` no longer builds a ``frompyfunc`` per call.
    * Inputs with fewer than three samples stop immediately instead of
      looping on an all-``inf`` distance matrix.
    '''
    def __init__(self, samples):
        # samples: list of [[label], [value]] records; only the trailing
        # value column is used for distance computation.
        self.samples = samples
        self.hc = []   # merge history: one cluster partition per round
        self.cnt = 0   # round counter, used only in the progress banner
        self._work()

    def indmin_matrix(self, M):
        '''Return the (row, col) coordinates of the smallest element of M.'''
        # np.argmin flattens M; divmod recovers 2-D coordinates.
        row, col = divmod(np.argmin(M), np.shape(M)[1])
        return row, col

    def em(self, A, B):
        '''
        Euclidean distance between vectors A and B.

        Elements may be numeric strings (the samples array is coerced to
        strings by NumPy), hence the float() conversions.
        e.g. A=[1,1], B=[4,5] -> 5.0
        '''
        total = 0.0
        for a, b in zip(A, B):
            d = float(a) - float(b)
            total += d * d
        return np.sqrt(total)

    def AverageLinkage(self, A, B):
        '''
        Average-linkage distance between two clusters of points:
        the mean of the pairwise Euclidean distances.

        e.g. a=[1,1], b=[1,1], c=[4,5], d=[6,7], A=[a,b], B=[c,d]
        then AverageLinkage(A,B) = (em(a,c)+em(a,d)+em(b,c)+em(b,d)) / 4
        '''
        total = 0.0
        for a in A:
            for b in B:
                total += self.em(a, b)
        return total / (len(A) * len(B))

    def _work(self):
        '''Run merge rounds until two clusters remain; returns self.hc.'''
        while True:
            self.cnt += 1
            print('\n\n=====================%d times Hierarical Clustring=================='%self.cnt)
            # First round: every sample starts as its own singleton cluster.
            if len(self.hc) == 0:
                initData = [[i] for i in range(len(self.samples))]
                self.hc = [initData]
                print('init self.hc:', self.hc)
            preHC = self.hc[-1]
            n = len(preHC)
            print('preHC:', preHC)
            # Stop once (at most) two clusters remain.
            if n <= 2:
                print('succeed hierarical clustring:\n',)
                for generation in self.hc:
                    print(generation)
                return self.hc
            # Pairwise cluster distances; only the upper triangle is filled,
            # everything else stays +inf so argmin never picks it.
            dist = np.full(shape=(n, n), fill_value=np.inf)
            value = np.array(self.samples)[:, -1]
            for i in range(n):
                for j in range(i + 1, n):
                    dist[i, j] = self.AverageLinkage(value[preHC[i]], value[preHC[j]])
            print('dist:\n', dist)
            # Merge the two closest clusters; row < col is guaranteed because
            # only the upper triangle holds finite values.
            row, col = self.indmin_matrix(dist)
            merged = preHC[row] + preHC[col]
            newHC = []
            for i in range(n):
                if i == row or i == col:
                    # Emit the merged cluster once, at the first of the two slots.
                    if i == row:
                        newHC.append(merged)
                    continue
                newHC.append(preHC[i])
            self.hc.append(newHC)
            for generation in self.hc:
                print(generation)
if __name__=='__main__':
    # Demo: seven labelled 1-D samples, clustered until two groups remain.
    demo_samples = [
        [['A'], [16.9]],
        [['B'], [38.5]],
        [['C'], [39.5]],
        [['D'], [80.8]],
        [['E'], [82]],
        [['F'], [34.6]],
        [['G'], [116.1]],
    ]
    hc = CHC(demo_samples)
3.运行结果
runfile('C:/Users/tom/hierarchical_clustering.py', wdir='C:/Users/tom')
=====================1 times Hierarical Clustring==================
init self.hc: [[[0], [1], [2], [3], [4], [5], [6]]]
preHC: [[0], [1], [2], [3], [4], [5], [6]]
dist:
[[ inf 21.6 22.6 63.9 65.1 17.7 99.2]
[ inf inf 1. 42.3 43.5 3.9 77.6]
[ inf inf inf 41.3 42.5 4.9 76.6]
[ inf inf inf inf 1.2 46.2 35.3]
[ inf inf inf inf inf 47.4 34.1]
[ inf inf inf inf inf inf 81.5]
[ inf inf inf inf inf inf inf]]
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
=====================2 times Hierarical Clustring==================
preHC: [[0], [1, 2], [3], [4], [5], [6]]
dist:
[[ inf 22.1 63.9 65.1 17.7 99.2]
[ inf inf 41.8 43. 4.4 77.1]
[ inf inf inf 1.2 46.2 35.3]
[ inf inf inf inf 47.4 34.1]
[ inf inf inf inf inf 81.5]
[ inf inf inf inf inf inf]]
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
[[0], [1, 2], [3, 4], [5], [6]]
=====================3 times Hierarical Clustring==================
preHC: [[0], [1, 2], [3, 4], [5], [6]]
dist:
[[ inf 22.1 64.5 17.7 99.2]
[ inf inf 42.4 4.4 77.1]
[ inf inf inf 46.8 34.7]
[ inf inf inf inf 81.5]
[ inf inf inf inf inf]]
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
[[0], [1, 2], [3, 4], [5], [6]]
[[0], [1, 2, 5], [3, 4], [6]]
=====================4 times Hierarical Clustring==================
preHC: [[0], [1, 2, 5], [3, 4], [6]]
dist:
[[ inf 20.63333333 64.5 99.2 ]
[ inf inf 43.86666667 78.56666667]
[ inf inf inf 34.7 ]
[ inf inf inf inf]]
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
[[0], [1, 2], [3, 4], [5], [6]]
[[0], [1, 2, 5], [3, 4], [6]]
[[0, 1, 2, 5], [3, 4], [6]]
=====================5 times Hierarical Clustring==================
preHC: [[0, 1, 2, 5], [3, 4], [6]]
dist:
[[ inf 49.025 83.725]
[ inf inf 34.7 ]
[ inf inf inf]]
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
[[0], [1, 2], [3, 4], [5], [6]]
[[0], [1, 2, 5], [3, 4], [6]]
[[0, 1, 2, 5], [3, 4], [6]]
[[0, 1, 2, 5], [3, 4, 6]]
=====================6 times Hierarical Clustring==================
preHC: [[0, 1, 2, 5], [3, 4, 6]]
succeed hierarical clustring:
[[0], [1], [2], [3], [4], [5], [6]]
[[0], [1, 2], [3], [4], [5], [6]]
[[0], [1, 2], [3, 4], [5], [6]]
[[0], [1, 2, 5], [3, 4], [6]]
[[0, 1, 2, 5], [3, 4], [6]]
[[0, 1, 2, 5], [3, 4, 6]]
(end)