Model prototype
class sklearn.cluster.AgglomerativeClustering(n_clusters=2, affinity='euclidean', memory=Memory(cachedir=None), connectivity=None, n_components=None, compute_full_tree='auto', linkage='ward', pooling_func=<function mean>)
Parameters
- n_clusters: the number of clusters to find
- affinity: the metric used to compute distances; one of 'euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed' (linkage='ward' only accepts 'euclidean')
- memory: used to cache the output of the tree computation; no caching by default
- connectivity: a connectivity matrix that specifies, for each sample, which other samples it may be connected to
- n_components
- compute_full_tree: training normally stops once n_clusters clusters have been formed; if compute_full_tree=True, the full hierarchical tree is built instead
- linkage: the linkage algorithm (a construction sketch follows this list)
  - 'ward': Ward linkage; merges the pair of clusters whose merger gives the smallest increase in total within-cluster variance
  - 'complete': complete linkage; uses d_max, the largest distance between samples of the two clusters
  - 'average': average linkage; uses d_avg, the average distance between samples of the two clusters
- pooling_func: takes an array of feature values as input and reduces it to a single value
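A minimal construction sketch for the parameters above (the values are illustrative only, not recommendations; newer scikit-learn versions rename affinity to metric and drop pooling_func). It builds a k-nearest-neighbors connectivity matrix with sklearn.neighbors.kneighbors_graph and passes it to the estimator:

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.neighbors import kneighbors_graph

X,_=make_blobs(n_samples=200,centers=3,random_state=0)
conn=kneighbors_graph(X,n_neighbors=10,include_self=False)   # each sample may only merge along its 10 nearest neighbors
clst=AgglomerativeClustering(n_clusters=3,affinity='manhattan',linkage='average',
                             connectivity=conn,compute_full_tree=True)
labels=clst.fit_predict(X)

Because linkage='ward' only accepts euclidean distances, the manhattan affinity here is paired with average linkage.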
Attributes
- labels_: the cluster label of each sample
- n_leaves_: the number of leaf nodes in the hierarchical tree
- n_components_: the estimated number of connected components in the connectivity graph
- children_: the two child nodes of each non-leaf node
Methods
- fit(X[,y]): trains the model
- fit_predict(X[,y]): trains the model and returns the cluster label of each sample (a short example follows)
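A minimal fit sketch illustrating the attributes and methods above (the toy array is made up for illustration):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

X=np.array([[1.,1.],[1.,1.1],[5.,5.],[5.,5.1],[9.,0.]])   # toy data: two tight pairs plus an outlier
clst=AgglomerativeClustering(n_clusters=3)
clst.fit(X)                 # fit_predict(X) would return the labels directly
print(clst.labels_)         # cluster label of each of the 5 samples
print(clst.n_leaves_)       # 5 leaves: one per sample
print(clst.children_)       # shape (4,2): the two nodes merged at each agglomeration step

Row i of children_ records the merge that created internal node n_leaves_+i, so children_ together with n_leaves_ encodes the whole merge tree.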
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score
from sklearn import mixture
Generate the data
def create_data(centers,num=100,std=0.7):
    X,labels_true=make_blobs(n_samples=num,centers=centers,cluster_std=std)
    return X,labels_true
Plot the generated sample points
def plot_data(*data):
    X,labels_true=data
    labels=np.unique(labels_true)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    colors='rgbyckm'
    for i,label in enumerate(labels):
        position=labels_true==label
        ax.scatter(X[position,0],X[position,1],label='cluster %d'%label,color=colors[i%len(colors)])
    ax.legend(loc='best',framealpha=0.5)
    ax.set_xlabel('X[0]')
    ax.set_ylabel('X[1]')
    ax.set_title('data')
    plt.show()
X,labels_true=create_data([[1,1],[2,2],[1,2],[10,20]],1000,0.5)
plot_data(X,labels_true)
Using AgglomerativeClustering
def test_AgglomerativeClustering(*data):
    X,labels_true=data
    clst=cluster.AgglomerativeClustering()
    predicted_labels=clst.fit_predict(X)
    print('ARI:%s'%adjusted_rand_score(labels_true,predicted_labels))
centers=[[1,1],[2,2],[1,2],[10,20]]
X,labels_true=create_data(centers,1000,0.5)
test_AgglomerativeClustering(X,labels_true)
Effect of the number of clusters
def test_AgglomerativeClustering_nclusters(*data):
    X,labels_true=data
    nums=range(1,50)
    ARIs=[]
    for num in nums:
        clst=cluster.AgglomerativeClustering(n_clusters=num)
        predicted_labels=clst.fit_predict(X)
        ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    ax.plot(nums,ARIs,marker='+')
    ax.set_xlabel('n_clusters')
    ax.set_ylabel('ARI')
    fig.suptitle("AgglomerativeClustering")
    plt.show()
test_AgglomerativeClustering_nclusters(X,labels_true)
Effect of the linkage method
def test_AgglomerativeClustering_linkage(*data):
    X,labels_true=data
    nums=range(1,50)
    fig=plt.figure()
    ax=fig.add_subplot(1,1,1)
    linkages=['ward','complete','average']
    markers='+o*'
    for i,linkage in enumerate(linkages):
        ARIs=[]
        for num in nums:
            clst=cluster.AgglomerativeClustering(n_clusters=num,linkage=linkage)
            predicted_labels=clst.fit_predict(X)
            ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
        ax.plot(nums,ARIs,marker=markers[i],label='linkage:%s'%linkage)
    ax.set_xlabel('n_clusters')
    ax.set_ylabel('ARI')
    ax.legend(loc='best')
    fig.suptitle('AgglomerativeClustering')
    plt.show()
test_AgglomerativeClustering_linkage(X,labels_true)