本文是k均值聚类算法源代码
语言:Python;数据集:西瓜数据4.0.xlsx
使用的库:
import numpy as np
import math
import xlrd
import random
import matplotlib.pyplot as plt
从xlsx中导入数据:
def loadData(filename):
data = xlrd.open_workbook(filename)
table = data.sheets()[0]
print "table: ", table
nrows = table.nrows
dataset = []
for i in xrange(nrows):
dataset.append(table.row_values(i))
return dataset
计算欧式距离:
def lengthcalc(inX,inY): #inX,inY 要求同为行向量
subdu = inX - inY
subdu.shape = (1,subdu.shape[0])#一维数组转置必须指定大小
return pow(np.dot(subdu,subdu.T),0.5)[0][0] #从类似于array([[ 0.09625487]])中取出值
选择最小距离,返回距离最小距离的蔟索引:
def minlength(inX,cluster):
cluster = np.array(cluster)
inX = np.array(inX)
clm = cluster.shape[0]
minindex = np.inf
minlen = np.inf
for i in xrange(clm):
currlen = lengthcalc(inX,cluster[i]) #还需要进一步完善,计算之前做数据处理,完成归一化
if currlen < minlen:
minindex = i
minlen = currlen
return minindex
k均值聚类实现:
def kMeans(data,k): #k为要分的蔟数
dataset = np.array(data)
m,n = dataset.shape
cluster = random.sample(dataset,k)
#cluster = np.array(cluster)
dic = {}
dicbak = {}
for time in xrange(500):
dic = {}
#print cluster
for data in dataset:
#print "666"
minindex = minlength(data,cluster)
if minindex not in dic.keys():
dic[minindex] = []
dic[minindex].append((np.mat(data)).tolist()[0]) #array先转化为mat,mat用tolist函数转化为list(注意一维数组的特殊情况)
for index in range(k):
#print dic[index]
cluster[index] = np.array(dic[index]).mean(axis=0) #axis=0:按列求和
#print dic
if len(dic)!=0 and dic == dicbak:
break
dicbak = dic.copy()
return cluster,dic #返回蔟和每个蔟中的样本
聚类完成后,通过散点图看聚类结果:
def figplot(dic):
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
col = ['r','b','g','k','y','m'] #个数要大于蔟的个数
for key in dic.keys():
#ax.plot(np.array(dic[key])[:,0],np.array(dic[key])[:,1],linestyle='o--',color=col[key])
ax.scatter(np.array(dic[key])[:,0],np.array(dic[key])[:,1],color=col[key])
fig.show()
为了实现自动选择蔟个数,用DBI作为衡量标准,关于DBI请参考周志华《机器学习》和文章http://blog.sina.com.cn/s/blog_65c8baf901016flh.html
def DBIcalc(cul,dic):
dicset = np.array(dic)
k = len(dic)
kDBI = 0.0
for i in range(k):
maxsim = 0
kj = range(k)
del kj[i]
for j in kj:
sim = similar(dic[i],dic[j],cul[i],cul[j])
#print sim
if sim > maxsim:
maxsim = sim
#print 'maxsim',maxsim
kDBI += maxsim
#print 'kDBI',kDBI
return 1.0/k*kDBI
def avgC(data): #输入为一个类的数据集
dataset = np.array(data)
k = dataset.shape[0]
sumlength = 0.0
for i in range(k):
kj = range(k)
del kj[i]
for j in kj:
sumlength += lengthcalc(dataset[i],dataset[j])
return (2.0/(k*(k-1)))*sumlength
def similar(dataX,dataY,inX,inY): #输入为两个分类中的数据集,inX,inY为两个类的中心
return float(avgC(dataX)+avgC(dataY))/lengthcalc(inX,inY)
选择最佳蔟的函数:
def bestCluster(data):
bestdbi = np.inf
for k in range(2,5):
cul,dic = kMeans(data,k)
currdbi = DBIcalc(cul,dic)
if currdbi < bestdbi:
bestdbi = currdbi
return k