根据《机器学习实战》一书第十章学习k均值聚类算法和二分k均值聚类算法,自己把代码边敲边理解了一下,修正了一些原书中代码的细微差错。目前代码有时会出现如下4种报错信息,这有待继续探究和完善。
报错信息:
Warning (from warnings module):
File "F:\Python2.7.6\lib\site-packages\numpy\core\_methods.py", line 55
warnings.warn("Mean of empty slice.", RuntimeWarning)
RuntimeWarning: Mean of empty slice.
Warning (from warnings module):
File "F:\Python2.7.6\lib\site-packages\numpy\core\_methods.py", line 65
ret, rcount, out=ret, casting='unsafe', subok=False)
RuntimeWarning: invalid value encountered in true_divide
Warning (from warnings module):
File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 18
return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
RuntimeWarning: invalid value encountered in power
Traceback (most recent call last):
File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 147, in <module>
centList,myNewAssments = biKmeans(newDataMat,i)
File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 62, in biKmeans
centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 33, in kMeans
centroids = createCent(dataSet, k)
File "E:\学习\学习\Python书籍\机器学习实战源代码\Ch10\kMeans.py", line 24, in randCent
minJ = min(dataSet[:,j])
ValueError: min() arg is an empty sequence
k均值聚类算法、二分k均值聚类算法的python代码如下:
# -*- coding: utf-8 -*-
'''
kMeans聚类算法
基本思想:
基于距离和预定义好的类簇数目k,
首先,随机选定k个初始类簇中心(不同的类簇中心会导致收敛速度和聚类结果有差别,有可能会陷入局部最优.)
其次,计算每个点到每个类簇中心的距离,并将其分配到最近的类簇中
第三,重新计算每个类簇的中心
第四,重复第二步和第三步直到类簇中心不再发生变化,聚类停止
二分k-均值聚类算法
基本思想
该算法首先将所有点作为一个簇,然后将该簇一分为二.之后选择其中一个簇继续进行划分,
选择哪一个簇进行划分取决于对"其划分是否可以最大程度降低SSE的值.上述基于SSE的划分过程不断重复,
直到得到用户指定的簇数目为止.
本代码对原书代码稍作微调,对有的小细节错误进行改正
Author: <jianzhang.zhang@foxmail.com>
Data : 2016-06-11
'''
import pickle,matplotlib
import matplotlib.pyplot as plt
from numpy import *
from pprint import pprint
def loadDataSet(filename):
'''
从文件中加载数据矩阵
param filename: 保存数据矩阵的文件名 str
return dataSet: 数据矩阵 [[],[],...] list(list)
'''
dataMat = []
with open(filename) as f:
l = f.readlines()
for line in l:
curLine = line.strip().split('\t')
fltLine = map(float,curLine)
dataMat.append(fltLine)
return dataMat
def distEclud(vecA,vecB):
'''
计算两个向量的欧氏距离
param vecA,vecB: 两个待计算距离的向量 numpy.ndarray
return : 两个向量的欧氏距离
'''
return sqrt(sum(power(vecA - vecB, 2)))
def randCent(dataSet,k):
'''
随机挑选k个初始簇中心
param dataSet: 数据矩阵
param k: 簇数
return : k个初始簇中心
'''
n = shape(dataSet)[1]
centroids = mat(zeros((k,n)))
for j in range(n):
try:
minJ = min(dataSet[:,j])
except:
print dataSet
maxJ = max(dataSet[:,j])
rangeJ = float(maxJ - minJ)
centroids[:,j] = minJ + rangeJ * random.rand(k,1)
return centroids
def kMeans(dataSet,k,distMeas = distEclud,createCent = randCent):
'''
K-均值聚类算法
param dataSet: 数据集 numpy.matrix
param k: 类别数 int
param distMeas: 计算距离的方法,默认为欧氏距离 function
param createCent: 产生初始簇中心的方式,默认是随机生成 function
return centroids: 聚类结果的类簇中心 numpy.matrix
return clusterAssment: 每条记录的聚类结果 numpy.matrix matrix([[clusterTag,欧氏距离平方],
[clusterTag,欧氏距离平方],...])
'''
m = shape(dataSet)[0]
clusterAssment = mat(zeros((m,2)))
centroids = createCent(dataSet,k)
clusterChanged = True
# 记录迭代次数
iteration = 1
while clusterChanged:
clusterChanged = False
for i in range(m):
minDist = inf; minIndex = -1
for j in range(k):
distJI = distMeas(centroids[j,:],dataSet[i,:])
if distJI < minDist:
minDist = distJI;minIndex = j
# 判断所属类别是否变化,进而判断是否停止类中心变换,只有每个点的类别均不再发生变化,才认为聚类可以终止
if clusterAssment[i,0] != minIndex:
clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
## print "Iter:",iteration
## iteration += 1
## print centroids
for cent in range(k):
'''mean(matrix,axis = 0)表示沿矩阵列方向进行均值计算
clusterAssment[:,0] == cent返回bool型矩阵
nonzero(clusterAssment[:,0] == cent)返回True值的坐标(array(x1,x2,..),array(y1,y2,...)),(第一维坐标,第二维坐标)
nonzero(clusterAssment[:,0] == cent)[0]返回第一维坐标array(x1,x2,..)
'''
ptsInClust = dataSet[nonzero(clusterAssment[:,0].A == cent)[0]]
centroids[cent,:] = mean(ptsInClust,axis = 0)
return centroids,clusterAssment
def biKmeans(dataSet,k,distMeas = distEclud):
'''
二分K-均值聚类算法,基本思路:
param dataSet: 数据集 numpy.matrix
param k: 类簇数目
param distMeas: 计算距离的方法,默认为欧氏距离 function
return mat(centList): 聚类结果的类簇中心 numpy.matrix
return clusterAssment: 每条记录的聚类结果 numpy.matrix matrix([[clusterTag,欧氏距离平方],
[clusterTag,欧氏距离平方],...])
'''
m = shape(dataSet)[0] # 记录数
clusterAssment = mat(zeros((m,2))) # 初始化聚类结果为零矩阵
centroid0 = mean(dataSet,axis = 0).tolist()[0] # 初始化第一个类簇中心,全部记录的中心,结果是一个列表
centList = [centroid0] # centList用来保存类簇中心,首先将初始化类簇中心加入其中,centList的长度亦即类簇数目
# 计算每一条记录到初始类簇中心的欧氏距离平方
for j in range(m):
clusterAssment[j,1] = distMeas(dataSet[j,:],mat(centroid0))**2
while (len(centList) < k): # 当聚类数目达到k时停止
lowestSSE = inf # 初始化误差平方和为正无穷
# 尝试划分已有的每一个簇,寻找使得SSE降幅最大的那个簇,然后对其进行2-Means聚类划分
for i in range(len(centList)):
# 取出属于类簇i的记录
ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
# 对类簇i进行2-Means聚类划分
centroidMat,splitClustAss = kMeans(ptsInCurrCluster,2,distMeas)
# 对类簇i进行2-Means聚类划分后求其SSE
sseSplit = sum(splitClustAss[:,1])
# 对除类簇i之外的其他类簇求SSE
sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
## print "sseSplit,sseNotSplit:",sseSplit,sseNotSplit
# 判断划分类簇i后的总SSE是否比lowestSSE小
if sseSplit+sseNotSplit < lowestSSE:
lowestSSE = sseSplit+sseNotSplit
bestCentToSplit = i
bestNewCents = centroidMat.copy()
bestClustAss = splitClustAss.copy()
bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList)
bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
## print "The bestCentToSplit is:",bestCentToSplit
## print "The length of bestClustAss is:",len(bestClustAss)
# 更新保存类簇中心的列表
centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
centList.append(bestNewCents[1,:].tolist()[0])
# 更新聚类结果
clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:] = bestClustAss
return mat(centList),clusterAssment
def distSLC(vecA,vecB):
'''
计算球面上两点间距离的公式
设所求点A ,纬度β1 ,经度α1 ;点B ,纬度β2 ,经度α2.则距离
S = R * arc cos[cosβ1*cosβ2*cos(α1-α2)+sinβ1*sinβ2]
param vecA: A点坐标
param vecB: B点坐标
return 球面距离
'''
a = sin(vecA[0,1]*pi/180) * sin(vecB[0,1]*pi/180)
b = cos(vecA[0,1]*pi/180) * cos(vecB[0,1]*pi/180) * cos(pi * (vecB[0,0] - vecA[0,0])/180)
return arccos(a+b)*6371.0
def clusterClubs(numClust = 5):
'''
对地图上的点按照球面距离聚类,并在地图上展示
param numClust: 默认类簇数量5 int
'''
datList = []
with open('places.txt') as f:
l = f.readlines()
for line in l:
lineArr = line.strip().split('\t')
datList.append([float(lineArr[4]),float(lineArr[3])])
datMat = mat(datList)
myCentroids,clustAssing = biKmeans(datMat,numClust,distMeas = distSLC)
fig = plt.figure()
rect = [0.1,0.1,0.8,0.8]
scatterMarkers = ['s','o','^','8','p',
'd','v','h','>','<',]
axprops = dict(xticks = [],yticks = [])
ax0 = fig.add_axes(rect,label = 'ax0',**axprops)
imgP = plt.imread('Portland.png')
ax0.imshow(imgP)
ax1 = fig.add_axes(rect,label = 'ax1',frameon = False)
for i in range(numClust):
ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A == i)[0],:]
markerStyle = scatterMarkers[i % len(scatterMarkers)]
ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0],\
ptsInCurrCluster[:,1].flatten().A[0],
marker = markerStyle,s = 90)
ax1.scatter(myCentroids[:,0].flatten().A[0],\
myCentroids[:,1].flatten().A[0],
marker = '+', s = 300)
plt.show()
def resuDisp(clusterAssing):
'''
输出显示聚类结果
param clusterAssing: 聚类结果矩阵
return resuDict: 聚类结果字典
'''
resuList = clusterAssing.tolist()
resuDict = {}
for i in range(len(resuList)):
label = resuList[i][0]
resuDict[label] = resuDict.get(label,[])
resuDict[label].append(i)
for key,value in resuDict.iteritems():
print key,':',value
return resuDict
def storeResu(resuDict,filename):
'''
存储聚类结果到文件
'''
with open(filename,'w') as f:
pickle.dump(resuDict,f)
def getResu(filename):
'''
从文件中读取聚类结果
'''
with open(filename) as f:
resuDict = pickle.load(f)
return resuDict
def plotCluster(datMat,clustAssing,myCentroids,numClust):
'''
画图显示聚类结果
param datmat: 数据集
param clustAssing: 聚类结果
param myCentroids: 类簇中心
param numClust: 类簇数目
'''
fig = plt.figure()
rect = [0.1,0.1,0.8,0.8]
scatterMarkers = ['s','o','^','8','p',
'd','v','h','>','<',]
ax1 = fig.add_axes(rect,label = 'ax1',frameon = False)
for i in range(numClust):
ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A == i)[0],:]
markerStyle = scatterMarkers[i % len(scatterMarkers)]
ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0],\
ptsInCurrCluster[:,1].flatten().A[0],
marker = markerStyle,s = 90)
ax1.scatter(myCentroids[:,0].flatten().A[0],\
myCentroids[:,1].flatten().A[0],
marker = '+', s = 300)
plt.show()
if __name__ == "__main__":
## dataMat = mat(loadDataSet('testSet.txt'))
## newDataMat = mat(loadDataSet('testSet2.txt'))
## print "Minimum of 0 column:",min(dataMat[:,0])
## print "Minimum of 1 column:",min(dataMat[:,1])
## print "Maximun of 1 column:",max(dataMat[:,1])
## print "Maximun of 0 column:",max(dataMat[:,0])
## print "Random centroids: \n"
## pprint(randCent(dataMat,2))
## print "Euclidean distance of record_0 and record_1:",distEclud(dataMat[0],dataMat[1])
## # 由于初始类中心的选取是随机的,所以每次运行收敛次数也是不一致的
## myCentroids,clusterAssing = kMeans(dataMat,3)
## resuDict = resuDisp(clusterAssing)
## storeResu(resuDict,'clusterResult')
## result = getResu('clusterResult')
## centList,myNewAssments = biKmeans(newDataMat,3)
## plotCluster(newDataMat,myNewAssments,centList,3)
## resuDict = resuDisp(myNewAssments)
## totalSSE = sum(myNewAssments[:,1])
## clusterClubs(numClust = 5)
不完善之处希望大家评论指正,互相交流。