之前提到了K均值算法,多数讨论认为K均值与硬C均值(HCM)算法本质相同。模糊C均值(FCM)算法则在HCM的基础上加入了对聚类簇的模糊划分,引入隶属度来提升算法性能。
import copy,math,random,time,sys
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import decimal
MAX = 10000.0  # upper bound of the random integers drawn by init_U when seeding the membership matrix
Epsilon = 0.000001  # stopping threshold: end_iterate reports convergence once no membership changes by more than this
def randomise(data):
    """Shuffle ``data`` and remember how to undo the shuffle.

    Args:
        data: Sequence of samples. It is not modified.

    Returns:
        A ``(new_data, order)`` tuple. ``order`` is a random permutation of
        ``range(len(data))`` and ``new_data[i]`` is ``data[order[i]]``, so
        passing both to ``de_randomise`` restores the original ordering.
    """
    order = list(range(len(data)))
    # The permutation itself is the record of where every sample came from.
    random.shuffle(order)
    new_data = [data[original_index] for original_index in order]
    return new_data, order
def de_randomise(data, order):
    """Undo a shuffle produced by ``randomise``.

    Args:
        data: The shuffled sequence.
        order: For each position in ``data``, the index that element had
            before shuffling.

    Returns:
        A new list in which ``data[i]`` has been placed at index
        ``order[i]``. Slots never named in ``order`` keep an empty list.
    """
    new_data = [[] for _ in range(len(data))]
    for position, original_index in enumerate(order):
        new_data[original_index] = data[position]
    return new_data
def print_matrix(lists):
    """Print every row of ``lists`` on its own line.

    Args:
        lists: Sequence of rows (any printable objects).
    """
    for row in lists:
        # Single-argument call form prints identically on Python 2 and 3.
        print(row)
def init_U(data, clu_num, max_value=None):
    """Create a random membership matrix whose rows each sum to 1.

    Args:
        data: Sequence of samples; only its length is used.
        clu_num: Number of clusters, i.e. columns of the matrix.
        max_value: Upper bound (inclusive, truncated to ``int``) of the raw
            random integers. Defaults to the module-level ``MAX``.

    Returns:
        A list with ``len(data)`` rows of ``clu_num`` floats. Every entry is
        strictly positive and each row is normalised to sum to 1.
    """
    if max_value is None:
        max_value = MAX
    upper = int(max_value)
    U = []
    for _sample in range(len(data)):
        # Raw draws start at 1, so no membership is ever exactly zero.
        raw = [random.randint(1, upper) for _cluster in range(clu_num)]
        total = float(sum(raw))
        U.append([value / total for value in raw])
    return U
def distance(v1, v2):
    """Return the Euclidean distance between two vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers.

    Returns:
        The Euclidean distance as a float, or ``-1`` when the two vectors
        have different lengths.
    """
    if len(v1) != len(v2):
        return -1
    # Pairwise arithmetic works on plain lists, which do not support the
    # element-wise ``v2 - v1`` the array-style formula relies on.
    return math.sqrt(sum((b - a) ** 2 for a, b in zip(v1, v2)))
def end_iterate(U, U_old, epsilon=None):
    """Tell whether the membership matrix has stopped changing.

    Args:
        U: Current membership matrix (list of rows).
        U_old: Membership matrix from the previous iteration; it is indexed
            at every position present in ``U``.
        epsilon: Largest per-entry change still counted as "unchanged".
            Defaults to the module-level ``Epsilon``.

    Returns:
        ``True`` when no entry differs from its previous value by more than
        ``epsilon``, otherwise ``False``.
    """
    if epsilon is None:
        epsilon = Epsilon
    # Each row is scanned over its own length rather than the first row's.
    return not any(
        abs(value - U_old[i][j]) > epsilon
        for i, row in enumerate(U)
        for j, value in enumerate(row)
    )
def normalise_U(U):
    """Harden a membership matrix in place into 0/1 indicators.

    In every row the largest entry becomes ``1`` and all others ``0``. If
    several entries tie for the maximum, each of them becomes ``1``.

    Args:
        U: Membership matrix (list of mutable rows). It is modified in place.

    Returns:
        The same ``U`` object, after modification.
    """
    for row in U:
        max_u = max(row)
        # Each row is scanned over its own length rather than the first row's.
        for j, value in enumerate(row):
            row[j] = 0 if value != max_u else 1
    return U
def _fcm_centres(data, U, clu_num, m, previous):
    """Return each cluster centre as the membership-weighted mean of ``data``."""
    centres = []
    for i in range(clu_num):
        weights = [memberships[i] ** m for memberships in U]
        total = sum(weights)
        if total == 0 and previous:
            # Every sample has zero membership here, which can only follow
            # exact-coincidence rows; keep the centre where it was.
            centres.append(previous[i])
            continue
        centres.append([
            sum(w * sample[j] for w, sample in zip(weights, data)) / total
            for j in range(len(data[0]))
        ])
    return centres


def _fcm_memberships(data, C, exponent):
    """Return the membership matrix implied by the cluster centres ``C``."""
    U = []
    for sample in data:
        dists = [distance(sample, centre) for centre in C]
        coincident = sum(1 for d in dists if d == 0)
        if coincident:
            # The sample sits exactly on one or more centres; the ratio
            # formula would divide by zero, so share the membership equally
            # among those centres and give the rest nothing.
            share = 1.0 / coincident
            U.append([share if d == 0 else 0.0 for d in dists])
        else:
            U.append([
                1.0 / sum((d_i / d_k) ** exponent for d_k in dists)
                for d_i in dists
            ])
    return U


def fuzz_c_mean(data, clu_num, m, max_iterate):
    """Cluster ``data`` with the fuzzy C-means (FCM) algorithm.

    Starting from a random membership matrix, the cluster centres and the
    memberships are updated alternately until ``end_iterate`` reports that
    the memberships have stopped changing or the iteration budget runs out.
    A one-line status message is printed in either case.

    Args:
        data: Non-empty sequence of equally long numeric vectors.
        clu_num: Number of clusters.
        m: Fuzzifier exponent; must be greater than 1.
        max_iterate: Iteration budget. It is tested after each update, so
            at most ``max_iterate + 1`` updates are performed.

    Returns:
        The final membership matrix after ``normalise_U``: one row per
        sample, holding ``1`` for the most likely cluster and ``0``
        elsewhere.

    Raises:
        ValueError: If ``m`` is not greater than 1.
    """
    if m <= 1:
        raise ValueError('m must be greater than 1, got %r' % (m,))
    # Float literal keeps this a true division on Python 2 as well.
    exponent = 2.0 / (m - 1)
    U = init_U(data, clu_num)
    C = []
    current_iterate = 0
    while True:
        current_iterate += 1
        # A fresh matrix is built on every pass, so the previous one can be
        # kept by reference for the convergence test; no copy is needed.
        U_old = U
        C = _fcm_centres(data, U_old, clu_num, m, C)
        U = _fcm_memberships(data, C, exponent)
        if end_iterate(U, U_old):
            print('mission complete')
            break
        elif current_iterate > max_iterate:
            print('iterate overflow')
            break
    return normalise_U(U)