密度峰值聚类算法总结及其python实现

密度峰值聚类算法总结及其python实现

先上代码:

#-*-coding:utf-8 -*-
from __future__ import division
import numpy as np
from sklearn import preprocessing
from scipy.special import comb
from sklearn.metrics import adjusted_rand_score,f1_score,accuracy_score,precision_score,precision_recall_fscore_support
from sklearn.datasets import *
import sys
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import csv
import random
from time import time
from matplotlib.font_manager import FontProperties
defaultencoding = 'utf-8'
font = FontProperties(fname='/Library/Fonts/Songti.ttc')
if sys.getdefaultencoding() != defaultencoding:
	reload(sys)
	sys.setdefaultencoding(defaultencoding)

def calculate(data,percent):
	#计算距离
	distlist = []
	dist = np.zeros((len(data), len(data)));
	for i in range(len(data)-1):
		for j in range(i+1,len(data)):
			distance = np.sqrt(np.sum(np.square(data[i] - data[j])))
			dist[i, j] = distance
			dist[j, i] = distance
			if(i != j):
				distlist.append(distance)
	# dc
	# for i in range(len(dist) - 1):
	# 	for j in range(len(dist) - 1):
	# 		if(dist[i,j] != 0):
	sortdist = sorted(distlist)
	position = round(len(distlist) * percent / 100)
	dc = sortdist[position]
	print('dc',dc)
	#计算局部密度
	# 计算局部密度 rho (利用 Gaussian 核)
	rho = np.zeros(len(dist))
	# Gaussian kernel
	for i in range(len(dist) - 1):
		for j in range(i + 1, len(dist)):
			rho[i] = rho[i] + np.exp(-(dist[i, j] / dc) * (dist[i, j] / dc))
			rho[j] = rho[j] + np.exp(-(dist[i, j] / dc) * (dist[i, j] / dc))
	rho = [item / max(rho) for item in rho]
	# 生成 delta 和 nneigh 数组
	# delta 距离数组
	# nneigh 比本身密度大的最近的点
	# 记录 rho 值更大的数据点中与 ordrho(ii) 距离最近的点的编号 ordrho(jj)
	# 将 rho 按降序排列,ordrho 保持序
	ordrho = np.flipud(np.argsort(rho))
	delta = np.zeros(len(dist))
	nneigh = np.zeros(len(dist),dtype=int)
	delta[ordrho[0]] = -1.
	nneigh[ordrho[0]] = 0
	# 求最大距离
	maxd = max(dist.flatten())
	for ii in range(1,len(dist)):
		delta[ordrho[ii]] = maxd
		for jj in range(ii):
			if dist[ordrho[ii], ordrho[jj]] < delta[ordrho[ii]]:
				delta[ordrho[ii]] = dist[ordrho[ii], ordrho[jj]]
				nneigh[ordrho[ii]] = ordrho[jj]
		delta[ordrho[0]] = max(delta)
		# delta = [item/max(delta) for item in delta]
	gamma = [rho[i] * delta[i] for i in range(len(dist))]
	plt.figure(1);
	for index in range(len(dist)):
		plt.plot(rho[index], delta[index], 'ko');
	pos = plt.ginput(1)
	rhomin = pos[0][0]
	deltamin = pos[0][1]
	cl = -1*np.ones(len(dist),dtype=int)
	icl = []
	NCLUST = 1
	for i in range(len(dist)):
		if rho[i]>rhomin and delta[i]>deltamin:
			cl[i] = NCLUST #第 i 号数据点属于第 NCLUST 个 cluster
			icl.append(i) # 逆映射,第 NCLUST 个 cluster 的中心为第 i 号数据点
			NCLUST+=1
	for i in range(len(dist)):
		if cl[ordrho[i]] == -1:
			cl[ordrho[i]] = cl[nneigh[ordrho[i]]]
	print(cl)
	color = ['bo', 'co', 'go', 'ko', 'mo', 'ro', 'wo', 'yo']
	s = random.sample(color, NCLUST-1)
	plt.close()
	for i in range(len(data)):
		plt.plot(data[i][0],data[i][1],s[cl[i]-1])
	plt.show();
def loadData():
	dataSet = []
	# fileIn = open('C:/data/jain.txt')
	fileIn = open('C:/data/jain.dat')
	for line in fileIn.readlines():
		lineArr = line.strip().split('\t')
		dataSet.append([float(lineArr[0]), float(lineArr[1])])
	data = np.array(dataSet);
	return data;

#data,label = load_breast_cancer(True)
#min_max_scaler = preprocessing.MinMaxScaler()
#data = min_max_scaler.fit_transform(data)
def test():
	data = loadData()
	dataSet = np.array(data);
	dist = calculate(dataSet,2)
	# local_density(dist,2)
if __name__ == '__main__': test();
# len = len(data)
# # start = time()
# fsdp = FSDP(len,7.8605,data)
# fsdp.clustering(0.4703,0.7108)
# print (time() - start)/60.0
# print fsdp.cl
# print label
# print "f1_score"
# print f1_score(fsdp.cl,label,average="weighted")
# print "accary"
# print accuracy_score(fsdp.cl,label)
# print adjusted_rand_score(fsdp.cl,label)
# gamma = np.flipud(np.sort(fsdp.gamma))
# plt.figure("标准CS")
# plt.subplot(311)
# x = [t for t in range(0, len(gamma))]
# plt.plot(x, gamma, 'r*')
# plt.subplot(312)
# plt.plot(fsdp.rho, fsdp.delta, 'r*')
# plt.legend(loc='upper right', prop=font)
#
# plt.subplot(313)
# color = []
# for i in fsdp.cl:
#     color.append(colors.cnames.values()[i])
# plt.scatter(data[:,0], data[:,1],marker=".",c=color)
# plt.legend(loc='upper right', prop=font)
# plt.show()

 

    原文作者:聚类算法
    原文地址: https://blog.csdn.net/u014755255/article/details/80254813
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞