K_Means
什么是聚类分析
聚类分析是在数据中发现数据对象之间的关系,将数据进行分组,组内的相似性越大,组间的差别越大,则聚类效果越好。
明显分离的簇
可以看到,(a)中不同簇的任意两点之间的距离都大于同一簇内任意两点之间的距离。明显分离的簇不一定是球形的,可以具有任意的形状。
算法思想较为简单,如下所示:
选择K个点作为初始质心
repeat
将每个点指派到最近的质心,形成K个簇
重新计算每个簇的质心
until 簇不发生变化或达到最大迭代次数
核心代码:
/// <summary>
/// Runs a single K-Means iteration: first assigns every sample to its nearest
/// cluster centroid, then moves each centroid to the mean of its samples.
/// </summary>
/// <remarks>
/// A cluster that receives no samples keeps its previous centroid coordinates.
/// Distances come from <c>CalcDist</c>, which is defined elsewhere in the class.
/// </remarks>
public void IterateOnce()
{
    // Assignment step: find the closest centroid for each sample.
    for (int i = 0; i < _SampleList.Count; i++)
    {
        // Start from the largest finite double so that any real distance wins;
        // a fixed magic number would skip samples lying farther away than it.
        double minDist = double.MaxValue;
        for (int j = 0; j < _ClusterList.Count; j++)
        {
            double dist = CalcDist(_SampleList[i], _ClusterList[j]);
            if (dist < minDist)
            {
                minDist = dist;
                _SampleList[i]._ClusterID = j; // assign the sample to this cluster
            }
        }
    }

    // Update step: recompute every centroid as the mean of its assigned samples.
    for (int i = 0; i < _ClusterList.Count; i++)
    {
        double sumX = 0, sumY = 0;
        int count = 0;
        for (int j = 0; j < _SampleList.Count; j++)
        {
            if (_SampleList[j]._ClusterID == i)
            {
                sumX += _SampleList[j]._X;
                sumY += _SampleList[j]._Y;
                count++;
            }
        }

        // Leave an empty cluster's centroid untouched to avoid dividing by zero.
        if (count != 0)
        {
            _ClusterList[i]._X = sumX / count;
            _ClusterList[i]._Y = sumY / count;
        }
    }
}
Cluster 类
// Coordinates of the cluster centroid.
public double _X, _Y;
// Color associated with the centroid.
public Color _Color;
// Point style associated with the centroid.
public PointStyle _PointStyle;
/// <summary>
/// Creates a cluster whose centroid is located at the given coordinates.
/// </summary>
/// <param name="x">Value stored in <c>_X</c>.</param>
/// <param name="y">Value stored in <c>_Y</c>.</param>
public Cluster(double x, double y)
{
    this._X = x;
    this._Y = y;
}
Sample 类
// Coordinates of the sample.
public double _X, _Y;
// Zero-based index of the cluster this sample belongs to; initialized to -1.
public int _ClusterID = -1;
/// <summary>
/// Creates a sample at the given coordinates; <c>_ClusterID</c> keeps its
/// field-initializer value.
/// </summary>
/// <param name="x">Value stored in <c>_X</c>.</param>
/// <param name="y">Value stored in <c>_Y</c>.</param>
public Sample(double x, double y)
{
    this._X = x;
    this._Y = y;
}
/// <summary>
/// Creates a sample at the given coordinates that already belongs to a cluster.
/// </summary>
/// <param name="x">Value stored in <c>_X</c>.</param>
/// <param name="y">Value stored in <c>_Y</c>.</param>
/// <param name="clusterID">Value stored in <c>_ClusterID</c>.</param>
public Sample(double x, double y, int clusterID)
{
    this._X = x;
    this._Y = y;
    this._ClusterID = clusterID;
}