// K 均值聚类算法-C#

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;

namespace Similarity
{
    /// <summary>
    /// K-means style clustering over integer feature vectors. Documents are
    /// assigned to the center with the highest cosine similarity (expressed as an
    /// integer percentage), and each center is then replaced by the
    /// integer-truncated mean of its members.
    /// </summary>
    class myClusterFun
    {
        /// <summary>Upper bound on the number of assign/update rounds.</summary>
        private const int MaxIterations = 300;

        /// <summary>
        /// The loop stops once the average center movement score has matched the
        /// previous round's score more than this many times (the matches do not
        /// have to be consecutive).
        /// </summary>
        private const int StableRoundsToConverge = 20;

        /// <summary>
        /// Entry point: clusters the rows of <paramref name="HashArr"/> into
        /// <paramref name="NumberOfCenter"/> groups.
        /// </summary>
        /// <param name="HashArr">Feature vectors, one row per document. The length of row 0 is used as the dimension for every row.</param>
        /// <param name="NumberOfCenter">Number of clusters; must be positive.</param>
        /// <returns>
        /// One array per cluster holding the indices (into <paramref name="HashArr"/>)
        /// of its members, in increasing order. Clusters may be empty.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="HashArr"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="NumberOfCenter"/> is not positive.</exception>
        public static int[][] CluEntran(int[][] HashArr, int NumberOfCenter)
        {
            if (HashArr == null)
            {
                throw new ArgumentNullException("HashArr");
            }
            if (NumberOfCenter <= 0)
            {
                throw new ArgumentOutOfRangeException("NumberOfCenter", "At least one cluster center is required.");
            }

            int NumberOfDoc = HashArr.Length;
            if (NumberOfDoc == 0)
            {
                // Nothing to cluster: every cluster is empty.
                int[][] empty = new int[NumberOfCenter][];
                for (int i = 0; i < NumberOfCenter; i++)
                {
                    empty[i] = new int[0];
                }
                return empty;
            }

            int dim = HashArr[0].Length;
            int[][] centers = new int[NumberOfCenter][];
            int[][] previousCenters = new int[NumberOfCenter][];

            // Seed the centers with copies of evenly spread documents.
            int[] seeds = InitialFunc(NumberOfCenter, NumberOfDoc);
            for (int i = 0; i < NumberOfCenter; i++)
            {
                centers[i] = new int[dim];
                previousCenters[i] = new int[dim];
                Array.Copy(HashArr[seeds[i]], centers[i], dim);
            }

            int iterations = 0;
            int stableRounds = 0;
            int lastAverageGap = 0;
            while (true)
            {
                // Snapshot the centers so the movement of this round can be measured.
                for (int i = 0; i < NumberOfCenter; i++)
                {
                    Array.Copy(centers[i], previousCenters[i], dim);
                }

                int[][] clusters = GatherToCenter(ref centers, HashArr, NumberOfCenter);
                iterations++;

                // Exact fixed point: another round would start from the same centers
                // and therefore reproduce the same assignment, so stop right away.
                if (CentersEqual(centers, previousCenters))
                {
                    return clusters;
                }

                // Heuristic stop: average similarity between new and old centers.
                int[] gaps = JudgeConverge(centers, previousCenters, NumberOfCenter);
                int averageGap = 0;
                for (int i = 0; i < NumberOfCenter; i++)
                {
                    averageGap += gaps[i];
                }
                averageGap /= NumberOfCenter;
                if (averageGap == lastAverageGap)
                {
                    stableRounds++;
                }
                lastAverageGap = averageGap;

                if (iterations == MaxIterations)
                {
                    Console.WriteLine("Converge failed!!");
                    return clusters;
                }
                if (stableRounds > StableRoundsToConverge)
                {
                    return clusters;
                }
            }
        }

        /// <summary>
        /// Picks the document indices used as initial centers: positions spread
        /// evenly over the documents, shifted by one random offset and wrapped
        /// around the document count.
        /// </summary>
        /// <param name="NumberOfCenter">Number of indices to produce; must be positive.</param>
        /// <param name="NumberOfDoc">Number of documents; must be positive.</param>
        /// <returns>An array of <paramref name="NumberOfCenter"/> indices in [0, NumberOfDoc).</returns>
        /// <exception cref="ArgumentOutOfRangeException">Either argument is not positive.</exception>
        public static int[] InitialFunc(int NumberOfCenter, int NumberOfDoc)
        {
            if (NumberOfCenter <= 0)
            {
                throw new ArgumentOutOfRangeException("NumberOfCenter", "At least one cluster center is required.");
            }
            if (NumberOfDoc <= 0)
            {
                throw new ArgumentOutOfRangeException("NumberOfDoc", "At least one document is required.");
            }

            int[] Res = new int[NumberOfCenter];
            Random ran = new Random();
            int offset = ran.Next(NumberOfDoc) + 1;
            for (int i = 0; i < NumberOfCenter; i++)
            {
                // floor(i * N / k) keeps the seeds distinct whenever k <= N and still
                // spreads them out when k > N (a fixed step of N / k would be 0 there).
                long spread = (long)i * NumberOfDoc / NumberOfCenter;
                Res[i] = (int)((spread + offset) % NumberOfDoc);
            }
            return Res;
        }

        /// <summary>
        /// Performs one clustering round: assigns every document to its most
        /// similar center and then overwrites each center, in place, with the
        /// integer-truncated mean of its members.
        /// </summary>
        /// <param name="SingleNewCenter">Current centers; the rows are updated in place. A center whose cluster ends up empty is left unchanged.</param>
        /// <param name="HashArr">Feature vectors, one row per document.</param>
        /// <param name="NumberOfCenter">Number of centers.</param>
        /// <returns>One array per cluster with the indices of its member documents, in increasing order.</returns>
        public static int[][] GatherToCenter(ref int[][] SingleNewCenter, int[][] HashArr, int NumberOfCenter)
        {
            int NumberOfDoc = HashArr.Length;
            int dim = NumberOfDoc > 0 ? HashArr[0].Length : 0;
            int[] assignment = new int[NumberOfDoc]; // cluster id per document
            int[] clusterSize = new int[NumberOfCenter];

            // Assignment step: arg-max of the similarity, lowest cluster id wins ties.
            for (int doc = 0; doc < NumberOfDoc; doc++)
            {
                // Reset per document and start below any possible score so that
                // documents with non-positive similarity to every center are still
                // assigned on their own merits instead of inheriting the previous
                // document's cluster.
                int best = 0;
                int bestSimilarity = int.MinValue;
                for (int c = 0; c < NumberOfCenter; c++)
                {
                    int similarity = CosComp(HashArr[doc], SingleNewCenter[c]);
                    if (similarity > bestSimilarity)
                    {
                        bestSimilarity = similarity;
                        best = c;
                    }
                }
                assignment[doc] = best;
                clusterSize[best]++;
            }

            // Materialize the member lists.
            int[][] ResRecord = new int[NumberOfCenter][];
            for (int c = 0; c < NumberOfCenter; c++)
            {
                ResRecord[c] = new int[clusterSize[c]];
            }
            int[] fill = new int[NumberOfCenter];
            for (int doc = 0; doc < NumberOfDoc; doc++)
            {
                int c = assignment[doc];
                ResRecord[c][fill[c]] = doc;
                fill[c]++;
            }

            // Update step: mean of the members, truncated toward zero.
            for (int c = 0; c < NumberOfCenter; c++)
            {
                if (clusterSize[c] == 0)
                {
                    continue; // empty cluster: keep the previous center
                }

                long[] sum = new long[dim]; // long so large clusters cannot overflow
                for (int m = 0; m < ResRecord[c].Length; m++)
                {
                    int[] member = HashArr[ResRecord[c][m]];
                    for (int k = 0; k < dim; k++)
                    {
                        sum[k] += member[k];
                    }
                }
                for (int k = 0; k < dim; k++)
                {
                    SingleNewCenter[c][k] = (int)(sum[k] / clusterSize[c]);
                }
            }
            return ResRecord;
        }

        /// <summary>
        /// Scores how far each center moved by comparing it with its previous
        /// position using <see cref="CosComp"/>.
        /// </summary>
        /// <param name="SingleNewCenter">Centers after the round.</param>
        /// <param name="LastNewCenter">Centers before the round.</param>
        /// <param name="NumberOfCenter">Number of centers.</param>
        /// <returns>One similarity score per center.</returns>
        public static int[] JudgeConverge(int[][] SingleNewCenter, int[][] LastNewCenter, int NumberOfCenter)
        {
            int[] Result = new int[NumberOfCenter];
            for (int i = 0; i < NumberOfCenter; i++)
            {
                Result[i] = CosComp(SingleNewCenter[i], LastNewCenter[i]);
            }
            return Result;
        }

        /// <summary>
        /// Cosine similarity of two vectors, scaled by 100 and floored to an integer.
        /// </summary>
        /// <param name="HashArr1">First vector.</param>
        /// <param name="HashArr2">Second vector; must be at least as long as the first.</param>
        /// <returns>
        /// floor(100 * cos(angle)), or 0 when either vector has zero length (the
        /// cosine is undefined there and would otherwise turn into NaN).
        /// </returns>
        public static int CosComp(int[] HashArr1, int[] HashArr2)
        {
            double squared1 = VecDot(HashArr1, HashArr1);
            double squared2 = VecDot(HashArr2, HashArr2);
            if (squared1 == 0 || squared2 == 0)
            {
                return 0;
            }

            // A single square root of the product rounds less than multiplying two
            // separately rounded norms, so a vector compared with itself scores 100
            // instead of occasionally flooring to 99.
            double cosine = VecDot(HashArr1, HashArr2) / Math.Sqrt(squared1 * squared2);
            return (int)Math.Floor(cosine * 100);
        }

        /// <summary>Dot product of two vectors, accumulated in double precision.</summary>
        /// <param name="a">First vector; its length drives the loop.</param>
        /// <param name="b">Second vector; must be at least as long as <paramref name="a"/>.</param>
        /// <returns>The sum of the element-wise products.</returns>
        public static double VecDot(int[] a, int[] b)
        {
            double sum = 0;
            for (int i = 0; i < a.Length; i++)
            {
                // Widen before multiplying; int * int would wrap for large components.
                sum += (double)a[i] * b[i];
            }
            return sum;
        }

        /// <summary>Euclidean length of a vector.</summary>
        /// <param name="a">The vector.</param>
        /// <returns>The square root of the sum of squared components.</returns>
        public static double VecMod(int[] a)
        {
            double sum = 0;
            for (int i = 0; i < a.Length; i++)
            {
                // Widen before multiplying; int * int would wrap for large components.
                sum += (double)a[i] * a[i];
            }
            return Math.Sqrt(sum);
        }

        /// <summary>Returns true when both center sets hold exactly the same values.</summary>
        private static bool CentersEqual(int[][] left, int[][] right)
        {
            for (int i = 0; i < left.Length; i++)
            {
                for (int j = 0; j < left[i].Length; j++)
                {
                    if (left[i][j] != right[i][j])
                    {
                        return false;
                    }
                }
            }
            return true;
        }
    }
}

// 原文作者:聚类算法
// 原文地址: https://blog.csdn.net/new_shows/article/details/51234998
// 本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
// 点赞