参考百度百科http://baike.baidu.com/link?url=LDYen7bEqt8o2l5mUrnZjQk1topFi36-MwLuhjuGf-1z4sQFtFq1xCEe0TCJwYVjGbu0C6cpuVMFIxNglvSnoa
外加http://www.cnblogs.com/zhangchaoyang/articles/2200800.html
学习birch聚类最好有B-树的知识
结合了B-树的特性,birch算法适合于处理大数据。
原因是:
(1)CF 结构概括了簇的基本信息,并且是高度压缩的:它存储的聚类信息远小于实际数据点本身。每个新添加的数据点不再作为个体保存,而是将其信息融入所属的簇中
(2)增量式的学习方法,不用一次将数据全部加载到内存,可以一边添加数据一边进行学习
下面是我的实现
// birch-cluster.cpp : 定义控制台应用程序的入口点。
//
///************birch-cluster*************///
///******* author Marshall ********///
///******* 2015.9.18 ********///
///******* version 1.0 ********///
#include "stdafx.h"
#include<vector>
#include<iostream>
#include<cstdlib>
#include<time.h>
#define BirchType int
using namespace std;
/// Element-wise sum of two vectors of equal length.
///
/// @param aa left operand; taken by value, updated in place and returned.
/// @param bb right operand; taken by const reference so that temporaries
///           (for example the result of another vector operator) can be
///           passed without relying on a compiler extension.
/// @return vector whose i-th element is aa[i] + bb[i].
/// The two sizes must match; this is only checked by _ASSERTE.
vector<BirchType> operator+(vector<BirchType> aa, const vector<BirchType>& bb)
{
    _ASSERTE(aa.size() == bb.size());
    for (size_t i = 0; i < aa.size(); ++i)
        aa[i] += bb[i];
    return aa;
}
/// Element-wise (Hadamard) product of two vectors of equal length.
///
/// @param aa left operand; taken by value, updated in place and returned.
/// @param bb right operand; taken by const reference so that temporaries
///           can be passed without relying on a compiler extension.
/// @return vector whose i-th element is aa[i] * bb[i].
/// The two sizes must match; this is only checked by _ASSERTE.
vector<BirchType> operator*(vector<BirchType> aa, const vector<BirchType>& bb)
{
    _ASSERTE(aa.size() == bb.size());
    for (size_t i = 0; i < aa.size(); ++i)
        aa[i] *= bb[i];
    return aa;
}
/// Element-wise difference of two vectors of equal length.
///
/// @param aa left operand; taken by value, updated in place and returned.
/// @param bb right operand; taken by const reference so that temporaries
///           can be passed without relying on a compiler extension.
/// @return vector whose i-th element is aa[i] - bb[i].
/// The two sizes must match; this is only checked by _ASSERTE.
vector<BirchType> operator-(vector<BirchType> aa, const vector<BirchType>& bb)
{
    _ASSERTE(aa.size() == bb.size());
    for (size_t i = 0; i < aa.size(); ++i)
        aa[i] -= bb[i];
    return aa;
}
/// Scales every element of a vector by a floating-point factor.
///
/// Each element is widened to double, multiplied by k and stored back into
/// the BirchType element, so the product is converted back to BirchType.
///
/// @param aa vector to scale; taken by value, the scaled copy is returned.
/// @param k  scale factor.
vector<BirchType> operator*(vector<BirchType> aa, double k)
{
    vector<BirchType>::iterator it = aa.begin();
    while (it != aa.end())
    {
        *it = double(*it) * k;
        ++it;
    }
    return aa;
}
/// Scales every element of a vector by an integer factor (factor on the left).
///
/// @param k  integer scale factor.
/// @param aa vector to scale; taken by value, the scaled copy is returned.
vector<BirchType> operator*(int k, vector<BirchType> aa)
{
    vector<BirchType>::iterator it = aa.begin();
    while (it != aa.end())
    {
        *it *= k;
        ++it;
    }
    return aa;
}
class birch
{
public:
struct Attribute
{
unsigned int dim;
vector<BirchType>data;
Attribute(unsigned int d) :dim(d)
{
data.resize(dim);
}
};
struct CF
{
unsigned int N;
vector<BirchType> LS;
vector<BirchType> SS;
CF(unsigned int N,
vector<BirchType> LS,
vector<BirchType>SS) :N(N), LS(LS), SS(SS){}
/*CF(CF& cc){//shallow copy is enough
this->N = cc.N;
this->LS = cc.LS;
this->SS = cc.SS;
}*/
CF(unsigned int dim){
N = 0;
LS.resize(dim);
SS.resize(dim);
};
CF(){};
};
struct Leaf;
struct MinCluster
{
CF cf;
Leaf*parent;
MinCluster()
{
parent = NULL;
}
MinCluster(CF cf)
{
parent = NULL;
this->cf = cf;
}
};
struct Leaf
{
Leaf*pre, *next;//to make up a leaf-list.for Nonleaf,NULL
Leaf*parent;
vector<Leaf*>*child;//对Leaf而言为NULL
vector<MinCluster>*cluster;//对NonLeaf而言为NULL
CF cf;
Leaf()
{
parent = pre = next = NULL;
child = NULL;
cluster = NULL;
}
};
void generate_data(int num, int dim, vector<int>&span)
{
this->dim = dim;
_ASSERTE(span.size() == dim);
for (int i = 0; i < num; i++)
{
Attribute att(dim);
for (int j = 0; j < dim; j++)
att.data[j] = span[j] * double(rand()) / double(RAND_MAX + 1.0);
dataset.push_back(att);
}
}
vector<Attribute>dataset;
int absorbnum;
public:
birch(unsigned int b, unsigned int l, unsigned int t)
:B(b), L(l), T(t){
_ASSERTE(B > 2);
_ASSERTE(L > 3);
root = NULL;
time_t tt;
srand(time(&tt));
absorbnum = 0;
}
~birch();
void insert(Attribute att);
private:
unsigned int B; //maximal num of child a Nonleaf will have
unsigned int L;//maximal num of MinCluster a leaf will haveLeaf
unsigned int T;// MinCluster的直径不能超过T
Leaf*root;
Leaf*head;//the head of the leaf-list at the bottom of the tree
int dim;
private:
inline double lengthofvec(vector<BirchType>&aa){
double len = 0;
for (int i = 0; i < aa.size(); i++)
len += pow(aa[i], 2.0);
return sqrt(len);
}
double sumofvec(vector<BirchType>&aa){
double sum = 0;
for (int i = 0; i < aa.size(); i++)
sum += aa[i];
return sum;
}
double cal_inter_cluster_dis(CF &cf1, CF &cf2);
double cal_intra_cluster_dis();
double merge_cluster_diameter(CF &cf1, CF &cf2);
vector<BirchType>updateSS(vector<BirchType>&LS, vector<BirchType>&SS)
{
for (int i = 0; i < LS.size(); i++)
SS[i] += pow(LS[i], 2.0);
return SS;
}
CF updateCF(CF &c1, CF &c2)
{
return CF(c1.N + c2.N, c1.LS + c2.LS, c1.SS + c2.SS);
}
void updateCF(Leaf*leaf)
{
CF cf(dim);
if (leaf->cluster != NULL)
{
for (int i = 0; i < leaf->cluster->size(); i++)
{
cf.N = cf.N + (*leaf->cluster)[i].cf.N;
cf.LS = cf.LS + (*leaf->cluster)[i].cf.LS;
cf.SS = cf.SS + (*leaf->cluster)[i].cf.SS;
}
}
else if (leaf->child != NULL)
{
for (int i = 0; i < leaf->child->size(); i++)
{
cf.N = cf.N + (*leaf->child)[i]->cf.N;
cf.LS = cf.LS + (*leaf->child)[i]->cf.LS;
cf.SS = cf.SS + (*leaf->child)[i]->cf.SS;
}
}
leaf->cf = cf;
}
MinCluster create_mincluster(Attribute att)
{
vector<BirchType>aa;
aa.resize(att.dim);
return MinCluster(CF(1, att.data, updateSS(att.data, aa)));
}
void insert(Leaf*close, bool &split, MinCluster &clu);
};
/// Releases every node of the tree together with the containers it owns.
///
/// The tree is walked from `root` with an explicit stack. For each node the
/// children are queued first, then the node's `cluster` vector (leaf nodes),
/// its `child` vector (non-leaf nodes) and the node itself are deleted.
/// Only `root` is consulted, so an object into which no point was ever
/// inserted (root == NULL) is destroyed without touching any node.
birch::~birch()
{
    if (root == NULL)
        return;

    vector<Leaf*> pending;
    pending.push_back(root);
    while (!pending.empty())
    {
        Leaf* node = pending.back();
        pending.pop_back();
        // Queue the children before the vector holding them is destroyed.
        if (node->child != NULL)
            pending.insert(pending.end(), node->child->begin(), node->child->end());
        delete node->cluster; // NULL for non-leaf nodes; delete on NULL is a no-op
        delete node->child;   // NULL for leaf nodes
        delete node;
    }
    root = NULL;
    head = NULL;
}
/*double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
{
return sqrt(sumofvec(cf1.SS *(1.0 / double(cf1.N))
+ cf2.SS *(1.0 / double(cf1.N)) -
2 * cf1.LS*cf2.LS*(1.0 / double(cf1.N + cf2.N))));
}*/
/// Diameter of the cluster obtained by merging the clusters summarised by
/// cf1 and cf2, computed from the clustering features alone.
///
/// With N = N1 + N2, LS = LS1 + LS2 and SS = SS1 + SS2 the diameter (root of
/// the average squared pairwise distance) is
///     D = sqrt( (2 * N * sum(SS) - 2 * |LS|^2) / (N * (N - 1)) ).
///
/// The arithmetic is done in double rather than through the
/// vector<BirchType> operators, which would convert every intermediate
/// result back to BirchType.
///
/// NOTE(review): this implements the standard BIRCH diameter suggested by
/// the function name; confirm that this is the intended measure before
/// using it as the absorb criterion.
///
/// @return the diameter, or 0 when the merged cluster has fewer than two
///         points or rounding makes the squared value negative.
double birch::merge_cluster_diameter(CF &cf1, CF &cf2)
{
    _ASSERTE(cf1.LS.size() == cf2.LS.size());
    _ASSERTE(cf1.SS.size() == cf2.SS.size());
    _ASSERTE(cf1.LS.size() == cf1.SS.size());

    const double n = double(cf1.N) + double(cf2.N);
    if (n < 2.0)
        return 0.0; // a single point (or nothing) has no pairwise distance

    double sum_sq = 0.0;  // sum over dimensions of the merged SS
    double norm_sq = 0.0; // squared length of the merged LS
    for (size_t i = 0; i < cf1.LS.size(); ++i)
    {
        const double ls = double(cf1.LS[i]) + double(cf2.LS[i]);
        sum_sq += double(cf1.SS[i]) + double(cf2.SS[i]);
        norm_sq += ls * ls;
    }

    const double squared = (2.0 * n * sum_sq - 2.0 * norm_sq) / (n * (n - 1.0));
    return squared > 0.0 ? sqrt(squared) : 0.0;
}
/// Inserts one point into the CF-tree.
///
/// Steps:
///  1. The first point creates a root leaf holding a single MinCluster.
///  2. Otherwise the tree is descended, at each non-leaf node following the
///     child whose centroid is closest to the point.
///  3. In the reached leaf the closest MinCluster absorbs the point when the
///     centroid distance is below T (absorbnum is incremented); otherwise
///     the point becomes a new MinCluster of that leaf.
///  4. The CF of every node on the descent path is increased by the point.
///  5. A leaf with more than L entries is split: the two entries that are
///     farthest apart become seeds, every other entry goes to the closer
///     seed (ties stay in the old leaf), and the new leaf is linked into the
///     leaf list directly after the old one.
///  6. Non-leaf nodes with more than B children are split the same way,
///     bottom-up; splitting the root creates a new root.
///
/// @param att the point to insert; its `dim` is recorded as the tree's
///            dimensionality when the first point is inserted.
void birch::insert(Attribute att)
{
    // 1. First point: the tree is a single leaf.
    if (root == NULL)
    {
        dim = static_cast<int>(att.dim);
        root = new Leaf;
        root->cluster = new vector<MinCluster>;
        root->cluster->push_back(create_mincluster(att));
        root->cf = (*root->cluster)[0].cf;
        head = root; // pre/next are already NULL from the Leaf constructor
        return;
    }

    MinCluster clu = create_mincluster(att);

    // 2. Descend to the closest leaf, remembering the child index per level.
    Leaf* leaf = root;
    vector<int> path;
    while (leaf->cluster == NULL)
    {
        vector<Leaf*>& children = *leaf->child;
        int best = -1;
        double bestdis = 0;
        for (size_t i = 0; i < children.size(); ++i)
        {
            const double dis = cal_inter_cluster_dis(clu.cf, children[i]->cf);
            // The first child is always accepted, so no sentinel distance is needed.
            if (best < 0 || dis < bestdis)
            {
                bestdis = dis;
                best = static_cast<int>(i);
            }
        }
        _ASSERTE(best >= 0); // a non-leaf node is expected to have children
        path.push_back(best);
        leaf = children[best];
    }

    // 3. Absorb into the closest MinCluster or add a new entry.
    vector<MinCluster>& entries = *leaf->cluster;
    int nearest = -1;
    double mindis = 0;
    for (size_t i = 0; i < entries.size(); ++i)
    {
        const double dis = cal_inter_cluster_dis(clu.cf, entries[i].cf);
        if (nearest < 0 || dis < mindis)
        {
            mindis = dis;
            nearest = static_cast<int>(i);
        }
    }
    if (nearest >= 0 && mindis < T)
    {
        entries[nearest].cf = updateCF(entries[nearest].cf, clu.cf);
        absorbnum++;
    }
    else
    {
        clu.parent = leaf;
        entries.push_back(clu);
    }

    // 4. Add the point to the CF of every node on the path (root included).
    Leaf* node = root;
    node->cf = updateCF(node->cf, clu.cf);
    for (size_t i = 0; i < path.size(); ++i)
    {
        node = (*node->child)[path[i]];
        node->cf = updateCF(node->cf, clu.cf);
    }

    // 5. Split the leaf when it holds too many entries.
    if (entries.size() > L && entries.size() >= 2)
    {
        const size_t n = entries.size();
        vector<vector<double> > dist(n, vector<double>(n, 0.0));

        // Find the two entries that are farthest apart. Starting from -1
        // guarantees a valid pair even when every distance is 0.
        size_t seed_keep = 0;
        size_t seed_move = 1;
        double maxdis = -1.0;
        for (size_t i = 0; i + 1 < n; ++i)
            for (size_t h = i + 1; h < n; ++h)
            {
                const double dis = cal_inter_cluster_dis(entries[i].cf, entries[h].cf);
                dist[i][h] = dis;
                dist[h][i] = dis;
                if (dis > maxdis)
                {
                    maxdis = dis;
                    seed_keep = i;
                    seed_move = h;
                }
            }

        // Distribute the entries between the old leaf and a new one.
        Leaf* new_leaf = new Leaf;
        new_leaf->cluster = new vector<MinCluster>;
        vector<MinCluster> kept;
        entries[seed_move].parent = new_leaf;
        new_leaf->cluster->push_back(entries[seed_move]);
        for (size_t i = 0; i < n; ++i)
        {
            if (i == seed_move)
                continue;
            if (i != seed_keep && dist[i][seed_move] < dist[i][seed_keep])
            {
                entries[i].parent = new_leaf;
                new_leaf->cluster->push_back(entries[i]);
            }
            else
            {
                entries[i].parent = leaf;
                kept.push_back(entries[i]);
            }
        }
        entries.swap(kept);

        // Both leaves changed content, so rebuild their CFs.
        updateCF(leaf);
        updateCF(new_leaf);

        // Link the new leaf into the leaf list right after the old one.
        Leaf* following = leaf->next;
        leaf->next = new_leaf;
        new_leaf->pre = leaf;
        new_leaf->next = following;
        if (following)
            following->pre = new_leaf;

        if (leaf->parent != NULL)
        {
            leaf->parent->child->push_back(new_leaf);
            new_leaf->parent = leaf->parent;
        }
        else
        {
            // The split leaf was the root: grow the tree by one level.
            Leaf* new_root = new Leaf;
            new_root->child = new vector<Leaf*>;
            new_root->child->push_back(leaf);
            new_root->child->push_back(new_leaf);
            leaf->parent = new_root;
            new_leaf->parent = new_root;
            updateCF(new_root);
            root = new_root;
            return;
        }
    }

    // 6. Propagate splits upwards through overfull non-leaf nodes.
    Leaf* cur = leaf->parent;
    while (cur != NULL && cur->child->size() > B && cur->child->size() >= 2)
    {
        vector<Leaf*>& kids = *cur->child;
        const size_t n = kids.size();
        vector<vector<double> > dist(n, vector<double>(n, 0.0));

        // Find the two children that are farthest apart.
        size_t seed_keep = 0;
        size_t seed_move = 1;
        double maxdis = -1.0;
        for (size_t i = 0; i + 1 < n; ++i)
            for (size_t h = i + 1; h < n; ++h)
            {
                const double dis = cal_inter_cluster_dis(kids[i]->cf, kids[h]->cf);
                dist[i][h] = dis;
                dist[h][i] = dis;
                if (dis > maxdis)
                {
                    maxdis = dis;
                    seed_keep = i;
                    seed_move = h;
                }
            }

        // Move the children that are closer to seed_move into a new node.
        Leaf* sibling = new Leaf;
        sibling->child = new vector<Leaf*>;
        vector<Leaf*> kept;
        kids[seed_move]->parent = sibling;
        sibling->child->push_back(kids[seed_move]);
        for (size_t i = 0; i < n; ++i)
        {
            if (i == seed_move)
                continue;
            if (i != seed_keep && dist[i][seed_move] < dist[i][seed_keep])
            {
                kids[i]->parent = sibling;
                sibling->child->push_back(kids[i]);
            }
            else
            {
                kept.push_back(kids[i]);
            }
        }
        kids.swap(kept);

        // Both nodes changed content, so rebuild their CFs.
        updateCF(cur);
        updateCF(sibling);

        if (cur->parent == NULL)
        {
            // The split node was the root: grow the tree by one level.
            Leaf* new_root = new Leaf;
            new_root->child = new vector<Leaf*>;
            new_root->child->push_back(cur);
            new_root->child->push_back(sibling);
            cur->parent = new_root;
            sibling->parent = new_root;
            updateCF(new_root);
            root = new_root;
            return;
        }

        // Not the root: hand the new node to the parent and continue upwards.
        cur->parent->child->push_back(sibling);
        sibling->parent = cur->parent;
        cur = cur->parent;
    }
}
//根据CF值计算簇间距离
/*double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
{
return sqrt(sumofvec((2 * (cf1.N + cf2.N)*(cf1.SS + cf2.SS)
- 2 * (cf1.LS + cf2.LS)*(cf1.LS + cf2.LS))*
(1.0 / double(cf1.N + cf2.N)*(cf1.N + cf2.N - 1))));
}*/
/// Euclidean distance between the centroids of two clusters.
///
/// The centroid coordinate of a cluster along an axis is LS[axis] / N; the
/// first `dim` axes are used.
///
/// @param cf1 clustering feature of the first cluster.
/// @param cf2 clustering feature of the second cluster.
/// @return square root of the summed squared centroid differences.
double birch::cal_inter_cluster_dis(CF &cf1, CF &cf2)
{
    const double count1 = double(cf1.N);
    const double count2 = double(cf2.N);
    double squared = 0;
    int axis = 0;
    while (axis < dim)
    {
        const double centroid1 = double(cf1.LS[axis]) / count1;
        const double centroid2 = double(cf2.LS[axis]) / count2;
        const double gap = centroid1 - centroid2;
        squared += gap * gap;
        ++axis;
    }
    return sqrt(squared);
}
/// Demo entry point: builds a tree with B = 5, L = 6, T = 20, generates 1000
/// random 2-dimensional points scaled by 1000 per dimension, inserts them
/// one by one and prints how many of them were absorbed into an existing
/// MinCluster.
int _tmain(int argc, _TCHAR* argv[])
{
    birch bir(5, 6, 20);

    const int dim = 2;
    const int num = 1000;

    // Same scale for every dimension.
    vector<int> span(static_cast<size_t>(dim), 1000);

    bir.generate_data(num, dim, span);

    int idx = 0;
    while (idx < num)
    {
        bir.insert(bir.dataset[idx]);
        ++idx;
    }

    cout << bir.absorbnum << endl;
    system("pause");
    return 0;
}