I've recently been reading up on GBDT, and most of the articles online are vague on the details, so I found a reference implementation on GitHub (GBDTReg) and walked through its source code.
```python
import numpy as np              # used throughout the snippets below
from GBDTReg import GBDT

class Config(object):
    learningRate = 0.1          # shrinkage applied to each tree's contribution
    maxTreeLength = 4           # maximum tree depth
    maxLeafCount = 30
    maxTreeNum = 4              # number of boosting rounds (trees)
```
```python
def buildGbdt(self, x_train, y_train):
    size = len(x_train)
    dim = len(x_train[0])
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_train_feature = []
    # Initialize the ensemble: the raw score starts at 0 for every sample
    treePreviousValue = 0 * y_train
    treeValues = []
    treeValues.append(treePreviousValue)
    curValue = self.sigmoid(0 * y_train)
    dataFeatures = []
    for i in range(self.maxTreeNum):
        print("the tree %i-th" % i)
        # Pseudo-residual: scaled negative gradient of the logistic loss
        residualGradient = -1 * self.learningRate * (curValue - y_train)
        curTree = self.splitTree(x_train, residualGradient, 1)
        self.tree.append(curTree)
        print(curTree)
        # Update the accumulated score and the residual for the next round
        curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree)
        curTreeValue = []
        for singleX in x_train:
            xValue, xFeature = self.scanTree(curTree, singleX, curTreeLeafNodeNum)
            curTreeValue.append(xValue)
        treePreviousValue = np.array(curTreeValue) + treePreviousValue
        curValue = self.sigmoid(treePreviousValue)
        print(y_train)
        print("curValue")
        print(curValue)
```
buildGbdt first loops over range(self.maxTreeNum), i.e. the number of trees to build. In each round it computes the pseudo-residual residualGradient = learningRate * (y_train - curValue), where curValue = sigmoid(F) is the current probability estimate and F is the raw score accumulated from the previous trees; this is the scaled negative gradient of the logistic loss. A new tree is then fitted to that residual by splitTree.
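To make that residual concrete, here is a minimal standalone sketch (my own toy example, not code from the repo) of what one round's residualGradient looks like for binary labels in {0, 1}, before the first tree has been built:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Toy data: four samples with binary labels
y_train = np.array([1, 0, 1, 1], dtype=float)

# Raw additive score F from the trees built so far (all zeros before the first tree)
treePreviousValue = np.zeros_like(y_train)
curValue = sigmoid(treePreviousValue)   # current probability estimates, all 0.5

learningRate = 0.1
# Scaled negative gradient of the logistic loss w.r.t. F --
# exactly what buildGbdt hands to splitTree as the target to fit
residualGradient = -1 * learningRate * (curValue - y_train)
print(residualGradient)                 # [ 0.05 -0.05  0.05  0.05]
```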
```python
def splitTree(self, x_train, residualGradient, treeHeight):
    """
    :param x_train: training samples reaching this node
    :param residualGradient: the residual gradient this node should fit
    :param treeHeight: current depth of the node
    :return: the subtree built at this node (a nested dict, or a leaf value)
    """
    size = len(x_train)
    dim = len(x_train[0])
    # Convention: the left subtree takes values <= the split point, the right subtree values > it
    bestSplitPointDim = -1
    bestSplitPointValue = -1
    curLoss = self.calculateSquareLoss(residualGradient)
    minLossValue = curLoss
    if treeHeight == self.maxTreeLength:
        # Maximum depth reached: the original code returns the current loss as the leaf value
        return curLoss
    tree = dict([])
    # Exhaustively try every (feature, sample value) pair as a candidate split point
    for i in range(dim):
        for j in range(size):
            splitNum = x_train[j, i]
            leftSubTree = []
            rightSubTree = []
            for k in range(size):
                tmpNum = x_train[k, i]
                if tmpNum <= splitNum:
                    leftSubTree.append(residualGradient[k])
                else:
                    rightSubTree.append(residualGradient[k])
            sumLoss = 0.0
            sumLoss += self.calculateSquareLoss(np.array(leftSubTree))
            sumLoss += self.calculateSquareLoss(np.array(rightSubTree))
            if sumLoss < minLossValue:
                bestSplitPointDim = i
                bestSplitPointValue = splitNum
                minLossValue = sumLoss
    # If no split lowers the loss, stop and turn this node into a leaf (mean residual)
    if minLossValue == curLoss:
        return np.mean(residualGradient)
    else:
        leftSplit = [(x_train[i], residualGradient[i]) for i in range(size)
                     if x_train[i, bestSplitPointDim] <= bestSplitPointValue]
        rightSplit = [(x_train[i], residualGradient[i]) for i in range(size)
                      if x_train[i, bestSplitPointDim] > bestSplitPointValue]
        # print(leftSplit)
        newLeftSubTree = list(zip(*leftSplit))[0]
        newLeftResidual = list(zip(*leftSplit))[1]
        leftTree = self.splitTree(np.array(newLeftSubTree), newLeftResidual, treeHeight + 1)
        newRightSubTree = list(zip(*rightSplit))[0]
        newRightResidual = list(zip(*rightSplit))[1]
        rightTree = self.splitTree(np.array(newRightSubTree), newRightResidual, treeHeight + 1)
        tree[(bestSplitPointDim, bestSplitPointValue)] = [leftTree, rightTree]
        return tree
```
Reading splitTree makes the idea of gradient boosted trees concrete: each tree is built by recursively partitioning the samples into a left and a right subtree so as to fit the current residual gradient, and the recursion stops once no split lowers the loss any further, at which point the node becomes a leaf holding the mean residual. Each finished tree is stored with self.tree.append(curTree), and the final model is the sum of all the trees' outputs, passed through the sigmoid for classification.
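To show how that sum of trees turns into a prediction, here is a small sketch of my own (the repo uses scanTree for this; the helper below is a simplified stand-in) that walks the nested-dict trees splitTree returns and passes the summed leaf values through the sigmoid:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def traverseTree(tree, x):
    """Walk one tree built by splitTree and return its leaf value for sample x.
    Inner nodes are dicts keyed by (featureIndex, splitValue); leaves are plain numbers."""
    while isinstance(tree, dict):
        (featureIndex, splitValue), (leftTree, rightTree) = next(iter(tree.items()))
        tree = leftTree if x[featureIndex] <= splitValue else rightTree
    return tree

def predict(trees, x):
    """Final model: add up every tree's output, then squash with the sigmoid."""
    rawScore = sum(traverseTree(tree, x) for tree in trees)
    return sigmoid(rawScore)

# Hypothetical two-tree ensemble in the same nested-dict format as splitTree's output
trees = [
    {(0, 2.0): [0.05, -0.05]},                      # split on feature 0 at 2.0
    {(1, 1.5): [{(0, 0.5): [0.02, 0.04]}, -0.03]},  # a deeper second tree
]
print(predict(trees, np.array([1.0, 1.0])))         # ~0.52
```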