GBDT Source Code Walkthrough

I have recently been reading up on GBDT and found that many articles online are vague on the details, so I dug up a piece of source code on git.

import numpy as np
from GBDTReg import GBDT

class Config(object):
    learningRate = 0.1
    maxTreeLength = 4   # maximum tree depth
    maxLeafCount = 30
    maxTreeNum = 4      # number of trees in the ensemble

    def buildGbdt(self, x_train, y_train):
        x_train = np.array(x_train)
        y_train = np.array(y_train)

        # Start from a zero raw score before the first tree is built
        treePreviousValue = 0 * y_train
        treeValues = []
        treeValues.append(treePreviousValue)

        curValue = self.sigmoid(0 * y_train)  # initial prediction: sigmoid(0) = 0.5
        for i in range(self.maxTreeNum):
            print("the %i-th tree" % i)
            # Negative gradient of the log loss, scaled by the learning rate
            residualGradient = -1 * self.learningRate * (curValue - y_train)
            curTree = self.splitTree(x_train, residualGradient, 1)
            self.tree.append(curTree)  # self.tree is assumed to be initialized elsewhere
            print(curTree)
            # Add this tree's predictions to the accumulated raw score
            curTreeLeafNodeNum = self.getTreeLeafNodeNum(curTree)
            curTreeValue = []
            for singleX in x_train:
                xValue, xFeature = self.scanTree(curTree, singleX, curTreeLeafNodeNum)
                curTreeValue.append(xValue)

            treePreviousValue = np.array(curTreeValue) + treePreviousValue
            curValue = self.sigmoid(treePreviousValue)
            print(y_train)
            print("curValue")
            print(curValue)
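
buildGbdt also calls self.sigmoid, which the excerpt does not show; presumably it is the standard logistic function. A minimal sketch under that assumption:

    def sigmoid(self, x):
        # Standard logistic function: maps the accumulated raw score into (0, 1)
        return 1.0 / (1.0 + np.exp(-x))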


buildGbdt first loops over range(self.maxTreeNum), the number of trees to build. On each iteration it computes residualGradient, the negative gradient of the log loss scaled by the learning rate, and grows a tree to fit it.
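
For the logistic loss L = -[y*log(p) + (1-y)*log(1-p)] with p = sigmoid(F), the derivative with respect to the raw score F is p - y, so the negative gradient is y - p. A standalone numeric check of the residual computation:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

y_train = np.array([1.0, 0.0, 1.0])
curValue = sigmoid(np.zeros(3))                     # initial predictions: [0.5, 0.5, 0.5]
residualGradient = -1 * 0.1 * (curValue - y_train)  # learningRate * (y - p)
print(residualGradient)                             # [ 0.05 -0.05  0.05]

With the residuals in hand, splitTree grows a regression tree to fit them: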

    def splitTree(self, x_train, residualGradient, treeHeight):
        """
        :param x_train: training data
        :param residualGradient: the gradient residuals this tree must fit
        :param treeHeight: current depth of the tree
        :return: the subtree built for this node
        """
        size = len(x_train)
        dim = len(x_train[0])
        # Convention: the left subtree holds values <= the split point, the right subtree values > it
        bestSplitPointDim = -1
        bestSplitPointValue = -1
        curLoss = self.calculateSquareLoss(residualGradient)
        minLossValue = curLoss
        if treeHeight == self.maxTreeLength:
            # Maximum depth reached: return a leaf carrying the mean residual
            return np.mean(residualGradient)
        tree = dict([])
        # Try every (feature, value) pair as a candidate split point
        for i in range(dim):
            for j in range(size):
                splitNum = x_train[j, i]
                leftSubTree = []
                rightSubTree = []
                for k in range(size):
                    tmpNum = x_train[k, i]
                    if tmpNum <= splitNum:
                        leftSubTree.append(residualGradient[k])
                    else:
                        rightSubTree.append(residualGradient[k])
                sumLoss = 0.0
                sumLoss += self.calculateSquareLoss(np.array(leftSubTree))
                sumLoss += self.calculateSquareLoss(np.array(rightSubTree))
                if sumLoss < minLossValue:
                    bestSplitPointDim = i
                    bestSplitPointValue = splitNum
                    minLossValue = sumLoss
        # If no split lowers the loss, stop and return a leaf node: the mean residual
        if minLossValue == curLoss:
            return np.mean(residualGradient)
        else:
            # Partition the samples by the best split and recurse on each side
            leftSplit = [(x_train[i], residualGradient[i]) for i in range(size) if x_train[i, bestSplitPointDim] <= bestSplitPointValue]
            rightSplit = [(x_train[i], residualGradient[i]) for i in range(size) if x_train[i, bestSplitPointDim] > bestSplitPointValue]

            newLeftSubTree, newLeftResidual = zip(*leftSplit)
            leftTree = self.splitTree(np.array(newLeftSubTree), newLeftResidual, treeHeight + 1)

            newRightSubTree, newRightResidual = zip(*rightSplit)
            rightTree = self.splitTree(np.array(newRightSubTree), newRightResidual, treeHeight + 1)

            tree[(bestSplitPointDim, bestSplitPointValue)] = [leftTree, rightTree]
            return tree
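
splitTree leans on self.calculateSquareLoss, which is also not part of the excerpt. Since leaf nodes predict the mean residual, it presumably returns the squared deviation of a node's residuals from their mean; a minimal sketch under that assumption:

    def calculateSquareLoss(self, residual):
        # Assumed implementation: the total squared error of predicting the
        # node mean, i.e. sum((r - mean(r))^2); an empty node costs nothing.
        residual = np.asarray(residual)
        if residual.size == 0:
            return 0.0
        return np.sum((residual - np.mean(residual)) ** 2)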

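The returned tree is a nested dict: each internal node maps its (featureIndex, splitValue) key to a [leftChild, rightChild] list, and each leaf is a plain float, the mean residual of the samples that reach it. A depth-2 tree might look like this (the numbers are made up for illustration):

tree = {(0, 2.5): [                  # root: split on feature 0 at 2.5
    {(1, 0.7): [0.05, -0.03]},       # left child: split on feature 1 at 0.7
    -0.04,                           # right child: a leaf value
]}
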
Reading splitTree, you can see that the gradient-boosted tree is built recursively: at each node the code scans every (feature, value) pair for the split that most reduces the squared loss of the residuals, partitions the samples into left and right subtrees, and recurses until the loss no longer decreases or the maximum depth is reached. Each finished tree is appended via self.tree.append(curTree), and the final model is the sum of all the trees' outputs passed through the sigmoid.
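
As a hedged sketch of how such an ensemble scores a new sample (predictProba is a hypothetical helper, not in the excerpt; scanTree plays this role in the original code):

import numpy as np

def predictProba(trees, x):
    # Walk each tree's nested dict down to a leaf, sum the leaf values, and
    # squash the accumulated raw score with the sigmoid -- mirroring how
    # buildGbdt accumulates treePreviousValue and applies self.sigmoid.
    score = 0.0
    for tree in trees:
        node = tree
        while isinstance(node, dict):
            (dim, value), (left, right) = next(iter(node.items()))
            node = left if x[dim] <= value else right
        score += node
    return 1.0 / (1.0 + np.exp(-score))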

Source code link
