# GBDT原理与Sklearn源码分析-分类篇

## 正文：

GB的一些基本原理都已经在上文中介绍了，下面直接进入正题。

$$L(y_i, F_m(x_i)) = -\{\, y_i \log p_i + (1 - y_i)\log(1 - p_i) \,\}$$

其中 $p_i = \dfrac{1}{1 + e^{-F_m(x_i)}}$ 是第 $i$ 个样本为正类的预测概率。（先不代入负号，只推导大括号内的部分）

$\Rightarrow\; -y_i \log(1 + e^{-F_m(x_i)}) + (1 - y_i)\left\{\log(e^{-F_m(x_i)}) - \log(1 + e^{-F_m(x_i)})\right\}$

$\Rightarrow\; -y_i \log(1 + e^{-F_m(x_i)}) + \log(e^{-F_m(x_i)}) - \log(1 + e^{-F_m(x_i)}) - y_i \log(e^{-F_m(x_i)}) + y_i \log(1 + e^{-F_m(x_i)})$

$\Rightarrow\; y_i F_m(x_i) - \log(1 + e^{F_m(x_i)})$

再带上负号，即：

$$L(y_i, F_m(x_i)) = -\{y_i \log p_i + (1 - y_i)\log(1 - p_i)\} = -\{y_i F_m(x_i) - \log(1 + e^{F_m(x_i)})\}$$

**Algorithm 3: BinomialDeviance_TreeBoost**

$$F_0(x) = 0.5 \log\left(\frac{\sum_{i=1}^{N} y_i}{\sum_{i=1}^{N} (1 - y_i)}\right)$$

For $m = 1$ to $M$ do:

$$\tilde{y}_i = -\left[\frac{\partial L(y_i, F(x_i))}{\partial F(x_i)}\right]_{F(x) = F_{m-1}(x)} = y_i - \frac{1}{1 + e^{-F_{m-1}(x_i)}}$$

$$\{R_{jm}\}_1^J = J\text{-terminal node tree}\left(\{\tilde{y}_i, x_i\}_1^N\right)$$

$$\gamma_{jm} = \frac{\sum_{x_i \in R_{jm}} \tilde{y}_i}{\sum_{x_i \in R_{jm}} (y_i - \tilde{y}_i)(1 - y_i + \tilde{y}_i)}$$

$$F_m(x) = F_{m-1}(x) + \sum_{j=1}^{J} \gamma_{jm} I(x \in R_{jm})$$

## 实践

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $y_i$ | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 |

1. 以logloss为损失函数
2. 以MSE为分裂准则
3. 树的深度为1
4. 学习率为0.1

$$F_0(x) = \log\left(\frac{\sum_{i=1}^{N} y_i}{\sum_{i=1}^{N} (1 - y_i)}\right) = \log\left(\frac{4}{6}\right) = -0.4054$$

（注意：sklearn 的 `LogOddsEstimator` 实现不带 Algorithm 3 中的 0.5 系数。）

$$\tilde{y}_i = -\left[\frac{\partial L(y_i, F(x_i))}{\partial F(x_i)}\right]_{F(x) = F_{m-1}(x)} = y_i - \frac{1}{1 + e^{-F_{m-1}(x_i)}} = y_i - \frac{1}{1 + e^{-F_0(x_i)}}$$

例如第一个样本：$\tilde{y}_1 = 0 - \dfrac{1}{1 + e^{0.4054}} = -0.400$

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $\tilde{y}_i$ | -0.4 | -0.4 | -0.4 | 0.6 | 0.6 | -0.4 | -0.4 | -0.4 | 0.6 | 0.6 |

以 MSE 为分裂准则、深度为 1 的树分裂得到两个叶子区域：$R_{11}: x_i \le 8$，$R_{21}: x_i > 8$。

$$\sum_{x_i \in R_{11}} \tilde{y}_i = \tilde{y}_1 + \tilde{y}_2 + \cdots + \tilde{y}_8 = -1.2$$

$$\sum_{x_i \in R_{11}} (y_i - \tilde{y}_i)(1 - y_i + \tilde{y}_i) = \sum_{i=1}^{8} (y_i - \tilde{y}_i)(1 - y_i + \tilde{y}_i) = 1.92$$

$$\sum_{x_i \in R_{21}} \tilde{y}_i = \tilde{y}_9 + \tilde{y}_{10} = 1.2, \qquad \sum_{x_i \in R_{21}} (y_i - \tilde{y}_i)(1 - y_i + \tilde{y}_i) = 0.48$$

于是两个叶子的输出值为：

$$\gamma_{11} = \frac{-1.2}{1.92} = -0.625, \qquad \gamma_{21} = \frac{1.2}{0.48} = 2.5$$

加入学习率 $\eta$ 后的更新公式为：

$$F_m(x) = F_{m-1}(x) + \eta \sum_{j=1}^{J} \gamma_{jm} I(x \in R_{jm})$$

例如：$F_1(x_1) = F_0(x_1) + 0.1 \times (-0.625) = -0.4054 - 0.0625 = -0.4679$

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $F_1(x_i)$ | -0.46796511 | -0.46796511 | -0.46796511 | -0.46796511 | -0.46796511 | -0.46796511 | -0.46796511 | -0.46796511 | -0.15546511 | -0.15546511 |

第二轮迭代的残差为：$\tilde{y}_1 = y_1 - \dfrac{1}{1 + e^{-F_1(x_1)}} = 0 - 0.38509 = -0.38509$

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $\tilde{y}_i$ | -0.38509799 | -0.38509799 | -0.38509799 | 0.61490201 | 0.61490201 | -0.38509799 | -0.38509799 | -0.38509799 | 0.53878818 | 0.53878818 |

## 关于预测

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $F_2(x_i)$ | -0.52501722 | -0.52501722 | -0.52501722 | -0.52501722 | -0.52501722 | -0.52501722 | -0.52501722 | -0.52501722 | 0.06135501 | 0.06135501 |

对 $F_2(x)$ 取 sigmoid 变换 $p_i = \dfrac{1}{1 + e^{-F_2(x_i)}}$，可得下表：

| $x_i$ | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| $p_i$ | 0.37167979 | 0.37167979 | 0.37167979 | 0.37167979 | 0.37167979 | 0.37167979 | 0.37167979 | 0.37167979 | 0.51533394 | 0.51533394 |

（表中的概率为正样本的概率，即 $y_i = 1$ 的概率）

## Sklearn源码简单分析

``````class BinomialDeviance(ClassificationLossFunction):
"""Binomial deviance loss function for binary classification. Binary classification is a special case; here, we only need to fit one tree instead of ``n_classes`` trees. """
def __init__(self, n_classes):
if n_classes != 2:
raise ValueError("{0:s} requires 2 classes.".format(
self.__class__.__name__))
# we only need to fit one tree for binary clf.
super(BinomialDeviance, self).__init__(1)

def init_estimator(self):
return LogOddsEstimator()

def __call__(self, y, pred, sample_weight=None):
"""Compute the deviance (= 2 * negative log-likelihood). """
# logaddexp(0, v) == log(1.0 + exp(v))
pred = pred.ravel()
if sample_weight is None:
return -2.0 * np.mean((y * pred) - np.logaddexp(0.0, pred))
else:
return (-2.0 / sample_weight.sum() *
np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))

"""Compute the residual (= negative gradient). """
return y - expit(pred.ravel())

def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, pred, sample_weight):
"""Make a single Newton-Raphson step. our node estimate is given by: sum(w * (y - prob)) / sum(w * prob * (1 - prob)) we take advantage that: y - prob = residual """
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)

numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator

def _score_to_proba(self, score):
proba = np.ones((score.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(score.ravel())
proba[:, 0] -= proba[:, 1]
return proba

def _score_to_decision(self, score):
proba = self._score_to_proba(score)
return np.argmax(proba, axis=1)``````

``````    def negative_gradient(self, y, pred, **kargs):
"""Compute the residual (= negative gradient). """
return y - expit(pred.ravel())``````

``````    def _update_terminal_region(self, tree, terminal_regions, leaf, X, y, residual, pred, sample_weight):
"""Make a single Newton-Raphson step. our node estimate is given by: sum(w * (y - prob)) / sum(w * prob * (1 - prob)) we take advantage that: y - prob = residual """
terminal_region = np.where(terminal_regions == leaf)[0]
residual = residual.take(terminal_region, axis=0)
y = y.take(terminal_region, axis=0)
sample_weight = sample_weight.take(terminal_region, axis=0)

numerator = np.sum(sample_weight * residual)
denominator = np.sum(sample_weight * (y - residual) * (1 - y + residual))
# prevents overflow and division by zero
if abs(denominator) < 1e-150:
tree.value[leaf, 0, 0] = 0.0
else:
tree.value[leaf, 0, 0] = numerator / denominator``````

``````class LogOddsEstimator(object):
"""An estimator predicting the log odds ratio."""
scale = 1.0

def fit(self, X, y, sample_weight=None):
# pre-cond: pos, neg are encoded as 1, 0
if sample_weight is None:
pos = np.sum(y)
neg = y.shape[0] - pos
else:
pos = np.sum(sample_weight * y)
neg = np.sum(sample_weight * (1 - y))

if neg == 0 or pos == 0:
raise ValueError('y contains non binary labels.')
self.prior = self.scale * np.log(pos / neg)

def predict(self, X):
check_is_fitted(self, 'prior')

y = np.empty((X.shape[0], 1), dtype=np.float64)
y.fill(self.prior)
return y``````

``````    def fit(self, X, y, sample_weight=None):
# pre-cond: pos, neg are encoded as 1, 0
if sample_weight is None:
pos = np.sum(y)
neg = y.shape[0] - pos
else:
pos = np.sum(sample_weight * y)
neg = np.sum(sample_weight * (1 - y))

if neg == 0 or pos == 0:
raise ValueError('y contains non binary labels.')
self.prior = self.scale * np.log(pos / neg)``````

``````    def _score_to_proba(self, score):
proba = np.ones((score.shape[0], 2), dtype=np.float64)
proba[:, 1] = expit(score.ravel())
proba[:, 0] -= proba[:, 1]
return proba
``````

## 参考资料

http://docplayer.net/21448572-Generalized-boosted-models-a-guide-to-the-gbm-package.html（各种loss function的推导结果）