项目链接:
代码实例:
# -*- coding:UTF-8 -*-
# 导入需要用到的python包
import pandas as pd # 常用的数据分析包
import matplotlib as mpl # 可视化包
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # 数据集切分包
from xgboost import XGBClassifier # 算法包
import pickle # 用于保存模型的包
from sklearn.metrics import classification_report # 模型评价包(分数越高越好)
# Load the training (historical, labelled) data and the test data to score.
# NOTE(review): the "–" in the directory name may be a typographically
# mangled "--"; confirm against the real folder name on disk.
train_data = pd.read_csv(
    'F:/02 数据挖掘/05 Coding/二分类算法–提供银行精准营销解决方案/train_set.csv',
    encoding='utf-8',
    engine='python',
)
test_data = pd.read_csv(
    'F:/02 数据挖掘/05 Coding/二分类算法–提供银行精准营销解决方案/test_set.csv',
    encoding='utf-8',
    engine='python',
)
# Preprocessing (drop unusable rows/columns, separate the target, encode
# string columns, scale numeric columns). The training and test data must go
# through the same steps.
# 1. Drop rows containing missing values.
#    dropna() and reset_index() return new frames rather than modifying in
#    place, so the results are assigned back. The index is renumbered so the
#    remaining rows are labelled 0..n-1 before the ID column is captured.
#    NOTE: any row dropped from test_data gets no prediction in the output.
train_data = train_data.dropna().reset_index(drop=True)
test_data = test_data.dropna().reset_index(drop=True)
# To count missing cells instead of dropping them:
#     print(test_data.isna().sum().sum())
# Optional alternative: also discard extra columns together with ID:
#     train_data.drop(['ID', 'contact', 'day', 'month', 'duration', 'pdays'], axis=1, inplace=True)
#     test_data.drop(['ID', 'contact', 'day', 'month', 'duration', 'pdays'], axis=1, inplace=True)
# Keep the test IDs for the output file, then remove ID from both frames so
# it is not used as a feature.
ID = test_data['ID']
train_data.drop(['ID'], axis=1, inplace=True)
test_data.drop(['ID'], axis=1, inplace=True)
# 2. Separate the dependent variable (target) from the independent variables.
#    pop() returns the 'y' column and removes it from train_data in place.
y = train_data.pop('y')
# 3. One-hot encode the string-valued columns.
train_data_get_dummies = pd.get_dummies(train_data)
# Encoding the two frames separately can yield different columns when a
# category value appears in only one of them. Align the test frame to the
# training columns: indicator columns missing from the test set are filled
# with 0, and columns unseen during training are dropped.
test_data_get_dummies = pd.get_dummies(test_data).reindex(
    columns=train_data_get_dummies.columns, fill_value=0
)
# 4. Standardize every feature column (remove the mean, scale to unit
#    variance). This rescales the data; it does not make it normally
#    distributed.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data_get_dummies)
# Reuse the statistics learned from the training data. Re-fitting on the test
# set would scale it differently from what the model was trained on.
test_scaled = scaler.transform(test_data_get_dummies)
# Hold out part of the labelled data for evaluation: 75% is used for fitting
# and 25% for scoring (the train_test_split default, spelled out here).
x_train, x_test, y_train, y_test = train_test_split(
    train_scaled,
    y,
    test_size=0.25,
    random_state=1,
)
# Fit an XGBoost classifier on the training split. fit() returns the
# estimator itself, so clf and XGB refer to the same fitted model.
XGB = XGBClassifier(max_depth=8)
XGB.fit(x_train, y_train)
clf = XGB
# Persist the fitted model to a local file. The dump must happen inside the
# with-block so the file is still open and is closed afterwards.
with open('F:/02 数据挖掘/05 Coding/二分类算法–提供银行精准营销解决方案/XGB模型', 'wb') as f:
    pickle.dump(clf, f)
# Reload the saved model and score it on the held-out split.
# The file is read inside a with-block so the handle is always closed.
# NOTE: pickle.load executes arbitrary code from the file; only load model
# files that this script (or another trusted source) produced.
with open('F:/02 数据挖掘/05 Coding/二分类算法–提供银行精准营销解决方案/XGB模型', 'rb') as pickle_in:
    clf = pickle.load(pickle_in)
# Predicted class labels for the held-out rows.
pred_most_frequent = clf.predict(x_test)
# Per-class precision / recall / F1 report (higher is better).
print(classification_report(y_test, pred_most_frequent))
# Overall score reported by the classifier's score() on the held-out split.
score = clf.score(x_test, y_test)
print(score)
# Score the real test set and write one row per test ID.
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ -- presumably the positive class (y == 1); confirm against the
# label encoding of the training data.
# Debug aid: print(clf.predict_proba(test_scaled))
result = pd.DataFrame({
    'ID': ID,
    'pred': clf.predict_proba(test_scaled)[:, 1],
})
# index=False keeps the DataFrame's row index out of the CSV.
result.to_csv('F:/02 数据挖掘/05 Coding/二分类算法–提供银行精准营销解决方案/result.csv', index=False)
print(result)