import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
Split train and test
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection is the current module
# .ix is deprecated; iloc does the same positional slicing. The last column is the target.
# Note: the second call overwrites the first, so run the split for whichever dataframe you are modelling.
x_train, x_test, y_train, y_test = train_test_split(customer.iloc[:, :-1], customer.iloc[:, -1], test_size=0.2)
x_train, x_test, y_train, y_test = train_test_split(order.iloc[:, :-1], order.iloc[:, -1], test_size=0.2)
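If the target is a class label (as the RandomForestClassifier section below assumes), passing stratify keeps the label ratio equal in both halves. A minimal sketch, assuming customer's last column is that label:
# stratify preserves the class distribution; random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    customer.iloc[:, :-1], customer.iloc[:, -1],
    test_size=0.2, stratify=customer.iloc[:, -1], random_state=0)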
Pearson Correlation for Order
from scipy.stats import pearsonr
prr = []
for i in range(order.columns.size - 1):
    # pearsonr returns (correlation coefficient, two-sided p-value)
    frame = pearsonr(order.iloc[:, i], order.iloc[:, -1])
    prr.append(frame)
# order.columns[:-1] excludes the target so names and scores line up row for row
result = pd.concat([pd.DataFrame(order.columns[:-1].tolist()), pd.DataFrame(prr)], axis=1)
result.columns = ['Features', 'Pearson', 'Pvalue']
result
result.to_csv('order_pearson.csv', index=True, header=True)  # distinct filename so later sections do not overwrite it
Pearson Correlation for Customer
from scipy.stats import pearsonr
prr = []
for i in range(customer.columns.size - 1):
    frame = pearsonr(customer.iloc[:, i], customer.iloc[:, -1])
    prr.append(frame)
result = pd.concat([pd.DataFrame(customer.columns[:-1].tolist()), pd.DataFrame(prr)], axis=1)
result.columns = ['Features', 'Pearson', 'Pvalue']
result
result.to_csv('customer_pearson.csv', index=True, header=True)
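The table above can double as a first-pass filter; a minimal sketch, using a 0.05 p-value cutoff that is an assumed threshold, not from the original notes:
# Keep only features whose correlation with the target is significant at the 5% level.
selected = result.loc[result['Pvalue'] < 0.05, 'Features'].tolist()
print(len(selected), 'features pass the p-value filter')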
Random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
# Use the regressor for a continuous target ...
clf = RandomForestRegressor()
clf.fit(x_train, y_train)
# ... or the classifier for a categorical one. n_estimators sets the number of trees;
# n_jobs only controls how many CPU cores are used (-1 = all), not the tree count.
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(x_train, y_train)
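Neither fit above is evaluated; a minimal sketch of checking the model on the held-out split, assuming x_test/y_test come from the same dataframe the model was fit on. score() returns R² for the regressor and accuracy for the classifier.
print('train score:', clf.score(x_train, y_train))
print('test score:', clf.score(x_test, y_test))
y_pred = clf.predict(x_test)  # predictions on unseen rows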
MIC (Maximal Information Coefficient)
from minepy import MINE
m = MINE()
mic = []
for i in range(customer.columns.size - 1):
    # compute_score fits the estimator; mic() then returns the MIC statistic
    m.compute_score(customer.iloc[:, i].values, customer.iloc[:, -1].values)
    mic.append(m.mic())
result = pd.concat([pd.DataFrame(customer.columns[:-1].tolist()), pd.DataFrame(mic)], axis=1)
result.columns = ['Features', 'MIC']
result.to_csv('customer_mic.csv', index=True, header=True)
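MIC picks up nonlinear dependence that Pearson misses; a small self-contained check on synthetic data (not from the original notes):
import numpy as np
from minepy import MINE
from scipy.stats import pearsonr
x = np.linspace(-1, 1, 1000)
y = x ** 2                              # purely nonlinear relationship
m = MINE()
m.compute_score(x, y)
print('MIC:', m.mic())                  # close to 1
print('Pearson r:', pearsonr(x, y)[0])  # close to 0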
Feature Correlation
corr = customer.corr()  # full feature-feature correlation matrix
corr.to_csv('feature_corr.csv', index=True, header=True)
# Correlation of each column of cus_call with its target column 'tar':
tar_corr = lambda col: col.corr(cus_call['tar'])
cus_call.apply(tar_corr)
# Equivalent one-liner:
cus_call.corrwith(cus_call.tar)
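The correlation matrix is easier to scan as a heatmap; a minimal matplotlib sketch built on the corr frame computed above:
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.matshow(corr, vmin=-1, vmax=1, cmap='coolwarm')  # colour-code pairwise correlations
fig.colorbar(im)
ax.set_xticks(range(len(corr.columns)))
ax.set_yticks(range(len(corr.columns)))
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticklabels(corr.columns)
plt.show()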
Feature Importance
The importance score reflects each feature's influence: the larger it is, the bigger the role that feature plays in the classification.
importances = pd.DataFrame(
    sorted(zip(x_train.columns, np.round(clf.feature_importances_, 4)),
           key=lambda t: t[1], reverse=True))  # sort by importance value, not by feature name
importances.columns = ['Features', 'Importance']
importances.to_csv('feature_importance.csv', index=True, header=True)
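A bar chart makes the ranking easier to read; a minimal sketch built on the importances frame above (the top-20 cutoff is arbitrary):
top = importances.head(20)  # frame is already sorted by importance, descending
plt.figure(figsize=(8, 6))
plt.barh(top['Features'][::-1], top['Importance'][::-1])  # reverse so the largest bar sits at the top
plt.xlabel('Importance')
plt.title('Random forest feature importances')
plt.tight_layout()
plt.show()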