Split the data into training and test sets
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only so that very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Features are every column except the last; the last column is the target.
# `.iloc` is used instead of the removed `.ix` indexer: it is always
# positional, whereas `.ix` switched to label-based (end-inclusive) slicing
# when the column labels were integers, which would have included the target
# column in the features.
# 20% of the rows are held out for testing (test_size=0.2).
x_train, x_test, y_train, y_test = train_test_split(
    customer.iloc[:, :-1], customer.iloc[:, -1], test_size=0.2)
# NOTE(review): this second call rebinds the same four names, so only the
# `order` split is visible to the code that follows. Confirm whether the
# `customer` split above is still needed.
x_train, x_test, y_train, y_test = train_test_split(
    order.iloc[:, :-1], order.iloc[:, -1], test_size=0.2)
Decision tree
使用信息熵作为划分标准，对决策树进行训练 (Train the decision tree using information entropy as the split criterion.)
from sklearn import tree

# Build a decision-tree classifier that scores candidate splits by entropy.
clf = tree.DecisionTreeClassifier(criterion="entropy")
# Show the estimator's configuration before it is trained.
print(clf)
# Train on the training features and their labels.
clf.fit(X=x_train, y=y_train)
Predict
# Predict a label for every held-out row.
answer = pd.Series(clf.predict(x_test))
# np.c_ stacks the two label vectors side by side by position:
# column 0 holds the actual labels (y_test), column 1 the predictions.
result = pd.DataFrame(np.c_[y_test, answer])
# 1 where the prediction matches the actual label, 0 otherwise.
result['correct'] = np.where(result[0] == result[1], 1, 0)
# Accuracy is the mean of the 0/1 flags. The earlier form called `pd.count`,
# which does not exist in pandas, and used built-in sum()/count division,
# which would floor to an integer under Python 2.
result['correct'].mean()
# Column-wise sum divided by non-null count for every column; only the
# 'correct' entry is a meaningful ratio here.
result.sum()/result.count()
# Predict a label for every held-out row.
preds = clf.predict(x_test)
# Cross-tabulate counts: one row per actual label, one column per predicted label.
result = pd.crosstab(y_test, preds, rownames=['actual'], colnames=['preds'])
# Divide each row by its own total so that every row sums to 1.
res = result.div(result.sum(axis=1).astype(float), axis=0)
res
# Drop the predicted-label columns whose largest share in any row is below 0.5.
# The earlier `max(res) < 0.5` took the maximum over the column *labels*
# (iterating a DataFrame yields its column names) and handed a single boolean
# to drop(), which is presumably not what was intended.
res2 = res.drop(res.columns[res.max() < 0.5], axis=1)
# NOTE(review): the unfiltered table `res` is what gets written, not `res2`;
# confirm which one should be exported.
res.to_csv('result.csv', index=True, header=True)