聚类分析——Kmeans

导入数据

# Feature subsets of `customer`. Every subset keeps `wm_poi_id` as its first
# column so rows can be traced back after clustering.

# General attributes.
cus_general = customer[['wm_poi_id', 'city_type', 'pre_book', 'aor_type',
                        'is_selfpick_poi', 'is_selfpick_trade_poi']]

# Order-related features.
cus_ord = customer[['wm_poi_id', 'month_original_price', 'month_order_cnt',
                    'service_fee_30day', 'abnor_rate_30day']]

# Comment-related features. The original listing stored this subset in `cus`
# and overwrote it on the very next line, so it was lost; it now has its own
# name.
cus_comment = customer[['wm_poi_id', 'comment_1star', 'comment_5star',
                        'pic_comment_cnt']]

# Waybill (delivery) features.
cus_waybill = customer[['wm_poi_id',
                        'waybill_received_ratio',
                        'waybill_delivered_ratio',
                        'waybill_ontime_ratio',
                        'waybill_normal_arrived_delivery_total_interval_avg',
                        'waybill_normal_poi_push_interval_avg',
                        'waybill_normal_receive_interval_avg',
                        'waybill_normal_fetch_interval_avg',
                        'waybill_normal_delivery_interval_avg',
                        'waybill_delivery_ontime_ratio',
                        'loss_amt']]

# Combined feature set: `wm_poi_id` followed by the 20 features that are
# actually clustered.
cus_all = customer[['wm_poi_id', 'c5', 'ol_time', 'primary_first_tag_id', 'city_level',
                    'month_original_price', 'month_order_cnt', 'service_fee_30day', 'abnor_cnt_30day',
                    'comment_1star', 'comment_5star', 'pic_comment_cnt',
                    'area_30day', 'waybill_grab_5mins_ratio', 'waybill_delivered_ratio',
                    'waybill_normal_arrived_delivery_total_interval_avg',
                    'waybill_normal_receive_interval_avg',
                    'call.call_cnt', 'call.call_cnt_ord', 'call.call_cnt_poi', 'call.call_cnt_oth']]

预处理

from sklearn import preprocessing

# Standardise the features that are clustered in the following steps.
#
# Only the combined feature set (`cus_all`) is scaled here. The original
# listing scaled `cus_general`, `cus_ord` and `cus_all` into the same `cus`
# variable one after another (only the last survived) and then assigned five
# column-name lists of different lengths; the first of those (5 names on a
# 20-column frame) raises a length-mismatch ValueError.
# To cluster on another subset, replace `cus_all.iloc[:, 1:21]` with the
# feature columns of that subset.
features = cus_all.iloc[:, 1:21]  # skip the leading `wm_poi_id` column
cus = pd.DataFrame(
    preprocessing.scale(features),
    # Take the names from the source frame so they can never drift out of
    # sync with the selected columns.
    columns=features.columns,
)

计算K值从1到14对应的平均畸变程度:用scipy求解距离

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Elbow method: for each candidate k, fit KMeans on the scaled data `cus` and
# record the mean Euclidean distance from each sample to its nearest centre.
K = range(1, 15)
meandistortions = []
n_samples = cus.shape[0]
for k in K:
    # Fixed seed so the curve is reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(cus)
    # Distance of every sample to its closest cluster centre.
    nearest = np.min(cdist(cus, kmeans.cluster_centers_, 'euclidean'), axis=1)
    # Divide by the sample count: the plotted quantity is labelled as the
    # *mean* distortion, whereas the original appended the plain sum.
    meandistortions.append(nearest.sum() / n_samples)
plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel(u'平均畸变程度')
plt.title(u'用肘部法则来确定最佳的K值')

KMeans建模

from sklearn.cluster import KMeans

# Fit the final model with 12 clusters on the scaled data `cus`.
# A fixed seed keeps the cluster numbering stable between runs; without it
# the label ids change from run to run, so selecting a cluster by number
# later would pick a different group each time.
clf = KMeans(n_clusters=12, random_state=0)
clf.fit(cus)

# Number of samples per cluster (shown when run as a notebook cell).
pd.Series(clf.labels_).value_counts()

# Cluster centres in scaled feature space, one bar chart per feature.
centres = pd.DataFrame(clf.cluster_centers_)
centres.columns = cus_all.iloc[:, 1:21].columns
centres.plot(kind='bar', subplots=True, figsize=(6, 15))

# Within-cluster sum of squared distances of the fitted model.
clf.inertia_

# Attach the cluster label of every row as a new column.
#
# Two fixes relative to the original listing:
#   * it used pd.concat(..., axis=0), which appends the labels as extra rows
#     (all feature columns NaN) instead of adding a column;
#   * it called clf.fit_predict(cus) three times, refitting the model each
#     time, so the stored labels could disagree with each other and with the
#     centres inspected earlier. The labels of the already fitted model are
#     reused instead.
# `assign` writes positionally, so it does not depend on the frames' index,
# and it returns new frames rather than mutating slices of `customer`.
labels = clf.labels_

# NOTE(review): all three frames receive the labels of the single model
# fitted on `cus`. If 'general' / 'order' are meant to hold clusterings of
# their own feature subsets, refit on the corresponding scaled subset first.
cus_general = cus_general.assign(general=labels)
cus_ord = cus_ord.assign(order=labels)
cus_all = cus_all.assign(cluster=labels)

# Per-cluster means in the original (unscaled) units.
centres = cus_all.groupby(['cluster']).mean()

cus_all.to_csv('cluster.csv')

# Rows belonging to cluster 2.
result = cus_all[cus_all['cluster'] == 2]
    原文作者:叶青婧
    原文地址: https://segmentfault.com/a/1190000013554167
    本文转自网络文章,转载此文章仅为分享知识,如有侵权,请联系博主进行删除。
点赞