# 本文介绍该书第一章的项目:运用CART树进行规则挖掘,具体代码如下
# (This file walks through the chapter-1 project: rule mining with a CART tree.)
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import os
# In[2]:
# Read the raw dataset used for CART-tree rule mining.
data = pd.read_excel('./data_for_tree.xlsx')
data.head()  # quick peek at the first rows (notebook display)
# Put the Graphviz install dir on PATH so the fitted tree can be plotted later.
os.environ["PATH"] += os.pathsep + 'D:/'
# In[6]:
# Key / label columns carried through unchanged (ids, dates, class, target).
org_lst = ['uid', 'create_dt', 'oil_actv_dt', 'class_new', 'bad_ind']
# Numeric columns: aggregated per uid (count, sum, mean, ... ) below.
agg_lst = [
    'oil_amount',
    'discount_amount',
    'sale_amount',
    'amount',
    'pay_amount',
    'coupon_amount',
    'payment_coupon_amount',
]
# Discrete columns: per-uid distinct-value counts below.
dstc_lst = ['channel_code', 'oil_code', 'scene', 'source_app', 'call_source']
# In[7]:
# Working frame: keys/labels + numeric + discrete features, selected in ONE
# pass instead of the original three fragmented column-block assignments
# (same column order, fewer copies, no chained-assignment risk).
df = data[org_lst + agg_lst + dstc_lst].copy()

# Base table: user-level keys/labels, one row per uid (first occurrence wins).
base = df[org_lst].copy()
base = base.drop_duplicates(['uid'], keep='first')

# Accumulator for the derived numeric features built in the loop below.
gn = pd.DataFrame()
# In[24]:
# Per-uid aggregate features for every numeric column.  Each statistic becomes
# a new '<col><suffix>' column merged into gn on uid.  The original repeated
# the same groupby/apply/merge stanza nine times (and compared `gn.empty ==
# True`); the statistics are now table-driven, and grouping is done once per
# column on the Series (the original lambda also shadowed the outer `df`).
_stat_funcs = [
    ('_cnt', lambda s: len(s)),                               # row count (NaN rows included)
    ('_num', lambda s: (s > 0).sum()),                        # count of strictly positive values
    ('_tot', lambda s: np.nansum(s)),                         # NaN-skipping sum
    ('_avg', lambda s: np.nanmean(s)),                        # NaN-skipping mean
    ('_max', lambda s: np.nanmax(s)),                         # NaN-skipping max
    ('_min', lambda s: np.nanmin(s)),                         # NaN-skipping min
    ('_var', lambda s: np.nanvar(s)),                         # population variance (ddof=0)
    ('_ran', lambda s: np.nanmax(s) - np.nanmin(s)),          # range
    # mean/variance ratio; +0.01 smooths against zero variance
    ('_cva', lambda s: np.nanmean(s) / (np.nanvar(s) + 0.01)),
]
for i in agg_lst:
    grouped = df.groupby('uid')[i]
    for suffix, func in _stat_funcs:
        tp = grouped.apply(func).reset_index()
        tp.columns = ['uid', i + suffix]
        # the very first statistic seeds gn; the rest are left-joined on uid
        gn = tp if gn.empty else pd.merge(gn, tp, on='uid', how='left')
# In[25]:
# Discrete variables: number of distinct values per uid.  `len(set(...))` is
# kept deliberately (it counts NaN as a value, unlike Series.nunique()).
gc = pd.DataFrame()
for i in dstc_lst:
    tp = df.groupby('uid')[i].apply(lambda s: len(set(s))).reset_index()
    tp.columns = ['uid', i + '_dstc']
    # first column seeds gc; the rest are left-joined on uid
    gc = tp if gc.empty else pd.merge(gc, tp, on='uid', how='left')
# In[26]:
# Merge the two derived feature sets onto the per-uid base table.
# BUG FIX: the original merged gc a second time (pd.merge(fn, gc, ...)),
# which duplicated every *_dstc column with _x/_y suffixes.
fn = base.merge(gn, on='uid').merge(gc, on='uid')
fn.shape
# In[72]:
# Mine decision rules with a shallow CART regression tree: depth 2 keeps the
# rules readable, and the large leaf/split minimums guard against overfitting.
from sklearn import tree

dtree = tree.DecisionTreeRegressor(max_depth=2,
                                   min_samples_leaf=500,
                                   min_samples_split=5000)

# Features: everything except identifiers, dates and the labels.
x = fn.drop(['uid', 'oil_actv_dt', 'create_dt', 'bad_ind', 'class_new'], axis=1)
y = fn.bad_ind.copy()
x = x.fillna(0)  # sklearn cannot fit on NaNs, so impute with 0
dtree = dtree.fit(x, y)
# Rank features by the fitted tree's importance scores.  Uses a dedicated
# frame instead of rebinding `df`, which previously clobbered the working
# feature frame built earlier in the script.
importance = dtree.feature_importances_
imp_df = pd.DataFrame()
imp_df['名称'] = x.columns
imp_df['特征重要性'] = importance
# ascending=True lists the most important features last (notebook display)
imp_df.sort_values(by='特征重要性', ascending=True)
# Export the fitted tree to Graphviz DOT and build a plottable graph object.
from io import StringIO
import os
import pydotplus  # BUG FIX: used below but never imported (NameError before)

os.environ["PATH"] += os.pathsep + 'D:/'  # make the Graphviz binaries findable
dot_data = StringIO()
# NOTE: the original passed class_names=['bad_ind'], but class_names only
# applies to classifiers; this is a DecisionTreeRegressor, so it is dropped.
tree.export_graphviz(dtree, out_file=dot_data,
                     feature_names=x.columns,
                     filled=True, rounded=True,
                     special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# In[70]:
from IPython.display import Image
# In[71]:
# Render the Graphviz tree as a PNG inline; the mined rules can be read
# directly off the plotted splits.
Image(graph.create_png())# the resulting image yields the classification rules