Same data as in the logistic-regression exercise: the Otto product classification dataset. We call default-parameter CART, CART + GridSearchCV, and Random Forest + GridSearchCV in turn for hyperparameter tuning, and also use feature_importances_ for feature selection.
# Python 2.7
# Import the required modules
import pandas as pd
import numpy as np

# The competition's evaluation metric is logloss
from sklearn.metrics import log_loss
from sklearn import metrics

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Load the data
dpath = './logistic/'
train = pd.read_csv(dpath + "Otto_train_test.csv")
train.head()
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701 entries, 0 to 700
Data columns (total 95 columns):
id         701 non-null int64
feat_1     701 non-null int64
feat_2     701 non-null int64
...        (feat_3 through feat_92, all 701 non-null int64)
feat_93    701 non-null int64
target     701 non-null object
dtypes: int64(94), object(1)
memory usage: 520.3+ KB
# Summary statistics of each feature
train.describe()
# Target distribution: check whether the classes are balanced.
# When the classes are imbalanced, cross-validation for classification should use
# StratifiedKFold, which samples each fold in proportion to the class frequencies;
# this is the default in scikit-learn's cross-validation utilities for classifiers.
sns.countplot(train.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')
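A minimal sketch (not part of the original notebook) to verify the stratification claim: each validation fold produced by StratifiedKFold mirrors the overall class distribution. It assumes every class has at least 10 samples.

# Sketch: check that StratifiedKFold keeps per-fold class proportions
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
for tr_idx, va_idx in skf.split(np.zeros(len(train)), train.target):
    # the class mix inside one validation fold mirrors the full data set
    print(train.target.iloc[va_idx].value_counts(normalize=True).round(2))
    break  # one fold is enough for the check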
# Convert the class labels from strings to integers
y_train = train['target']                    # labels have the form Class_x
y_train = y_train.map(lambda s: s[6:])
y_train = y_train.map(lambda s: int(s) - 1)

train = train.drop(["id", "target"], axis=1)
columns = train.columns
X_train = np.array(train)
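As an aside, scikit-learn's LabelEncoder gives the same 0-based codes, since the lexicographic sort of 'Class_1'..'Class_9' matches their numeric order. A self-contained sketch:

# Sketch: equivalent label encoding via LabelEncoder
from sklearn.preprocessing import LabelEncoder

labels = ['Class_%d' % i for i in range(1, 10)]
le = LabelEncoder()
le.fit(labels)
print(le.transform(['Class_2', 'Class_1', 'Class_9']))  # [1 0 8]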
# Standardize the data
from sklearn.preprocessing import StandardScaler

# Initialize the feature scaler
ss_X = StandardScaler()

# Standardize the training features (trees are insensitive to monotonic scaling,
# but this keeps the pipeline consistent with the logistic-regression baseline)
X_train = ss_X.fit_transform(X_train)

# Cross-validation is used throughout, so no separate test split is made
#X_test = ss_X.transform(X_test)
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
# Cross-validation is used both to evaluate performance and to tune hyperparameters
# (model selection). For classification it defaults to StratifiedKFold.
# Scoring must be "greater is better", so neg_log_loss is used and negated for display.
from sklearn.model_selection import cross_val_score

loss = cross_val_score(model_tree, X_train, y_train, cv=10, scoring='neg_log_loss')
print 'logloss of each fold is:', -loss
print 'cv logloss mean is:', -loss.mean()
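To make the default explicit, the following sketch (an addition, not in the original) passes a StratifiedKFold object as cv; for a classifier this is equivalent to the integer cv=10 above.

# Sketch: cv=10 for a classifier is equivalent to an explicit StratifiedKFold
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
loss_skf = cross_val_score(model_tree, X_train, y_train, cv=skf, scoring='neg_log_loss')
print('cv logloss mean with explicit StratifiedKFold: {}'.format(-loss_skf.mean()))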
# Inspect the feature importances; a larger value marks a more important feature
# (feature_importances_ is already non-negative, so no abs() is needed)
df = pd.DataFrame({"columns": list(columns), "importance": list(model_tree.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
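A quick visual sketch of the same table (seaborn is already imported above); only the ten largest importances are shown.

# Sketch: bar plot of the ten largest importances
top10 = df.sort_values(by='importance', ascending=False).head(10)
sns.barplot(x='importance', y='columns', data=top10)
pyplot.title('Top-10 feature importances (default CART)')
pyplot.show()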
The decision tree's hyperparameters are: max_depth (tree depth) or max_leaf_nodes (number of leaf nodes), max_features (maximum number of features considered per split), min_samples_leaf (minimum number of samples at a leaf), min_samples_split (minimum number of samples required to split an internal node), min_weight_fraction_leaf (minimum fraction of the total sample weight at a leaf), and min_impurity_split (minimum impurity required to split), all of which can be tuned. Here we tune max_depth as an example; a fuller grid is sketched below.
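For reference, a hedged sketch of what a joint grid over several of the knobs listed above could look like; runtime grows multiplicatively with every added dimension, which is why the cells that follow stick to max_depth alone.

# Sketch: a fuller joint grid (10 * 5 * 3 = 150 settings at cv=10)
tuned_parameters_full = dict(max_depth=range(5, 15),
                             min_samples_leaf=range(1, 10, 2),
                             max_features=['sqrt', 0.5, None])
# GridSearchCV(DecisionTreeClassifier(), tuned_parameters_full,
#              scoring='neg_log_loss', cv=10) would search this grid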
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

# Set up the parameter grid to search
max_depth = range(5, 15, 1)
#min_samples_leaf = range(1, 10, 2)
tuned_parameters = dict(max_depth=max_depth)
# GridSearchCV
from sklearn.model_selection import GridSearchCV

DD = GridSearchCV(model_DD, tuned_parameters, scoring='neg_log_loss', cv=10)
DD.fit(X_train, y_train)
print("Best : %f using %s" % (-DD.best_score_ , DD.best_params_))
# Plot mean CV logloss against max_depth
test_means = -DD.cv_results_['mean_test_score']
x_axis = range(5, test_means.shape[0] + 5)
pyplot.plot(x_axis, test_means)
pyplot.title("max_depth vs Log Loss")
pyplot.xlabel('max_depth')
pyplot.ylabel('Log Loss')
pyplot.show()
# Feature importances of the best estimator found by the grid search
#print DD.best_estimator_.feature_importances_
df = pd.DataFrame({"columns": list(columns), "importance": list(DD.best_estimator_.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
Drop the features whose importance above is 0.
from sklearn.feature_selection import SelectFromModel

# SelectFromModel keeps the features whose score exceeds a threshold;
# it reads coef_ for linear models and feature_importances_ for tree-based models
model = SelectFromModel(DD.best_estimator_, prefit=True, threshold=0.01)  # threshold: the selection cutoff
X_Train_new = model.transform(X_train)
X_Train_new.shape
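To see which features survived, SelectFromModel.get_support() returns a boolean mask over the input columns; a short sketch (an addition for illustration):

# Sketch: list the names of the retained features
mask = model.get_support()
selected_features = [c for c, keep in zip(columns, mask) if keep]
print(selected_features)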
model_tuned_tree = DecisionTreeClassifier(max_depth = DD.best_params_['max_depth'])

from sklearn.model_selection import cross_val_score
loss = cross_val_score(model_tuned_tree, X_Train_new, y_train, cv=10, scoring='neg_log_loss')
print 'logloss of each fold is:', -loss
print 'cv logloss is:', -loss.mean()
Decision trees train quickly, so the monitoring loop is demonstrated here. The Random Forest part can follow the same pattern (a sketch appears at the end of this section), but be prepared for a much longer training time.
import time

# Run feature selection at one threshold, then train and cross-validate the tuned
# tree on the reduced feature set; returns the CV logloss for that threshold
def one_feature_selection(threshold, X_train, y_train):
    start = time.time()
    print("threshold: {}".format(threshold))
    # Reduce the training set to the selected features
    model = SelectFromModel(DD.best_estimator_, threshold=threshold, prefit=True)
    X_Train_new = model.transform(X_train)
    print X_Train_new.shape
    model_tuned_tree = DecisionTreeClassifier(max_depth=DD.best_params_['max_depth'])
    loss = cross_val_score(model_tuned_tree, X_Train_new, y_train, cv=10, scoring='neg_log_loss')
    cvloss = -loss.mean()
    end = time.time()
    print("cvloss:{}".format(cvloss))
    print("elapsed: {:.1f}s".format(end - start))  # monitor the run time of each step
    return cvloss
# Set up the threshold sweep: each step drops the least important ~5% of features
cols = df.shape[0]        # total number of features
step = int(cols * 0.05)
# The importances must be sorted (descending) so that position cols - i*step
# is the cutoff below which the least important i*5% of features fall
sorted_importance = df.importance.sort_values(ascending=False).reset_index(drop=True)
threshold_s = np.zeros(19)
losses = []
for i in range(1, 20, 1):
    threshold_s[i - 1] = sorted_importance[cols - i * step]
    tmp = one_feature_selection(threshold_s[i - 1], X_train, y_train)
    losses.append(tmp)
out:
threshold: 0.157906991761   (701L, 2L)    cvloss:2.72968407471
threshold: 0.0              (701L, 93L)   cvloss:2.68337656155
threshold: 0.0              (701L, 93L)   cvloss:2.68850684525
threshold: 0.0              (701L, 93L)   cvloss:2.58999143151
threshold: 0.0              (701L, 93L)   cvloss:2.63796195428
threshold: 0.0352315956308  (701L, 7L)    cvloss:2.41340774533
threshold: 0.0              (701L, 93L)   cvloss:2.64053632248
threshold: 0.126966902673   (701L, 5L)    cvloss:2.57437674006
threshold: 0.0              (701L, 93L)   cvloss:2.59256579971
threshold: 0.0              (701L, 93L)   cvloss:2.54733846624
threshold: 0.0              (701L, 93L)   cvloss:2.58993127805
threshold: 0.0              (701L, 93L)   cvloss:2.63796195428
threshold: 0.0              (701L, 93L)   cvloss:2.59017870531
threshold: 0.0              (701L, 93L)   cvloss:2.54942414114
threshold: 0.293529089618   (701L, 1L)    cvloss:2.43123798541
threshold: 0.0              (701L, 93L)   cvloss:2.64053632248
threshold: 0.0              (701L, 93L)   cvloss:2.64811857035
threshold: 0.0              (701L, 93L)   cvloss:2.59739466391
threshold: 0.0              (701L, 93L)   cvloss:2.64072359628
threshold_s
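The recorded losses can also be plotted against the sweep step; a short sketch (an addition, reusing the losses list built above):

# Sketch: CV logloss across the 19 threshold-sweep steps
pyplot.plot(range(1, 20), losses)
pyplot.title('Feature-selection threshold sweep')
pyplot.xlabel('sweep step (each step drops ~5% more features)')
pyplot.ylabel('CV Log Loss')
pyplot.show()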
Beyond the hyperparameters it shares with a single decision tree, the random forest adds n_estimators (the number of weak learners), so below max_depth and n_estimators are tuned together.
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()

# Set up the parameter grid to search
max_depth = range(10, 30, 2)
n_estimators = range(10, 20, 2)
tuned_parameters = dict(max_depth=max_depth, n_estimators=n_estimators)
from sklearn.model_selection import GridSearchCV

RR = GridSearchCV(model_RR, tuned_parameters, scoring='neg_log_loss', cv=10)
RR.fit(X_train, y_train)
print("Best: %f using %s" % (-RR.best_score_ , RR.best_params_))
The random forest does much better than a single tree.
# grid_scores_ comes from the older grid_search-style API;
# on recent scikit-learn versions read RR.cv_results_ instead
RR.grid_scores_
out:
[mean: -1.35386, std: 0.19609, params: {'n_estimators': 10, 'max_depth': 10},
 mean: -1.22500, std: 0.07677, params: {'n_estimators': 12, 'max_depth': 10},
 mean: -1.23760, std: 0.08164, params: {'n_estimators': 14, 'max_depth': 10},
 mean: -1.23707, std: 0.05659, params: {'n_estimators': 16, 'max_depth': 10},
 mean: -1.23354, std: 0.07110, params: {'n_estimators': 18, 'max_depth': 10},
 mean: -1.39196, std: 0.26465, params: {'n_estimators': 10, 'max_depth': 12},
 mean: -1.21759, std: 0.09212, params: {'n_estimators': 12, 'max_depth': 12},
 mean: -1.20984, std: 0.17629, params: {'n_estimators': 14, 'max_depth': 12},
 mean: -1.28113, std: 0.20810, params: {'n_estimators': 16, 'max_depth': 12},
 mean: -1.16538, std: 0.08866, params: {'n_estimators': 18, 'max_depth': 12},
 mean: -1.45142, std: 0.31450, params: {'n_estimators': 10, 'max_depth': 14},
 mean: -1.40571, std: 0.41349, params: {'n_estimators': 12, 'max_depth': 14},
 mean: -1.37168, std: 0.45835, params: {'n_estimators': 14, 'max_depth': 14},
 mean: -1.24758, std: 0.18312, params: {'n_estimators': 16, 'max_depth': 14},
 mean: -1.21770, std: 0.20019, params: {'n_estimators': 18, 'max_depth': 14},
 mean: -2.00128, std: 0.33793, params: {'n_estimators': 10, 'max_depth': 16},
 mean: -1.55623, std: 0.41455, params: {'n_estimators': 12, 'max_depth': 16},
 mean: -1.42701, std: 0.33239, params: {'n_estimators': 14, 'max_depth': 16},
 mean: -1.20207, std: 0.21551, params: {'n_estimators': 16, 'max_depth': 16},
 mean: -1.20576, std: 0.29579, params: {'n_estimators': 18, 'max_depth': 16},
 mean: -2.02903, std: 0.49022, params: {'n_estimators': 10, 'max_depth': 18},
 mean: -1.63538, std: 0.33508, params: {'n_estimators': 12, 'max_depth': 18},
 mean: -1.66119, std: 0.51923, params: {'n_estimators': 14, 'max_depth': 18},
 mean: -1.43268, std: 0.18889, params: {'n_estimators': 16, 'max_depth': 18},
 mean: -1.43009, std: 0.37455, params: {'n_estimators': 18, 'max_depth': 18},
 mean: -2.51668, std: 0.75125, params: {'n_estimators': 10, 'max_depth': 20},
 mean: -2.20780, std: 0.75028, params: {'n_estimators': 12, 'max_depth': 20},
 mean: -1.59372, std: 0.59536, params: {'n_estimators': 14, 'max_depth': 20},
 mean: -1.56941, std: 0.45555, params: {'n_estimators': 16, 'max_depth': 20},
 mean: -1.32084, std: 0.36734, params: {'n_estimators': 18, 'max_depth': 20},
 mean: -2.62570, std: 1.06356, params: {'n_estimators': 10, 'max_depth': 22},
 mean: -2.11879, std: 0.42273, params: {'n_estimators': 12, 'max_depth': 22},
 mean: -1.64756, std: 0.52801, params: {'n_estimators': 14, 'max_depth': 22},
 mean: -1.46009, std: 0.42005, params: {'n_estimators': 16, 'max_depth': 22},
 mean: -1.53584, std: 0.43831, params: {'n_estimators': 18, 'max_depth': 22},
 mean: -2.51453, std: 0.89887, params: {'n_estimators': 10, 'max_depth': 24},
 mean: -1.99431, std: 0.74647, params: {'n_estimators': 12, 'max_depth': 24},
 mean: -1.44960, std: 0.43270, params: {'n_estimators': 14, 'max_depth': 24},
 mean: -1.89192, std: 0.47620, params: {'n_estimators': 16, 'max_depth': 24},
 mean: -1.59524, std: 0.41647, params: {'n_estimators': 18, 'max_depth': 24},
 mean: -2.62363, std: 0.94986, params: {'n_estimators': 10, 'max_depth': 26},
 mean: -2.37050, std: 0.68064, params: {'n_estimators': 12, 'max_depth': 26},
 mean: -2.08027, std: 0.61122, params: {'n_estimators': 14, 'max_depth': 26},
 mean: -1.89457, std: 0.45814, params: {'n_estimators': 16, 'max_depth': 26},
 mean: -1.76486, std: 0.78120, params: {'n_estimators': 18, 'max_depth': 26},
 mean: -3.14444, std: 1.06844, params: {'n_estimators': 10, 'max_depth': 28},
 mean: -2.47574, std: 0.90047, params: {'n_estimators': 12, 'max_depth': 28},
 mean: -1.91078, std: 0.39219, params: {'n_estimators': 14, 'max_depth': 28},
 mean: -1.81819, std: 0.48838, params: {'n_estimators': 16, 'max_depth': 28},
 mean: -1.85914, std: 0.70752, params: {'n_estimators': 18, 'max_depth': 28}]
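Finally, a sketch of the Random Forest feature-selection step promised earlier ("the Random Forest part can follow the same pattern"); it reuses the objects defined above and is an illustrative addition, not a recorded run. Expect a much longer run time than with the single tree.

# Sketch: SelectFromModel with the tuned forest, then CV on the reduced features
best_rf = RR.best_estimator_
rf_selector = SelectFromModel(best_rf, prefit=True, threshold=0.01)
X_train_rf = rf_selector.transform(X_train)
print(X_train_rf.shape)

model_tuned_rf = RandomForestClassifier(max_depth=RR.best_params_['max_depth'],
                                        n_estimators=RR.best_params_['n_estimators'])
loss = cross_val_score(model_tuned_rf, X_train_rf, y_train, cv=10, scoring='neg_log_loss')
print('cv logloss with selected features: {}'.format(-loss.mean()))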