Same data as in the logistic-regression exercise: the Otto product classification dataset. We call default-parameter CART, CART + GridSearchCV, and Random Forest + GridSearchCV in turn for hyperparameter tuning, and also use feature_importances_ for feature selection.
# Python 2.7
# Import the required modules
import pandas as pd
import numpy as np

# The competition's evaluation metric is logloss
from sklearn.metrics import log_loss
from sklearn import metrics

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Load the data
dpath = './logistic/'
train = pd.read_csv(dpath + "Otto_train_test.csv")
train.head()
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701 entries, 0 to 700
Data columns (total 95 columns):
id         701 non-null int64
feat_1     701 non-null int64
feat_2     701 non-null int64
...        (feat_3 through feat_92, all 701 non-null int64)
feat_93    701 non-null int64
target     701 non-null object
dtypes: int64(94), object(1)
memory usage: 520.3+ KB
# Summary statistics of each feature
train.describe()
# Target distribution: check whether the classes are balanced.
# When the classes are imbalanced, cross-validation for classification should use
# StratifiedKFold, which samples each fold in proportion to the class frequencies;
# this is the default in scikit-learn's cross-validation utilities for classifiers.
sns.countplot(train.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')
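A minimal sketch (not part of the original notebook) to verify the stratification claim: each validation fold produced by StratifiedKFold mirrors the overall class distribution. It assumes every class has at least 10 samples.

# Sketch: check that StratifiedKFold keeps per-fold class proportions
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
for tr_idx, va_idx in skf.split(np.zeros(len(train)), train.target):
    # the class mix inside one validation fold mirrors the full data set
    print(train.target.iloc[va_idx].value_counts(normalize=True).round(2))
    break  # one fold is enough for the check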
# Convert the class labels from strings to integers
y_train = train['target']                    # labels have the form Class_x
y_train = y_train.map(lambda s: s[6:])
y_train = y_train.map(lambda s: int(s) - 1)

train = train.drop(["id", "target"], axis=1)
columns = train.columns
X_train = np.array(train)
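As an aside, scikit-learn's LabelEncoder gives the same 0-based codes, since the lexicographic sort of 'Class_1'..'Class_9' matches their numeric order. A self-contained sketch:

# Sketch: equivalent label encoding via LabelEncoder
from sklearn.preprocessing import LabelEncoder

labels = ['Class_%d' % i for i in range(1, 10)]
le = LabelEncoder()
le.fit(labels)
print(le.transform(['Class_2', 'Class_1', 'Class_9']))  # [1 0 8]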
# Standardize the data
from sklearn.preprocessing import StandardScaler

# Initialize the feature scaler
ss_X = StandardScaler()

# Standardize the training features (trees are insensitive to monotonic scaling,
# but this keeps the pipeline consistent with the logistic-regression baseline)
X_train = ss_X.fit_transform(X_train)

# Cross-validation is used throughout, so no separate test split is made
#X_test = ss_X.transform(X_test)
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
# Cross-validation is used both to evaluate performance and to tune hyperparameters
# (model selection). For classification it defaults to StratifiedKFold.
# Scoring must be "greater is better", so neg_log_loss is used and negated for display.
from sklearn.model_selection import cross_val_score

loss = cross_val_score(model_tree, X_train, y_train, cv=10, scoring='neg_log_loss')
print 'logloss of each fold is:', -loss
print 'cv logloss mean is:', -loss.mean()
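To make the default explicit, the following sketch (an addition, not in the original) passes a StratifiedKFold object as cv; for a classifier this is equivalent to the integer cv=10 above.

# Sketch: cv=10 for a classifier is equivalent to an explicit StratifiedKFold
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)
loss_skf = cross_val_score(model_tree, X_train, y_train, cv=skf, scoring='neg_log_loss')
print('cv logloss mean with explicit StratifiedKFold: {}'.format(-loss_skf.mean()))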
# Inspect the feature importances; a larger value marks a more important feature
# (feature_importances_ is already non-negative, so no abs() is needed)
df = pd.DataFrame({"columns": list(columns), "importance": list(model_tree.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
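A quick visual sketch of the same table (seaborn is already imported above); only the ten largest importances are shown.

# Sketch: bar plot of the ten largest importances
top10 = df.sort_values(by='importance', ascending=False).head(10)
sns.barplot(x='importance', y='columns', data=top10)
pyplot.title('Top-10 feature importances (default CART)')
pyplot.show()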
The decision tree's hyperparameters are: max_depth (tree depth) or max_leaf_nodes (number of leaf nodes), max_features (maximum number of features considered per split), min_samples_leaf (minimum number of samples at a leaf), min_samples_split (minimum number of samples required to split an internal node), min_weight_fraction_leaf (minimum fraction of the total sample weight at a leaf), and min_impurity_split (minimum impurity required to split), all of which can be tuned. Here we tune max_depth as an example; a fuller grid is sketched below.
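For reference, a hedged sketch of what a joint grid over several of the knobs listed above could look like; runtime grows multiplicatively with every added dimension, which is why the cells that follow stick to max_depth alone.

# Sketch: a fuller joint grid (10 * 5 * 3 = 150 settings at cv=10)
tuned_parameters_full = dict(max_depth=range(5, 15),
                             min_samples_leaf=range(1, 10, 2),
                             max_features=['sqrt', 0.5, None])
# GridSearchCV(DecisionTreeClassifier(), tuned_parameters_full,
#              scoring='neg_log_loss', cv=10) would search this grid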
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

# Set up the parameter grid to search
max_depth = range(5, 15, 1)
#min_samples_leaf = range(1, 10, 2)
tuned_parameters = dict(max_depth=max_depth)
# GridSearchCV
from sklearn.model_selection import GridSearchCV

DD = GridSearchCV(model_DD, tuned_parameters, scoring='neg_log_loss', cv=10)
DD.fit(X_train, y_train)
print("Best : %f using %s" % (-DD.best_score_ , DD.best_params_))
# Plot mean CV logloss against max_depth
test_means = -DD.cv_results_['mean_test_score']
x_axis = range(5, test_means.shape[0] + 5)
pyplot.plot(x_axis, test_means)
pyplot.title("max_depth vs Log Loss")
pyplot.xlabel('max_depth')
pyplot.ylabel('Log Loss')
pyplot.show()
# Feature importances of the best estimator found by the grid search
#print DD.best_estimator_.feature_importances_
df = pd.DataFrame({"columns": list(columns), "importance": list(DD.best_estimator_.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
Drop the features whose importance above is 0.
from sklearn.feature_selection import SelectFromModel

# SelectFromModel keeps the features whose score exceeds a threshold;
# it reads coef_ for linear models and feature_importances_ for tree-based models
model = SelectFromModel(DD.best_estimator_, prefit=True, threshold=0.01)  # threshold: the selection cutoff
X_Train_new = model.transform(X_train)
X_Train_new.shape
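To see which features survived, SelectFromModel.get_support() returns a boolean mask over the input columns; a short sketch (an addition for illustration):

# Sketch: list the names of the retained features
mask = model.get_support()
selected_features = [c for c, keep in zip(columns, mask) if keep]
print(selected_features)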
model_tuned_tree = DecisionTreeClassifier(max_depth = DD.best_params_['max_depth'])

from sklearn.model_selection import cross_val_score
loss = cross_val_score(model_tuned_tree, X_Train_new, y_train, cv=10, scoring='neg_log_loss')
print 'logloss of each fold is:', -loss
print 'cv logloss is:', -loss.mean()
Decision trees train quickly, so the monitoring loop is demonstrated here. The Random Forest part can follow the same pattern (a sketch appears at the end of this section), but be prepared for a much longer training time.
import time

# Run feature selection at one threshold, then train and cross-validate the tuned
# tree on the reduced feature set; returns the CV logloss for that threshold
def one_feature_selection(threshold, X_train, y_train):
    start = time.time()
    print("threshold: {}".format(threshold))
    # Reduce the training set to the selected features
    model = SelectFromModel(DD.best_estimator_, threshold=threshold, prefit=True)
    X_Train_new = model.transform(X_train)
    print X_Train_new.shape
    model_tuned_tree = DecisionTreeClassifier(max_depth=DD.best_params_['max_depth'])
    loss = cross_val_score(model_tuned_tree, X_Train_new, y_train, cv=10, scoring='neg_log_loss')
    cvloss = -loss.mean()
    end = time.time()
    print("cvloss:{}".format(cvloss))
    print("elapsed: {:.1f}s".format(end - start))  # monitor the run time of each step
    return cvloss
# Set up the threshold sweep: each step drops the least important ~5% of features
cols = df.shape[0]        # total number of features
step = int(cols * 0.05)
# The importances must be sorted (descending) so that position cols - i*step
# is the cutoff below which the least important i*5% of features fall
sorted_importance = df.importance.sort_values(ascending=False).reset_index(drop=True)
threshold_s = np.zeros(19)
losses = []
for i in range(1, 20, 1):
    threshold_s[i - 1] = sorted_importance[cols - i * step]
    tmp = one_feature_selection(threshold_s[i - 1], X_train, y_train)
    losses.append(tmp)
out:
threshold: 0.157906991761   (701L, 2L)    cvloss:2.72968407471
threshold: 0.0              (701L, 93L)   cvloss:2.68337656155
threshold: 0.0              (701L, 93L)   cvloss:2.68850684525
threshold: 0.0              (701L, 93L)   cvloss:2.58999143151
threshold: 0.0              (701L, 93L)   cvloss:2.63796195428
threshold: 0.0352315956308  (701L, 7L)    cvloss:2.41340774533
threshold: 0.0              (701L, 93L)   cvloss:2.64053632248
threshold: 0.126966902673   (701L, 5L)    cvloss:2.57437674006
threshold: 0.0              (701L, 93L)   cvloss:2.59256579971
threshold: 0.0              (701L, 93L)   cvloss:2.54733846624
threshold: 0.0              (701L, 93L)   cvloss:2.58993127805
threshold: 0.0              (701L, 93L)   cvloss:2.63796195428
threshold: 0.0              (701L, 93L)   cvloss:2.59017870531
threshold: 0.0              (701L, 93L)   cvloss:2.54942414114
threshold: 0.293529089618   (701L, 1L)    cvloss:2.43123798541
threshold: 0.0              (701L, 93L)   cvloss:2.64053632248
threshold: 0.0              (701L, 93L)   cvloss:2.64811857035
threshold: 0.0              (701L, 93L)   cvloss:2.59739466391
threshold: 0.0              (701L, 93L)   cvloss:2.64072359628
threshold_s
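The recorded losses can also be plotted against the sweep step; a short sketch (an addition, reusing the losses list built above):

# Sketch: CV logloss across the 19 threshold-sweep steps
pyplot.plot(range(1, 20), losses)
pyplot.title('Feature-selection threshold sweep')
pyplot.xlabel('sweep step (each step drops ~5% more features)')
pyplot.ylabel('CV Log Loss')
pyplot.show()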
Beyond the hyperparameters it shares with a single decision tree, the random forest adds n_estimators (the number of weak learners), so below max_depth and n_estimators are tuned together.
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()

# Set up the parameter grid to search
max_depth = range(10, 30, 2)
n_estimators = range(10, 20, 2)
tuned_parameters = dict(max_depth=max_depth, n_estimators=n_estimators)
from sklearn.model_selection import GridSearchCV

RR = GridSearchCV(model_RR, tuned_parameters, scoring='neg_log_loss', cv=10)
RR.fit(X_train, y_train)
print("Best: %f using %s" % (-RR.best_score_ , RR.best_params_))
The random forest does much better than a single tree.
# grid_scores_ comes from the older grid_search-style API;
# on recent scikit-learn versions read RR.cv_results_ instead
RR.grid_scores_
out:
[mean: -1.35386, std: 0.19609, params: {'n_estimators': 10, 'max_depth': 10},
 mean: -1.22500, std: 0.07677, params: {'n_estimators': 12, 'max_depth': 10},
 mean: -1.23760, std: 0.08164, params: {'n_estimators': 14, 'max_depth': 10},
 mean: -1.23707, std: 0.05659, params: {'n_estimators': 16, 'max_depth': 10},
 mean: -1.23354, std: 0.07110, params: {'n_estimators': 18, 'max_depth': 10},
 mean: -1.39196, std: 0.26465, params: {'n_estimators': 10, 'max_depth': 12},
 mean: -1.21759, std: 0.09212, params: {'n_estimators': 12, 'max_depth': 12},
 mean: -1.20984, std: 0.17629, params: {'n_estimators': 14, 'max_depth': 12},
 mean: -1.28113, std: 0.20810, params: {'n_estimators': 16, 'max_depth': 12},
 mean: -1.16538, std: 0.08866, params: {'n_estimators': 18, 'max_depth': 12},
 mean: -1.45142, std: 0.31450, params: {'n_estimators': 10, 'max_depth': 14},
 mean: -1.40571, std: 0.41349, params: {'n_estimators': 12, 'max_depth': 14},
 mean: -1.37168, std: 0.45835, params: {'n_estimators': 14, 'max_depth': 14},
 mean: -1.24758, std: 0.18312, params: {'n_estimators': 16, 'max_depth': 14},
 mean: -1.21770, std: 0.20019, params: {'n_estimators': 18, 'max_depth': 14},
 mean: -2.00128, std: 0.33793, params: {'n_estimators': 10, 'max_depth': 16},
 mean: -1.55623, std: 0.41455, params: {'n_estimators': 12, 'max_depth': 16},
 mean: -1.42701, std: 0.33239, params: {'n_estimators': 14, 'max_depth': 16},
 mean: -1.20207, std: 0.21551, params: {'n_estimators': 16, 'max_depth': 16},
 mean: -1.20576, std: 0.29579, params: {'n_estimators': 18, 'max_depth': 16},
 mean: -2.02903, std: 0.49022, params: {'n_estimators': 10, 'max_depth': 18},
 mean: -1.63538, std: 0.33508, params: {'n_estimators': 12, 'max_depth': 18},
 mean: -1.66119, std: 0.51923, params: {'n_estimators': 14, 'max_depth': 18},
 mean: -1.43268, std: 0.18889, params: {'n_estimators': 16, 'max_depth': 18},
 mean: -1.43009, std: 0.37455, params: {'n_estimators': 18, 'max_depth': 18},
 mean: -2.51668, std: 0.75125, params: {'n_estimators': 10, 'max_depth': 20},
 mean: -2.20780, std: 0.75028, params: {'n_estimators': 12, 'max_depth': 20},
 mean: -1.59372, std: 0.59536, params: {'n_estimators': 14, 'max_depth': 20},
 mean: -1.56941, std: 0.45555, params: {'n_estimators': 16, 'max_depth': 20},
 mean: -1.32084, std: 0.36734, params: {'n_estimators': 18, 'max_depth': 20},
 mean: -2.62570, std: 1.06356, params: {'n_estimators': 10, 'max_depth': 22},
 mean: -2.11879, std: 0.42273, params: {'n_estimators': 12, 'max_depth': 22},
 mean: -1.64756, std: 0.52801, params: {'n_estimators': 14, 'max_depth': 22},
 mean: -1.46009, std: 0.42005, params: {'n_estimators': 16, 'max_depth': 22},
 mean: -1.53584, std: 0.43831, params: {'n_estimators': 18, 'max_depth': 22},
 mean: -2.51453, std: 0.89887, params: {'n_estimators': 10, 'max_depth': 24},
 mean: -1.99431, std: 0.74647, params: {'n_estimators': 12, 'max_depth': 24},
 mean: -1.44960, std: 0.43270, params: {'n_estimators': 14, 'max_depth': 24},
 mean: -1.89192, std: 0.47620, params: {'n_estimators': 16, 'max_depth': 24},
 mean: -1.59524, std: 0.41647, params: {'n_estimators': 18, 'max_depth': 24},
 mean: -2.62363, std: 0.94986, params: {'n_estimators': 10, 'max_depth': 26},
 mean: -2.37050, std: 0.68064, params: {'n_estimators': 12, 'max_depth': 26},
 mean: -2.08027, std: 0.61122, params: {'n_estimators': 14, 'max_depth': 26},
 mean: -1.89457, std: 0.45814, params: {'n_estimators': 16, 'max_depth': 26},
 mean: -1.76486, std: 0.78120, params: {'n_estimators': 18, 'max_depth': 26},
 mean: -3.14444, std: 1.06844, params: {'n_estimators': 10, 'max_depth': 28},
 mean: -2.47574, std: 0.90047, params: {'n_estimators': 12, 'max_depth': 28},
 mean: -1.91078, std: 0.39219, params: {'n_estimators': 14, 'max_depth': 28},
 mean: -1.81819, std: 0.48838, params: {'n_estimators': 16, 'max_depth': 28},
 mean: -1.85914, std: 0.70752, params: {'n_estimators': 18, 'max_depth': 28}]
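Finally, a sketch of the Random Forest feature-selection step promised earlier ("the Random Forest part can follow the same pattern"); it reuses the objects defined above and is an illustrative addition, not a recorded run. Expect a much longer run time than with the single tree.

# Sketch: SelectFromModel with the tuned forest, then CV on the reduced features
best_rf = RR.best_estimator_
rf_selector = SelectFromModel(best_rf, prefit=True, threshold=0.01)
X_train_rf = rf_selector.transform(X_train)
print(X_train_rf.shape)

model_tuned_rf = RandomForestClassifier(max_depth=RR.best_params_['max_depth'],
                                        n_estimators=RR.best_params_['n_estimators'])
loss = cross_val_score(model_tuned_rf, X_train_rf, y_train, cv=10, scoring='neg_log_loss')
print('cv logloss with selected features: {}'.format(-loss.mean()))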