# python 2.7
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
This dataset poses a two-class classification problem (poisonous vs. edible).
dpath = './data/'
data = pd.read_csv(dpath + "mushrooms.csv")
data.head()
# Check for null / missing values
data.isnull().sum()
data['Poisonous'].unique()
# Basic information about the dataset
data.info()
data.shape
All the features are categorical, but many models require numeric input (logistic regression, xgboost, ...).
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Encode every categorical column as integer codes
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
data.head()
LabelEncoder is not really appropriate here: it imposes an ordering on the codes, while features such as color have no ordinal relationship. Tree-based models are insensitive to this, but logistic regression is not; we will try OneHotEncoder later.
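As a minimal sketch of the one-hot alternative mentioned above (illustrative only, not part of the pipeline below; pd.get_dummies is used for brevity instead of OneHotEncoder):

# Sketch only: one-hot encode the raw categorical frame with pandas;
# get_dummies avoids LabelEncoder's artificial ordering
data_raw = pd.read_csv(dpath + "mushrooms.csv")
X_onehot = pd.get_dummies(data_raw.drop('Poisonous', axis=1))
X_onehot.shape  # many more (binary) columns than the label-encoded version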
y = data['Poisonous']
X = data.drop('Poisonous', axis=1)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=26, test_size=0.2)
# Save the feature names before scaling turns X into a plain numpy array
columns = X_train.columns
# Standardize the features.
# Standardization is unnecessary for decision trees; it is done here for
# logistic regression, which serves as a baseline for comparison with the trees.
from sklearn.preprocessing import StandardScaler

ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
# Inspect the coefficients; since the features are standardized, the absolute
# value of a coefficient can be read as a rough importance of that feature
fs = pd.DataFrame({"columns": list(columns), "coef": list(abs(model_LR.coef_[0]))})
fs.sort_values(by=['coef'], ascending=False)
# Predict with the model
y_prob = model_LR.predict_proba(X_test)[:, 1]  # probability of class p; probability of e is 1 - p
y_pred = np.where(y_prob > 0.5, 1, 0)          # predict p when the probability exceeds 0.5, otherwise e
# accuracy (note: score must be given the true labels, not the predictions)
print("The accuracy of default Logistic Regression is %.4f" % model_LR.score(X_test, y_test))
print("The AUC of default Logistic Regression is %.4f" % roc_auc_score(y_test, y_pred))
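A side note not in the original: roc_auc_score on hard 0/1 predictions measures only a single operating point; feeding it the predicted probabilities gives the usual ranking-based AUC:

# AUC computed from probabilities rather than thresholded predictions
print("AUC from probabilities: %.4f" % roc_auc_score(y_test, y_prob))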
The hyperparameters of logistic regression to tune are C (the inverse regularization strength, usually searched on a uniform grid in log space) and the penalty function (L2/L1). The objective is J(theta) = sum_i logloss(f(x_i; theta), y_i) + (1/C) * penalty(theta), so a smaller C means stronger regularization. Within the sklearn framework, the tuning steps are the same for every learner: 1. define the candidate parameter grid; 2. construct a GridSearchCV; 3. call fit.
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression()
# Define the parameter search range (the grid); note that 'l1' requires a
# solver that supports it (e.g. liblinear, the default in older sklearn)
tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                    'penalty': ['l1', 'l2']}
# fit is slow here: it trains (number of parameter combinations) x (CV folds) models
LR = GridSearchCV(LR_model, tuned_parameters, cv=10)
LR.fit(X_train, y_train)
print(LR.best_params_)
y_prob = LR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
LR.score(X_test, y_test)
print("The AUC of GridSearchCV Logistic Regression is %.4f" % roc_auc_score(y_test, y_pred))
Slightly better than the default logistic regression.
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_prob = model_tree.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
model_tree.score(X_test, y_test)
print("The AUC of default Decision Tree is %.4f" % roc_auc_score(y_test, y_pred))
# Inspect the feature importances of the tree; larger values mean more important features
df = pd.DataFrame({"columns": list(columns), "importance": list(model_tree.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
The important features seem to differ from those selected by logistic regression; perhaps there are strong correlations among the features.
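A quick, hypothetical way to probe that conjecture (not in the original notebook; Pearson correlation on label-encoded integer codes is only a rough signal for categorical data):

# Rough check: pairwise correlation of the label-encoded features
corr = X.corr().abs()
# Keep only the upper triangle to avoid duplicate pairs, then list the strongest pairs
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
print(upper.stack().sort_values(ascending=False).head(10))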
Let us tune the hyperparameters of the decision tree model. The hyperparameters of a decision tree are max_depth (depth of the tree) or max_leaf_nodes (number of leaf nodes), max_features (maximum number of features considered per split), min_samples_leaf (minimum number of samples in a leaf), and min_samples_split (minimum number of samples in an internal node); min_weight_fraction_leaf (minimum fraction of the total sample weight in a leaf) and min_impurity_split (minimum impurity required to split) can also be tuned.
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()
max_depth = range(1, 10, 1)         # maximum depth of the tree
min_samples_leaf = range(1, 10, 2)  # minimum number of samples in a leaf
tuned_parameters = dict(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
from sklearn.model_selection import GridSearchCV

DD = GridSearchCV(model_DD, tuned_parameters, cv=10)
DD.fit(X_train, y_train)
print("Best : %f using %s" % (DD.best_score_ , DD.best_params_))
y_prob = DD.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
DD.score(X_test, y_test)
print("The AUC of GridSearchCV Decision Tree is %.4f" % roc_auc_score(y_test, y_pred))
# grid_scores_ is deprecated; cv_results_ holds the full search results
DD.cv_results_
test_means = DD.cv_results_['mean_test_score']

# Plot the CV accuracy over the grid: one curve per max_depth value
test_scores = np.array(test_means).reshape(len(max_depth), len(min_samples_leaf))

for i, value in enumerate(max_depth):
    plt.plot(min_samples_leaf, test_scores[i], label='max_depth: ' + str(value))
plt.legend()
plt.xlabel('min_samples_leaf')
plt.ylabel('accuracy')
plt.show()
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()
model_RR.fit(X_train, y_train)
y_prob = model_RR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
model_RR.score(X_test, y_test)
print("The AUC of default Random Forest is %.4f" % roc_auc_score(y_test, y_pred))
Besides the hyperparameters it shares with decision trees, the random forest has n_estimators (the number of weak learners) to tune; here we search over 1) n_estimators and 2) min_samples_leaf.
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()
tuned_parameters = {'min_samples_leaf': range(1, 10, 2),
                    'n_estimators': range(1, 10, 2)}
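The grid above caps n_estimators at 9 to keep the search fast; in practice random forests usually improve (then plateau) with many more trees. A hypothetical wider grid, my suggestion rather than the original's, might look like:

# Hypothetical wider grid: more trees, at the cost of a slower grid search
tuned_parameters = {'min_samples_leaf': range(1, 10, 2),
                    'n_estimators': [10, 50, 100, 200]}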
from sklearn.model_selection import GridSearchCV

RR = GridSearchCV(model_RR, tuned_parameters, cv=10)
RR.fit(X_train, y_train)
# grid_scores_ is deprecated; use cv_results_ instead
print(RR.cv_results_)
print(RR.best_score_)
print(RR.best_params_)
y_prob = RR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
RR.score(X_test, y_test)
auc_roc = roc_auc_score(y_test, y_pred)
print("The AUC of GridSearchCV Random Forest is %.4f" % auc_roc)
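As a closing sketch (not in the original), the three tuned searches can be compared on the test set in one pass, assuming LR, DD, and RR from the cells above are still in scope:

# Compare the tuned models by test-set AUC, using predicted probabilities
for name, search in [('Logistic Regression', LR),
                     ('Decision Tree', DD),
                     ('Random Forest', RR)]:
    prob = search.predict_proba(X_test)[:, 1]
    print("%s: test AUC = %.4f" % (name, roc_auc_score(y_test, prob)))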