# python 2.7
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
This dataset poses a two-class classification problem (poisonous vs. edible).
dpath = './data/'
data = pd.read_csv(dpath + "mushrooms.csv")
data.head()
# Check for null / missing values
data.isnull().sum()
data['Poisonous'].unique()
# Basic information about the dataset
data.info()
data.shape
All the features are categorical, but many models require numeric input (logistic regression, xgboost, ...).
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Encode every categorical column as integer codes
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])
data.head()
LabelEncoder is not really appropriate here: it imposes an ordering on the codes, while features such as color have no ordinal relationship. Tree-based models are insensitive to this, but logistic regression is not; we will try OneHotEncoder later.
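As a minimal sketch of the one-hot alternative mentioned above (illustrative only, not part of the pipeline below; pd.get_dummies is used for brevity instead of OneHotEncoder):

# Sketch only: one-hot encode the raw categorical frame with pandas;
# get_dummies avoids LabelEncoder's artificial ordering
data_raw = pd.read_csv(dpath + "mushrooms.csv")
X_onehot = pd.get_dummies(data_raw.drop('Poisonous', axis=1))
X_onehot.shape  # many more (binary) columns than the label-encoded version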
y = data['Poisonous']
X = data.drop('Poisonous', axis=1)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=26, test_size=0.2)
# Save the feature names before scaling turns X into a plain numpy array
columns = X_train.columns
# Standardize the features.
# Standardization is unnecessary for decision trees; it is done here for
# logistic regression, which serves as a baseline for comparison with the trees.
from sklearn.preprocessing import StandardScaler

ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
# Inspect the coefficients; since the features are standardized, the absolute
# value of a coefficient can be read as a rough importance of that feature
fs = pd.DataFrame({"columns": list(columns), "coef": list(abs(model_LR.coef_[0]))})
fs.sort_values(by=['coef'], ascending=False)
# Predict with the model
y_prob = model_LR.predict_proba(X_test)[:, 1]  # probability of class p; probability of e is 1 - p
y_pred = np.where(y_prob > 0.5, 1, 0)          # predict p when the probability exceeds 0.5, otherwise e
# accuracy (note: score must be given the true labels, not the predictions)
print("The accuracy of default Logistic Regression is %.4f" % model_LR.score(X_test, y_test))
print("The AUC of default Logistic Regression is %.4f" % roc_auc_score(y_test, y_pred))
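A side note not in the original: roc_auc_score on hard 0/1 predictions measures only a single operating point; feeding it the predicted probabilities gives the usual ranking-based AUC:

# AUC computed from probabilities rather than thresholded predictions
print("AUC from probabilities: %.4f" % roc_auc_score(y_test, y_prob))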
The hyperparameters of logistic regression to tune are C (the inverse regularization strength, usually searched on a uniform grid in log space) and the penalty function (L2/L1). The objective is J(theta) = sum_i logloss(f(x_i; theta), y_i) + (1/C) * penalty(theta), so a smaller C means stronger regularization. Within the sklearn framework, the tuning steps are the same for every learner: 1. define the candidate parameter grid; 2. construct a GridSearchCV; 3. call fit.
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression()
# Define the parameter search range (the grid); note that 'l1' requires a
# solver that supports it (e.g. liblinear, the default in older sklearn)
tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                    'penalty': ['l1', 'l2']}
# fit is slow here: it trains (number of parameter combinations) x (CV folds) models
LR = GridSearchCV(LR_model, tuned_parameters, cv=10)
LR.fit(X_train, y_train)
print(LR.best_params_)
y_prob = LR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
LR.score(X_test, y_test)
print("The AUC of GridSearchCV Logistic Regression is %.4f" % roc_auc_score(y_test, y_pred))
Slightly better than the default logistic regression.
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_prob = model_tree.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
model_tree.score(X_test, y_test)
print("The AUC of default Decision Tree is %.4f" % roc_auc_score(y_test, y_pred))
# Inspect the feature importances of the tree; larger values mean more important features
df = pd.DataFrame({"columns": list(columns), "importance": list(model_tree.feature_importances_)})
df.sort_values(by=['importance'], ascending=False)
The important features seem to differ from those selected by logistic regression; perhaps there are strong correlations among the features.
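A quick, hypothetical way to probe that conjecture (not in the original notebook; Pearson correlation on label-encoded integer codes is only a rough signal for categorical data):

# Rough check: pairwise correlation of the label-encoded features
corr = X.corr().abs()
# Keep only the upper triangle to avoid duplicate pairs, then list the strongest pairs
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
print(upper.stack().sort_values(ascending=False).head(10))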
Let us tune the hyperparameters of the decision tree model. The hyperparameters of a decision tree are max_depth (depth of the tree) or max_leaf_nodes (number of leaf nodes), max_features (maximum number of features considered per split), min_samples_leaf (minimum number of samples in a leaf), and min_samples_split (minimum number of samples in an internal node); min_weight_fraction_leaf (minimum fraction of the total sample weight in a leaf) and min_impurity_split (minimum impurity required to split) can also be tuned.
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()
max_depth = range(1, 10, 1)         # maximum depth of the tree
min_samples_leaf = range(1, 10, 2)  # minimum number of samples in a leaf
tuned_parameters = dict(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
from sklearn.model_selection import GridSearchCV

DD = GridSearchCV(model_DD, tuned_parameters, cv=10)
DD.fit(X_train, y_train)
print("Best : %f using %s" % (DD.best_score_ , DD.best_params_))
y_prob = DD.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
DD.score(X_test, y_test)
print("The AUC of GridSearchCV Decision Tree is %.4f" % roc_auc_score(y_test, y_pred))
# grid_scores_ is deprecated; cv_results_ holds the full search results
DD.cv_results_
test_means = DD.cv_results_['mean_test_score']

# Plot the CV accuracy over the grid: one curve per max_depth value
test_scores = np.array(test_means).reshape(len(max_depth), len(min_samples_leaf))

for i, value in enumerate(max_depth):
    plt.plot(min_samples_leaf, test_scores[i], label='max_depth: ' + str(value))
plt.legend()
plt.xlabel('min_samples_leaf')
plt.ylabel('accuracy')
plt.show()
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()
model_RR.fit(X_train, y_train)
y_prob = model_RR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
model_RR.score(X_test, y_test)
print("The AUC of default Random Forest is %.4f" % roc_auc_score(y_test, y_pred))
Besides the hyperparameters it shares with decision trees, the random forest has n_estimators (the number of weak learners) to tune; here we search over 1) n_estimators and 2) min_samples_leaf.
from sklearn.ensemble import RandomForestClassifier

model_RR = RandomForestClassifier()
tuned_parameters = {'min_samples_leaf': range(1, 10, 2),
                    'n_estimators': range(1, 10, 2)}
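The grid above caps n_estimators at 9 to keep the search fast; in practice random forests usually improve (then plateau) with many more trees. A hypothetical wider grid, my suggestion rather than the original's, might look like:

# Hypothetical wider grid: more trees, at the cost of a slower grid search
tuned_parameters = {'min_samples_leaf': range(1, 10, 2),
                    'n_estimators': [10, 50, 100, 200]}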
from sklearn.model_selection import GridSearchCV

RR = GridSearchCV(model_RR, tuned_parameters, cv=10)
RR.fit(X_train, y_train)
# grid_scores_ is deprecated; use cv_results_ instead
print(RR.cv_results_)
print(RR.best_score_)
print(RR.best_params_)
y_prob = RR.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)
RR.score(X_test, y_test)
auc_roc = roc_auc_score(y_test, y_pred)
print("The AUC of GridSearchCV Random Forest is %.4f" % auc_roc)
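As a closing sketch (not in the original), the three tuned searches can be compared on the test set in one pass, assuming LR, DD, and RR from the cells above are still in scope:

# Compare the tuned models by test-set AUC, using predicted probabilities
for name, search in [('Logistic Regression', LR),
                     ('Decision Tree', DD),
                     ('Random Forest', RR)]:
    prob = search.predict_proba(X_test)[:, 1]
    print("%s: test AUC = %.4f" % (name, roc_auc_score(y_test, prob)))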