上一次修改时间：2018-06-26 20:33:42

默认参数的XGBoost-蘑菇分类-代码

导入工具包

# python 2.7    python 2.7    python 2.7  
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

读取文件

蘑菇数据集，共22维特征。该数据集是一个两类分类问题（poisonous，edible）

# Directory that holds the input data, and the mushroom CSV inside it.
dpath = './data/'
mushrooms_csv = dpath + "mushrooms.csv"

# Load the mushroom data set and preview the first rows.
data = pd.read_csv(mushrooms_csv)
data.head()

# Count the null / missing values in every column.
data.isna().sum()

查看是否为一个两类分类问题（poisonous，edible）

# List the distinct values of the target column to confirm how many classes exist.
pd.unique(data['Poisonous'])

# Basic information about the data (per-column summary printed by pandas).
data.info()

# Size of the data set as (rows, columns).
data.shape

特征编码

特征全是类别型变量，很多模型需要数值型的输入（Logistic回归、xgboost...）

from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted on each column in turn, replacing the
# categorical values of that column (target included) with integer codes.
labelencoder = LabelEncoder()
for col in data.columns:
    encoded_col = labelencoder.fit_transform(data[col])
    data[col] = encoded_col

# Preview the encoded frame.
data.head()

LabelEncoder其实并不合适，因为它产生的整数编码隐含了顺序关系，而颜色等特征并没有顺序关系；决策树等模型对此不敏感，但Logistic回归不行，后面我们再试试OneHotEncoder

# Separate the target column from the features; the column is addressed by
# name because that reads more clearly than a positional index.
X = data.drop('Poisonous', axis=1)
y = data['Poisonous']

分出一部分数据来做测试

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=26)
X_train, X_test, y_train, y_test = split_parts

# Remember the feature names of the training frame.
# NOTE(review): presumably kept because later steps replace X_train with a
# plain array that has no column labels -- confirm against later usage.
columns = X_train.columns

# Feature standardization.
# Tree-based models do not need it; it is done here for Logistic regression,
# which is used as a comparison against the tree models.
from sklearn.preprocessing import StandardScaler

# One scaler for the features and one for the target. Only the feature
# scaler is applied in this block; ss_y is created but not used here.
ss_X = StandardScaler()
ss_y = StandardScaler()

# Learn the scaling statistics on the training features only, then apply
# the same transformation to both the training and the test features.
ss_X.fit(X_train)
X_train = ss_X.transform(X_train)
X_test = ss_X.transform(X_test)

Default XGBoost

from xgboost import XGBClassifier

# XGBoost classifier with all hyper-parameters left at their defaults.
model_XGB = XGBClassifier()

# Train on the (standardized) training features and encoded labels.
model_XGB.fit(X=X_train, y=y_train)

# Predicted probability of class 1 for every test sample.
y_prob = model_XGB.predict_proba(X_test)[:, 1]
# Hard labels: 1 when the probability exceeds 0.5, otherwise 0.
y_pred = np.where(y_prob > 0.5, 1, 0)
# Accuracy on the held-out set. The score must be computed against the true
# labels y_test; scoring against y_pred compares the model with its own
# predictions and is trivially 1.0.
model_XGB.score(X_test, y_test)

# ROC AUC measures how well the scores rank positives above negatives, so it
# is computed from the predicted probabilities (y_prob). Thresholded 0/1
# labels would reduce the ROC curve to a single operating point.
auc_roc = roc_auc_score(y_test, y_prob)
auc_roc

特征重要性

在XGBoost中特征重要性已经自动算好，存放在 `feature_importances_` 属性中

# Importance score of each feature, in the original feature order.
importances = model_XGB.feature_importances_
print(importances)

# Bar chart: one bar per feature, positioned by feature index.
from matplotlib import pyplot
feature_positions = range(len(importances))
pyplot.bar(feature_positions, importances)
pyplot.show()

上述柱状图是按特征顺序绘制的，还可以使用XGBoost内嵌的函数，按特征重要性排序后绘制

# Plot feature importance with XGBoost's built-in helper.
from xgboost import plot_importance

plot_importance(booster=model_XGB)
pyplot.show()

可以根据特征重要性进行特征选择

from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Use every feature importance, in ascending order, as a selection threshold.
thresholds = sort(model_XGB.feature_importances_)
# print() with parentheses works on both Python 2 and Python 3; the bare
# `print thresholds` statement is a SyntaxError on Python 3.
print(thresholds)
for thresh in thresholds:
    # Select features using the already-fitted model (prefit=True): keep the
    # features whose importance reaches the current threshold.
    selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # Train a fresh default classifier on the reduced feature set.
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # Evaluate on the test set reduced to the same features. predict()
    # already returns class labels, so no rounding is needed.
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f , n=%d , Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy * 100.0))