# python 2.7 python 2.7 python 2.7 import pandas as pd import numpy as np from matplotlib import pyplot as plt from sklearn.model_selection import GridSearchCV from sklearn.metrics import accuracy_score from sklearn.metrics import roc_auc_score
蘑菇数据集,22维特征。该数据集是一个二分类问题(poisonous,edible)。
# Load the mushroom dataset from the local data directory.
dpath = './data/'
csv_path = dpath + "mushrooms.csv"
data = pd.read_csv(csv_path)
data.head()

# Check whether the data contains null / missing values.
data.isnull().sum()

# Distinct values found in the 'Poisonous' column.
data['Poisonous'].unique()

# Basic information about the data.
data.info()

# Size of the data.
data.shape
特征全是类别型变量,而很多模型需要数值型的输入(Logistic回归、xgboost...),因此需要先对特征进行编码。
from sklearn.preprocessing import LabelEncoder

# Replace every column of `data` (the 'Poisonous' column included) with the
# codes produced by LabelEncoder. A single encoder object is reused and
# re-fitted on each column in turn via fit_transform.
labelencoder = LabelEncoder()
for col in data.columns:
    encoded_column = labelencoder.fit_transform(data[col])
    data[col] = encoded_column

data.head()
LabelEncoder其实并不合适,因为它产生的整数编码隐含了顺序关系,而颜色等特征的取值之间并没有顺序关系;决策树等模型对此不敏感,但Logistic回归会受到影响,后面我们再试试OneHotEncoder。
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split off the target; accessing it by column name is more readable.
y = data['Poisonous']
X = data.drop('Poisonous', axis=1)

# Hold out 20% of the rows as a test set, with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=26)

# Remember the feature names before X_train is overwritten by its scaled
# version below.
columns = X_train.columns

# Standardization is not needed for decision trees; it is done here for
# logistic regression, which serves as a comparison against the tree model.
ss_X = StandardScaler()
ss_y = StandardScaler()  # created alongside ss_X but not used in this block

# Fit the feature scaler on the training split, then apply the same
# transformation to the test split.
X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
from xgboost import XGBClassifier

# Train an XGBoost classifier, constructed with no arguments, on the
# training split.
model_XGB = XGBClassifier()
model_XGB.fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class score
# (presumably the class encoded as 1 -- confirm against model_XGB.classes_).
# Predict 1 when that score is above 0.5, otherwise 0.
y_prob = model_XGB.predict_proba(X_test)[:, 1]
y_pred = np.where(y_prob > 0.5, 1, 0)

# Accuracy has to be measured against the true labels. Passing y_pred here
# would compare the model's predictions with labels derived from those same
# predictions, which says nothing about generalization.
model_XGB.score(X_test, y_test)

# ROC AUC is computed from the continuous scores, not the thresholded 0/1
# labels; hard labels reduce the ROC curve to a single operating point.
auc_roc = roc_auc_score(y_test, y_prob)
auc_roc
在XGBoost中,特征重要性在训练后已经自动算好,存放在模型的 feature_importances_ 属性中。
from matplotlib import pyplot

# Print the importance scores in feature order.
print(model_XGB.feature_importances_)

# Plot the same scores as a bar chart, one bar per feature index.
importances = model_XGB.feature_importances_
bar_positions = range(len(importances))
pyplot.bar(bar_positions, importances)
pyplot.show()
上图是按特征顺序绘制的;还可以使用XGBoost内置的函数 plot_importance,按特征重要性排序后绘制。
# Plot feature importance using XGBoost's built-in function.
# `pyplot` is the name imported from matplotlib earlier in the file.
from xgboost import plot_importance

plot_importance(model_XGB)
pyplot.show()
可以根据特征重要性进行特征选择
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Use each feature-importance value, in ascending order, as a selection
# threshold and measure test accuracy with the features that survive it.
thresholds = sort(model_XGB.feature_importances_)
# print() with a single argument behaves the same on Python 2 and 3; the
# original `print thresholds` statement is a syntax error on Python 3.
print(thresholds)

for thresh in thresholds:
    # Select features with the already-fitted model (prefit=True) and the
    # current threshold.
    selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)

    # Train a fresh classifier on the reduced feature set.
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)

    # Evaluate on the test split reduced to the same features. predict()
    # already returns class labels, so no rounding step is needed.
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f , n=%d , Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))