上一次修改时间：2018-06-26 20:29:32

Logistic回归-代码

数据说明：本文使用Kaggle于2015年举办的Otto Group Product Classification Challenge竞赛数据。Otto数据集由著名电商Otto提供，是一个多类商品分类问题，类别数=9，每个样本有93维数值特征(整数，表示某种事件发生的次数，已进行过脱敏处理)。参数调优说明：下文分别使用缺省参数的LogisticRegression、LogisticRegression + GridSearchCV以及LogisticRegressionCV进行建模与参数调优；实际应用中，LogisticRegression + GridSearchCV与LogisticRegressionCV任选其一即可。

# python 2.7    python 2.7    python 2.7  
#import必要的模块
import pandas as pd
import numpy as np

# GridSearchCV用来做模型选择，GridSearchCV是用于交叉验证的类
from sklearn.model_selection import GridSearchCV

#竞赛的评价指标为logloss
from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns
%matplotlib inline

读取数据 & 数据探索

# Load the Otto training data from the local data directory
dpath = './logistic/'
csv_path = dpath + "Otto_train_test.csv"
train = pd.read_csv(csv_path)

# Peek at the first rows, then print column dtypes and non-null counts
train.head()

train.info()

......

# Summary statistics of every column
train.describe()

......

# Distribution of the target: check whether the classes are balanced.
# When class sizes are imbalanced, cross-validation for a classification task
# should use StratifiedKFold, i.e. each fold is sampled proportionally per
# class; the cross-validation helpers use StratifiedKFold by default for
# classifiers.
# The series is passed as the keyword `x` rather than positionally: recent
# seaborn releases treat the first positional argument as `data`, while the
# keyword form behaves the same on old and new versions.
sns.countplot(x=train.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')

特征编码

# Turn the class labels into integers.
# Labels have the form "Class_x"; strip the 6-character prefix and shift the
# number so that classes are numbered from 0.
y_train = train['target'].map(lambda label: int(label[6:]) - 1)

# Everything except the id and the label is a feature
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)

数据预处理

# Feature standardisation
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then scale them with it.
ss_X = StandardScaler().fit(X_train)
X_train = ss_X.transform(X_train)

# Cross-validation is used throughout, so no separate test split exists here.
# If a held-out test set is added, scale it with the already-fitted scaler:
#X_test = ss_X.transform(X_test)

模型训练

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

# Cross-validation is used both to estimate model performance and to tune
# hyper-parameters (model selection).
# For classifiers the default splitter is StratifiedKFold.
# cross_val_score is imported from sklearn.model_selection, consistent with the
# GridSearchCV import used elsewhere in this file; the old
# sklearn.cross_validation module is deprecated and has been removed.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lr, X_train, y_train, cv=2, scoring='accuracy')

# print() with a single argument produces the same output on Python 2 and 3.
print('交叉验证中每折的准确度为:')
print(scores)
print('cv accuracy is: %s' % scores.mean())

正则化的 Logistic Regression及参数调优

Logistic回归需要调整的超参数有：C(正则系数的倒数，候选值一般在log域上均匀选取，如0.001、0.01、…、1000)和正则函数penalty(L2/L1)。sklearn中的目标函数为：$J = C \sum_i \mathrm{logloss}(f(x_i), y_i) + \mathrm{penalty}(w)$，因此C越大，正则越弱。

在sklearn框架下，不同学习器的参数调整步骤相同：设置候选参数集合 → 调用GridSearchCV → 调用fit。

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyper-parameters to tune: the penalty type and C.
# TODO: search L1 and L2 separately, each paired with a suitable solver.
penaltys = ['l1', 'l2']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penaltys, C=Cs)

# The grid contains the 'l1' penalty, so the solver is pinned to liblinear,
# which handles both 'l1' and 'l2'. It used to be the implicit default; naming
# it keeps the search working when the library default solver differs.
lr_penalty = LogisticRegression(solver='liblinear')

# GridSearchCV arguments: the estimator first, then the parameter grid.
# return_train_score=True is required because the plotting code further down
# reads 'mean_train_score' / 'std_train_score' from cv_results_.
grid = GridSearchCV(lr_penalty, tuned_parameters, cv=5,
                    return_train_score=True)
grid.fit(X_train, y_train)

# Show the cross-validation results (timings and per-split scores for every
# parameter combination)
grid.cv_results_

out[46]:

{'mean_fit_time': array([  2.56000519e-02,   6.05999947e-02,   3.22000504e-02,
          1.10799980e-01,   6.85999870e-02,   1.75799990e-01,
          2.18599987e-01,   3.23199940e-01,   2.49199996e+00,
          6.04399967e-01,   4.23484001e+01,   9.69399977e-01,
          6.73174000e+01,   1.87620001e+00]),
 'mean_score_time': array([ 0.00139995,  0.00120001,  0.00099993,  0.00100002,  0.00060005,
         0.00180001,  0.00060005,  0.00100007,  0.00119996,  0.00140009,
         0.00119996,  0.00100002,  0.00079999,  0.00699997]),
 'mean_test_score': array([ 0.09985735,  0.60912981,  0.2339515 ,  0.64051355,  0.62339515,
         0.66761769,  0.67332382,  0.65192582,  0.62482168,  0.63766049,
         0.57631954,  0.60057061,  0.54778887,  0.58059914]),
 'mean_train_score': array([ 0.0998579 ,  0.68866596,  0.23680409,  0.74715815,  0.69901865,
         0.81170009,  0.85983567,  0.86768396,  0.94116088,  0.91297004,
         0.98429903,  0.94972342,  0.98964857,  0.97539203]),
 'param_C': masked_array(data = [0.001 0.001 0.01 0.01 0.1 0.1 1 1 10 10 100 100 1000 1000],
              mask = [False False False False False False False False False False False False
  False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2'],
              mask = [False False False False False False False False False False False False
  False False],
        fill_value = ?),
 'params': ({'C': 0.001, 'penalty': 'l1'},
  {'C': 0.001, 'penalty': 'l2'},
  {'C': 0.01, 'penalty': 'l1'},
  {'C': 0.01, 'penalty': 'l2'},
  {'C': 0.1, 'penalty': 'l1'},
  {'C': 0.1, 'penalty': 'l2'},
  {'C': 1, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 10, 'penalty': 'l1'},
  {'C': 10, 'penalty': 'l2'},
  {'C': 100, 'penalty': 'l1'},
  {'C': 100, 'penalty': 'l2'},
  {'C': 1000, 'penalty': 'l1'},
  {'C': 1000, 'penalty': 'l2'}),
 'rank_test_score': array([14,  8, 13,  4,  7,  2,  1,  3,  6,  5, 11,  9, 12, 10]),
 'split0_test_score': array([ 0.09859155,  0.61971831,  0.21830986,  0.62676056,  0.6056338 ,
         0.65492958,  0.66901408,  0.63380282,  0.59859155,  0.59859155,
         0.58450704,  0.57746479,  0.55633803,  0.57042254]),
 'split0_train_score': array([ 0.10017889,  0.6940966 ,  0.23613596,  0.76386404,  0.70483005,
         0.81395349,  0.84973166,  0.86225403,  0.94454383,  0.90697674,
         0.98211091,  0.95348837,  0.98926655,  0.97674419]),
 'split1_test_score': array([ 0.09929078,  0.60992908,  0.23404255,  0.68085106,  0.63829787,
         0.66666667,  0.64539007,  0.65957447,  0.65248227,  0.65957447,
         0.57446809,  0.63120567,  0.5248227 ,  0.60283688]),
 'split1_train_score': array([ 0.1       ,  0.6875    ,  0.225     ,  0.74642857,  0.70178571,
         0.81428571,  0.8625    ,  0.87678571,  0.94285714,  0.91071429,
         0.97678571,  0.95      ,  0.98214286,  0.96785714]),
 'split2_test_score': array([ 0.1       ,  0.6       ,  0.21428571,  0.64285714,  0.63571429,
         0.65714286,  0.65714286,  0.63571429,  0.62142857,  0.67142857,
         0.58571429,  0.58571429,  0.57142857,  0.61428571]),
 'split2_train_score': array([ 0.09982175,  0.70053476,  0.24064171,  0.73975045,  0.70409982,
         0.80926916,  0.87344029,  0.86987522,  0.93582888,  0.90909091,
         0.99108734,  0.95187166,  0.99286988,  0.98217469]),
 'split3_test_score': array([ 0.1       ,  0.61428571,  0.25714286,  0.63571429,  0.61428571,
         0.67142857,  0.68571429,  0.67142857,  0.61428571,  0.62142857,
         0.58571429,  0.60714286,  0.55714286,  0.55714286]),
 'split3_train_score': array([ 0.09982175,  0.67736185,  0.2513369 ,  0.73796791,  0.70231729,
         0.80926916,  0.85204991,  0.85561497,  0.94474153,  0.91622103,
         0.98039216,  0.95187166,  0.98573975,  0.97682709]),
 'split4_test_score': array([ 0.10144928,  0.60144928,  0.24637681,  0.61594203,  0.62318841,
         0.6884058 ,  0.71014493,  0.65942029,  0.63768116,  0.63768116,
         0.55072464,  0.60144928,  0.52898551,  0.55797101]),
 'split4_train_score': array([ 0.09946714,  0.68383659,  0.23090586,  0.74777975,  0.68206039,
         0.81172291,  0.86145648,  0.87388988,  0.93783304,  0.92184725,
         0.99111901,  0.94138544,  0.9982238 ,  0.97335702]),
 'std_fit_time': array([  2.57683653e-03,   1.08922055e-02,   4.11827777e-03,
          8.56509030e-03,   4.31736540e-03,   1.75886413e-02,
          7.20000664e-03,   2.18577453e-02,   3.05847666e-01,
          3.42380435e-02,   1.12247750e+01,   8.47834657e-02,
          5.65952800e+00,   2.16019833e-01]),
 'std_score_time': array([  4.89920847e-04,   7.48379193e-04,   0.00000000e+00,
          6.32485093e-04,   4.89940316e-04,   4.00042545e-04,
          4.89940316e-04,   6.32409703e-04,   1.46967817e-03,
          4.89901382e-04,   7.48328219e-04,   1.16800773e-07,
          3.99994861e-04,   5.54976776e-03]),
 'std_test_score': array([ 0.00094665,  0.00751057,  0.01626677,  0.0221393 ,  0.01246855,
         0.01194485,  0.02259572,  0.01475829,  0.018716  ,  0.02619427,
         0.01335989,  0.01870232,  0.01787686,  0.02361698]),
 'std_train_score': array([ 0.00023602,  0.00803524,  0.00894988,  0.00915795,  0.00855232,
         0.00217175,  0.00845601,  0.00776275,  0.00365096,  0.00539313,
         0.00581541,  0.00431285,  0.00557982,  0.00470761])}

# Report the best cross-validated score together with the parameters that
# achieved it.
# If the best value sits on the edge of the candidate grid, extend the grid
# (larger or smaller values) until the turning point is found.
for best_item in (grid.best_score_, grid.best_params_):
    print(best_item)

# Persist every candidate and its scores to a CSV file
cv_table = pd.DataFrame(grid.cv_results_)
cv_table.to_csv('LogisticGridSearchCV_Otto.csv')

# Plot the cross-validation score curves.
# cv_results_ holds one entry per (C, penalty) pair with the penalty varying
# fastest, so each flat array is reshaped to (number of C, number of penalties).
number_C = len(Cs)
number_penaltys = len(penaltys)
grid_shape = (number_C, number_penaltys)

test_means = grid.cv_results_['mean_test_score']
train_means = grid.cv_results_['mean_train_score']

test_scores = np.array(test_means).reshape(grid_shape)
train_scores = np.array(train_means).reshape(grid_shape)
test_stds = np.array(grid.cv_results_['std_test_score']).reshape(grid_shape)
train_stds = np.array(grid.cv_results_['std_train_score']).reshape(grid_shape)

# C spans several orders of magnitude, so plot against log10(C)
x_axis = np.log10(Cs)

# One test curve and one train curve per penalty, with std as error bars
for col, penalty_name in enumerate(penaltys):
    pyplot.errorbar(x_axis, test_scores[:, col], yerr=test_stds[:, col],
                    label=penalty_name + ' Test')
    pyplot.errorbar(x_axis, train_scores[:, col], yerr=train_stds[:, col],
                    label=penalty_name + ' Train')

pyplot.legend()
pyplot.xlabel('log(C)')
pyplot.ylabel('accuracy')
pyplot.savefig('LogisticGridSearchCV_C.png')

pyplot.show()

上图给出了L1正则和L2正则下、不同正则参数C对应的模型在训练集和测试集(交叉验证的验证折)上的正确率(score)。可以看出在训练集上C越大(正则越少)的模型性能越好；但在测试集上，按上面输出的cv_results_，L1正则在C=1时性能最好，L2正则在C=0.1时性能最好，C继续增大后测试性能反而下降(过拟合)。

用LogisticRegressionCV实现正则化的 Logistic Regression

from sklearn.linear_model import LogisticRegressionCV

# Candidate values of C for the built-in cross-validated search
Cs = [1, 10, 100, 1000]

# With many samples (~70k per the original note) and 93 features, the saga
# solver (new in 0.19) is an option for the L1 penalty; liblinear with
# one-vs-rest is what is used here.
lr_cv = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    penalty='l1',
    solver='liblinear',
    multi_class='ovr',
).fit(X_train, y_train)

# Per-class cross-validation scores
lr_cv.scores_

{0: array([[ 0.90140845,  0.88028169,  0.85211268,  0.83802817],
        [ 0.89361702,  0.87943262,  0.85106383,  0.84397163],
        [ 0.89285714,  0.89285714,  0.86428571,  0.85714286],
        [ 0.90714286,  0.85714286,  0.81428571,  0.81428571],
        [ 0.91304348,  0.85507246,  0.81884058,  0.80434783]]),
 1: array([[ 0.8943662 ,  0.88732394,  0.88028169,  0.86619718],
        [ 0.92198582,  0.90780142,  0.85815603,  0.83687943],
        [ 0.90714286,  0.91428571,  0.9       ,  0.87142857],
        [ 0.87857143,  0.85      ,  0.85      ,  0.85714286],
        [ 0.89855072,  0.89855072,  0.85507246,  0.86956522]]),
 2: array([[ 0.88028169,  0.84507042,  0.86619718,  0.83802817],
        [ 0.87234043,  0.86524823,  0.83687943,  0.82269504],
        [ 0.9       ,  0.9       ,  0.85714286,  0.85      ],
        [ 0.90714286,  0.87857143,  0.82857143,  0.81428571],
        [ 0.9057971 ,  0.86956522,  0.87681159,  0.88405797]]),
 3: array([[ 0.93661972,  0.8943662 ,  0.88028169,  0.85211268],
        [ 0.92198582,  0.90780142,  0.90780142,  0.87943262],
        [ 0.91428571,  0.89285714,  0.9       ,  0.9       ],
        [ 0.91428571,  0.90714286,  0.87857143,  0.85714286],
        [ 0.95652174,  0.89855072,  0.85507246,  0.84782609]]),
 4: array([[ 0.98591549,  0.97887324,  0.98591549,  0.98591549],
        [ 1.        ,  1.        ,  1.        ,  0.9929078 ],
        [ 0.97857143,  0.98571429,  0.99285714,  0.99285714],
        [ 1.        ,  1.        ,  0.99285714,  0.97857143],
        [ 0.99275362,  0.99275362,  0.99275362,  0.99275362]]),
 5: array([[ 0.97887324,  0.94366197,  0.94366197,  0.93661972],
        [ 0.95744681,  0.96453901,  0.95035461,  0.93617021],
        [ 0.95714286,  0.90714286,  0.89285714,  0.87857143],
        [ 0.96428571,  0.95714286,  0.95      ,  0.94285714],
        [ 0.98550725,  0.96376812,  0.94927536,  0.94927536]]),
 6: array([[ 0.8943662 ,  0.85915493,  0.82394366,  0.80985915],
        [ 0.91489362,  0.87234043,  0.84397163,  0.85815603],
        [ 0.94285714,  0.91428571,  0.9       ,  0.9       ],
        [ 0.92857143,  0.93571429,  0.90714286,  0.9       ],
        [ 0.88405797,  0.86956522,  0.85507246,  0.84057971]]),
 7: array([[ 0.92253521,  0.91549296,  0.92253521,  0.9084507 ],
        [ 0.92907801,  0.91489362,  0.91489362,  0.90780142],
        [ 0.95714286,  0.94285714,  0.91428571,  0.91428571],
        [ 0.93571429,  0.92857143,  0.93571429,  0.92142857],
        [ 0.94927536,  0.91304348,  0.91304348,  0.9057971 ]]),
 8: array([[ 0.92957746,  0.87323944,  0.84507042,  0.83802817],
        [ 0.90780142,  0.88652482,  0.86524823,  0.85106383],
        [ 0.88571429,  0.83571429,  0.85      ,  0.82857143],
        [ 0.93571429,  0.9       ,  0.84285714,  0.84285714],
        [ 0.9057971 ,  0.84782609,  0.84782609,  0.84782609]])}