数据说明:数据来自Kaggle 2015年举办的Otto Group Product Classification Challenge竞赛。Otto数据集由著名电商Otto提供,是一个多类商品分类问题:类别数为9,每个样本有93维数值特征(整数,表示某种事件发生的次数,已进行过脱敏处理)。
参数调优说明:依次使用缺省参数的LogisticRegression、LogisticRegression + GridSearchCV以及LogisticRegressionCV进行建模和参数调优;实际应用中,LogisticRegression + GridSearchCV与LogisticRegressionCV任选其一即可。
# Written for Python 2.7.
# Import the required modules.
import pandas as pd
import numpy as np

# GridSearchCV performs model selection (hyper-parameter search) using
# cross-validation.
from sklearn.model_selection import GridSearchCV

# The competition's evaluation metric is log loss.
from sklearn.metrics import log_loss

from matplotlib import pyplot
import seaborn as sns

# IPython/Jupyter magic for inline figures. It is not valid syntax in a plain
# .py file, so it is kept as a comment; uncomment it when running in a notebook.
# %matplotlib inline
# Load the training data.
dpath = './logistic/'
train = pd.read_csv(dpath + "Otto_train_test.csv")
train.head()

# Show pandas' summary of the loaded training DataFrame.
train.info()
......
# Summary statistics of every column.
train.describe()
......
# Target distribution: check whether the classes are balanced.
# When they are not, cross-validation for a classification task should use
# StratifiedKFold, i.e. each fold is sampled in proportion to the class
# frequencies. The cross-validation helpers used in this file already default
# to StratifiedKFold for classifiers.
# The column is passed as ``x=`` because newer seaborn releases accept only
# ``data`` as a positional argument.
sns.countplot(x=train['target'])
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')
# Convert the class labels to integers. The labels are strings of the form
# "Class_x": drop the 6-character prefix and shift to a zero-based index.
y_train = train['target']
y_train = y_train.map(lambda s: int(s[6:]) - 1)

# Every column except the id and the label is a feature.
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)
# Standardise the features.
from sklearn.preprocessing import StandardScaler

# Feature scaler.
ss_X = StandardScaler()

# Cross-validation is used throughout, so no separate test split exists and
# the scaler is fitted on (and applied to) all of the training data.
# A held-out test set, if there were one, would be scaled with
# ss_X.transform(X_test).
X_train = ss_X.fit_transform(X_train)
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters.
lr = LogisticRegression()

# Cross-validation is used both to estimate model performance and to tune
# hyper-parameters (model selection). For classification it defaults to
# StratifiedKFold.
# cross_val_score is imported from sklearn.model_selection, like the rest of
# this file; the old sklearn.cross_validation module was deprecated in 0.18
# and later removed.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lr, X_train, y_train, cv=2, scoring='accuracy')

# Each print() receives a single, already formatted string so the output is
# the same under Python 2 and Python 3.
print('交叉验证中每折的准确度为:')
print(scores)
print('cv accuracy is: %s' % scores.mean())

logistic回归需要调整的超参数有:C(正则强度的倒数,C越小正则越强;候选值一般在log域上均匀选取,如0.001、0.01、…、1000)和正则函数penalty(L2/L1)。在sklearn中目标函数为:$J = C \sum_i \mathrm{logloss}(f(x_i), y_i) + \mathrm{penalty}(w)$
在sklearn框架下,不同学习器的参数调整步骤相同:设置候选参数集合 → 调用GridSearchCV → 调用fit。
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyper-parameters to tune.
# TODO: try tuning the L1 and L2 penalties separately, each paired with a
# solver that suits it.
penaltys = ['l1', 'l2']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penaltys, C=Cs)

# The grid contains both 'l1' and 'l2', so the solver is pinned to liblinear,
# which supports both penalties. Relying on the library default is fragile:
# newer scikit-learn releases default to lbfgs, which rejects the L1 penalty.
lr_penalty = LogisticRegression(solver='liblinear')

# GridSearchCV takes the estimator first and the parameter grid second.
# return_train_score=True keeps the '*_train_score' entries in cv_results_;
# the plotting code further down reads 'mean_train_score', and newer
# scikit-learn releases no longer compute train scores by default.
grid = GridSearchCV(lr_penalty, tuned_parameters, cv=5,
                    return_train_score=True)
grid.fit(X_train, y_train)

# Show the full cross-validation results.
grid.cv_results_
out[46]:
{'mean_fit_time': array([ 2.56000519e-02, 6.05999947e-02, 3.22000504e-02,
1.10799980e-01, 6.85999870e-02, 1.75799990e-01,
2.18599987e-01, 3.23199940e-01, 2.49199996e+00,
6.04399967e-01, 4.23484001e+01, 9.69399977e-01,
6.73174000e+01, 1.87620001e+00]),
'mean_score_time': array([ 0.00139995, 0.00120001, 0.00099993, 0.00100002, 0.00060005,
0.00180001, 0.00060005, 0.00100007, 0.00119996, 0.00140009,
0.00119996, 0.00100002, 0.00079999, 0.00699997]),
'mean_test_score': array([ 0.09985735, 0.60912981, 0.2339515 , 0.64051355, 0.62339515,
0.66761769, 0.67332382, 0.65192582, 0.62482168, 0.63766049,
0.57631954, 0.60057061, 0.54778887, 0.58059914]),
'mean_train_score': array([ 0.0998579 , 0.68866596, 0.23680409, 0.74715815, 0.69901865,
0.81170009, 0.85983567, 0.86768396, 0.94116088, 0.91297004,
0.98429903, 0.94972342, 0.98964857, 0.97539203]),
'param_C': masked_array(data = [0.001 0.001 0.01 0.01 0.1 0.1 1 1 10 10 100 100 1000 1000],
mask = [False False False False False False False False False False False False
False False],
fill_value = ?),
'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2'],
mask = [False False False False False False False False False False False False
False False],
fill_value = ?),
'params': ({'C': 0.001, 'penalty': 'l1'},
{'C': 0.001, 'penalty': 'l2'},
{'C': 0.01, 'penalty': 'l1'},
{'C': 0.01, 'penalty': 'l2'},
{'C': 0.1, 'penalty': 'l1'},
{'C': 0.1, 'penalty': 'l2'},
{'C': 1, 'penalty': 'l1'},
{'C': 1, 'penalty': 'l2'},
{'C': 10, 'penalty': 'l1'},
{'C': 10, 'penalty': 'l2'},
{'C': 100, 'penalty': 'l1'},
{'C': 100, 'penalty': 'l2'},
{'C': 1000, 'penalty': 'l1'},
{'C': 1000, 'penalty': 'l2'}),
'rank_test_score': array([14, 8, 13, 4, 7, 2, 1, 3, 6, 5, 11, 9, 12, 10]),
'split0_test_score': array([ 0.09859155, 0.61971831, 0.21830986, 0.62676056, 0.6056338 ,
0.65492958, 0.66901408, 0.63380282, 0.59859155, 0.59859155,
0.58450704, 0.57746479, 0.55633803, 0.57042254]),
'split0_train_score': array([ 0.10017889, 0.6940966 , 0.23613596, 0.76386404, 0.70483005,
0.81395349, 0.84973166, 0.86225403, 0.94454383, 0.90697674,
0.98211091, 0.95348837, 0.98926655, 0.97674419]),
'split1_test_score': array([ 0.09929078, 0.60992908, 0.23404255, 0.68085106, 0.63829787,
0.66666667, 0.64539007, 0.65957447, 0.65248227, 0.65957447,
0.57446809, 0.63120567, 0.5248227 , 0.60283688]),
'split1_train_score': array([ 0.1 , 0.6875 , 0.225 , 0.74642857, 0.70178571,
0.81428571, 0.8625 , 0.87678571, 0.94285714, 0.91071429,
0.97678571, 0.95 , 0.98214286, 0.96785714]),
'split2_test_score': array([ 0.1 , 0.6 , 0.21428571, 0.64285714, 0.63571429,
0.65714286, 0.65714286, 0.63571429, 0.62142857, 0.67142857,
0.58571429, 0.58571429, 0.57142857, 0.61428571]),
'split2_train_score': array([ 0.09982175, 0.70053476, 0.24064171, 0.73975045, 0.70409982,
0.80926916, 0.87344029, 0.86987522, 0.93582888, 0.90909091,
0.99108734, 0.95187166, 0.99286988, 0.98217469]),
'split3_test_score': array([ 0.1 , 0.61428571, 0.25714286, 0.63571429, 0.61428571,
0.67142857, 0.68571429, 0.67142857, 0.61428571, 0.62142857,
0.58571429, 0.60714286, 0.55714286, 0.55714286]),
'split3_train_score': array([ 0.09982175, 0.67736185, 0.2513369 , 0.73796791, 0.70231729,
0.80926916, 0.85204991, 0.85561497, 0.94474153, 0.91622103,
0.98039216, 0.95187166, 0.98573975, 0.97682709]),
'split4_test_score': array([ 0.10144928, 0.60144928, 0.24637681, 0.61594203, 0.62318841,
0.6884058 , 0.71014493, 0.65942029, 0.63768116, 0.63768116,
0.55072464, 0.60144928, 0.52898551, 0.55797101]),
'split4_train_score': array([ 0.09946714, 0.68383659, 0.23090586, 0.74777975, 0.68206039,
0.81172291, 0.86145648, 0.87388988, 0.93783304, 0.92184725,
0.99111901, 0.94138544, 0.9982238 , 0.97335702]),
'std_fit_time': array([ 2.57683653e-03, 1.08922055e-02, 4.11827777e-03,
8.56509030e-03, 4.31736540e-03, 1.75886413e-02,
7.20000664e-03, 2.18577453e-02, 3.05847666e-01,
3.42380435e-02, 1.12247750e+01, 8.47834657e-02,
5.65952800e+00, 2.16019833e-01]),
'std_score_time': array([ 4.89920847e-04, 7.48379193e-04, 0.00000000e+00,
6.32485093e-04, 4.89940316e-04, 4.00042545e-04,
4.89940316e-04, 6.32409703e-04, 1.46967817e-03,
4.89901382e-04, 7.48328219e-04, 1.16800773e-07,
3.99994861e-04, 5.54976776e-03]),
'std_test_score': array([ 0.00094665, 0.00751057, 0.01626677, 0.0221393 , 0.01246855,
0.01194485, 0.02259572, 0.01475829, 0.018716 , 0.02619427,
0.01335989, 0.01870232, 0.01787686, 0.02361698]),
'std_train_score': array([ 0.00023602, 0.00803524, 0.00894988, 0.00915795, 0.00855232,
0.00217175, 0.00845601, 0.00776275, 0.00365096, 0.00539313,
0.00581541, 0.00431285, 0.00557982, 0.00470761])}# 输出最佳结果和最佳参数 # 如果最佳值在候选参数的边缘,最好再尝试更大或更小的参数,直到找到拐点 print(grid.best_score_) print(grid.best_params_)

# Save the tuned parameters and their results to a file.
pd.DataFrame(grid.cv_results_).to_csv('LogisticGridSearchCV_Otto.csv')

# Plot the cross-validation score curves.
test_means = grid.cv_results_['mean_test_score']
test_stds = grid.cv_results_['std_test_score']
train_means = grid.cv_results_['mean_train_score']
train_stds = grid.cv_results_['std_train_score']

number_C = len(Cs)
number_penaltys = len(penaltys)

# cv_results_ lists the candidates with C varying slowest and penalty varying
# fastest (see the 'params' entry of the results), so after reshaping each row
# is one C and each column is one penalty.
grid_shape = (number_C, number_penaltys)
test_scores = np.array(test_means).reshape(grid_shape)
train_scores = np.array(train_means).reshape(grid_shape)
test_stds = np.array(test_stds).reshape(grid_shape)
train_stds = np.array(train_stds).reshape(grid_shape)

# One test curve and one train curve per penalty, with the standard deviation
# across folds as error bars, plotted against log10(C).
x_axis = np.log10(Cs)
for i, value in enumerate(penaltys):
    pyplot.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i],
                    label=value + ' Test')
    pyplot.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i],
                    label=value + ' Train')

pyplot.legend()
pyplot.xlabel('log(C)')
pyplot.ylabel('accuracy')
pyplot.savefig('LogisticGridSearchCV_C.png')
pyplot.show()
from sklearn.linear_model import LogisticRegressionCV

# Candidate values of C for the built-in cross-validated search.
Cs = [1, 10, 100, 1000]

# With many samples (~70k) and 93 features, the saga solver (new in
# scikit-learn 0.19) is an alternative for L1 regularisation; liblinear is
# what is used here.
lr_cv = LogisticRegressionCV(Cs=Cs, cv=5, penalty='l1', solver='liblinear',
                             multi_class='ovr')
lr_cv.fit(X_train, y_train)

# Cross-validation scores of the fitted LogisticRegressionCV: a dict keyed by
# class label. Judging by the output shown below, each value holds one row per
# fold and one column per candidate C.
lr_cv.scores_
{0: array([[ 0.90140845, 0.88028169, 0.85211268, 0.83802817],
[ 0.89361702, 0.87943262, 0.85106383, 0.84397163],
[ 0.89285714, 0.89285714, 0.86428571, 0.85714286],
[ 0.90714286, 0.85714286, 0.81428571, 0.81428571],
[ 0.91304348, 0.85507246, 0.81884058, 0.80434783]]),
1: array([[ 0.8943662 , 0.88732394, 0.88028169, 0.86619718],
[ 0.92198582, 0.90780142, 0.85815603, 0.83687943],
[ 0.90714286, 0.91428571, 0.9 , 0.87142857],
[ 0.87857143, 0.85 , 0.85 , 0.85714286],
[ 0.89855072, 0.89855072, 0.85507246, 0.86956522]]),
2: array([[ 0.88028169, 0.84507042, 0.86619718, 0.83802817],
[ 0.87234043, 0.86524823, 0.83687943, 0.82269504],
[ 0.9 , 0.9 , 0.85714286, 0.85 ],
[ 0.90714286, 0.87857143, 0.82857143, 0.81428571],
[ 0.9057971 , 0.86956522, 0.87681159, 0.88405797]]),
3: array([[ 0.93661972, 0.8943662 , 0.88028169, 0.85211268],
[ 0.92198582, 0.90780142, 0.90780142, 0.87943262],
[ 0.91428571, 0.89285714, 0.9 , 0.9 ],
[ 0.91428571, 0.90714286, 0.87857143, 0.85714286],
[ 0.95652174, 0.89855072, 0.85507246, 0.84782609]]),
4: array([[ 0.98591549, 0.97887324, 0.98591549, 0.98591549],
[ 1. , 1. , 1. , 0.9929078 ],
[ 0.97857143, 0.98571429, 0.99285714, 0.99285714],
[ 1. , 1. , 0.99285714, 0.97857143],
[ 0.99275362, 0.99275362, 0.99275362, 0.99275362]]),
5: array([[ 0.97887324, 0.94366197, 0.94366197, 0.93661972],
[ 0.95744681, 0.96453901, 0.95035461, 0.93617021],
[ 0.95714286, 0.90714286, 0.89285714, 0.87857143],
[ 0.96428571, 0.95714286, 0.95 , 0.94285714],
[ 0.98550725, 0.96376812, 0.94927536, 0.94927536]]),
6: array([[ 0.8943662 , 0.85915493, 0.82394366, 0.80985915],
[ 0.91489362, 0.87234043, 0.84397163, 0.85815603],
[ 0.94285714, 0.91428571, 0.9 , 0.9 ],
[ 0.92857143, 0.93571429, 0.90714286, 0.9 ],
[ 0.88405797, 0.86956522, 0.85507246, 0.84057971]]),
7: array([[ 0.92253521, 0.91549296, 0.92253521, 0.9084507 ],
[ 0.92907801, 0.91489362, 0.91489362, 0.90780142],
[ 0.95714286, 0.94285714, 0.91428571, 0.91428571],
[ 0.93571429, 0.92857143, 0.93571429, 0.92142857],
[ 0.94927536, 0.91304348, 0.91304348, 0.9057971 ]]),
8: array([[ 0.92957746, 0.87323944, 0.84507042, 0.83802817],
[ 0.90780142, 0.88652482, 0.86524823, 0.85106383],
[ 0.88571429, 0.83571429, 0.85 , 0.82857143],
[ 0.93571429, 0.9 , 0.84285714, 0.84285714],
[ 0.9057971 , 0.84782609, 0.84782609, 0.84782609]])}