数据说明:数据来自Kaggle 2015年举办的Otto Group Product Classification Challenge竞赛。Otto数据集是著名电商Otto提供的一个多类商品分类问题,类别数=9,每个样本有93维数值特征(整数,表示某种事件发生的次数,已进行过脱敏处理)。 参数调优说明:分别使用缺省参数的LogisticRegression、LogisticRegression + GridSearchCV以及LogisticRegressionCV进行建模与参数调优;实际应用中LogisticRegression + GridSearchCV或LogisticRegressionCV任选一个即可。
# python 2.7 python 2.7 python 2.7 #import必要的模块 import pandas as pd import numpy as np # GridSearchCV用来做模型选择,GridSearchCV是用于交叉验证的类 from sklearn.model_selection import GridSearchCV #竞赛的评价指标为logloss from sklearn.metrics import log_loss from matplotlib import pyplot import seaborn as sns %matplotlib inline
# Load the training table from the local data directory.
dpath = './logistic/'
csv_path = dpath + "Otto_train_test.csv"
train = pd.read_csv(csv_path)

# Peek at the first rows to sanity-check the parsed columns.
train.head()
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train.info()
......
#各属性的统计特性 train.describe()
......
# Target distribution: check whether the classes are balanced.
# When the classes are imbalanced, cross-validation for a classification task
# should use StratifiedKFold, i.e. each fold is sampled proportionally to the
# class frequencies. The scikit-learn cross-validation helpers use
# StratifiedKFold by default for classifiers.
#
# The column is passed through the `x` keyword: recent seaborn releases no
# longer interpret a positional first argument as the x variable, while the
# keyword form is accepted by old and new versions alike.
sns.countplot(x=train.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')
# Turn the class strings into integers.
# Labels have the form "Class_x": drop the 6-character prefix, parse the
# remainder as an int and shift by one so that class ids start at 0.
y_train = train['target'].map(lambda label: int(label[6:]) - 1)

# Every remaining column (all but the id and the label) is a feature.
train = train.drop(["id", "target"], axis=1)
X_train = np.array(train)
# Feature standardisation.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features, then rescale them with it.
# No separate test split is transformed here because the model is evaluated
# with cross-validation instead of a held-out test set.
ss_X = StandardScaler()
ss_X.fit(X_train)
X_train = ss_X.transform(X_train)
# Baseline: logistic regression with default hyper-parameters, scored by
# cross-validation. Cross-validation serves both performance estimation and
# hyper-parameter tuning (model selection); for classifiers scikit-learn
# stratifies the folds by default.
from sklearn.linear_model import LogisticRegression
# cross_val_score is imported from sklearn.model_selection, like GridSearchCV
# elsewhere in this file; the legacy sklearn.cross_validation module was
# deprecated and later removed from scikit-learn.
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()

# Per-fold accuracy over a 2-fold split.
scores = cross_val_score(lr, X_train, y_train, cv=2, scoring='accuracy')

# Single-argument print calls produce the same output on Python 2 and 3.
print('交叉验证中每折的准确度为:')
print(scores)
print('%s %s' % ('cv accuracy is:', scores.mean()))
logistic回归需要调整的超参数有:C(正则强度的倒数,C越小正则越强;候选值一般在log域(取log后的值)上均匀选取)和正则函数penalty(L2/L1)。在sklearn中目标函数为:$J = C \sum_i \mathrm{logloss}(f(x_i), y_i) + \mathrm{penalty}(w)$
在sklearn框架下,不同学习器的参数调整步骤相同:(1)设置候选参数集合;(2)调用GridSearchCV;(3)调用fit。
# Tune the regularisation type and strength of logistic regression with an
# exhaustive grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate hyper-parameters: both penalties crossed with C values spaced
# evenly on a log scale.
penaltys = ['l1', 'l2']
Cs = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penaltys, C=Cs)

# The solver is pinned to liblinear because it supports both the L1 and the L2
# penalty. It was the implicit default in older scikit-learn releases; the
# newer default solver rejects penalty='l1', which would make half of the grid
# fail.
# NOTE(review): recent scikit-learn releases may warn about liblinear on
# multiclass targets - confirm against the installed version.
lr_penalty = LogisticRegression(solver='liblinear')

# GridSearchCV takes the estimator first and the parameter grid second.
# return_train_score=True keeps the mean/std train scores in cv_results_,
# which the plotting code in this file reads; newer releases omit them unless
# asked.
grid = GridSearchCV(lr_penalty, tuned_parameters, cv=5,
                    return_train_score=True)
grid.fit(X_train, y_train)
# 输出结果 grid.cv_results_
out[46]:
{'mean_fit_time': array([ 2.56000519e-02, 6.05999947e-02, 3.22000504e-02, 1.10799980e-01, 6.85999870e-02, 1.75799990e-01, 2.18599987e-01, 3.23199940e-01, 2.49199996e+00, 6.04399967e-01, 4.23484001e+01, 9.69399977e-01, 6.73174000e+01, 1.87620001e+00]), 'mean_score_time': array([ 0.00139995, 0.00120001, 0.00099993, 0.00100002, 0.00060005, 0.00180001, 0.00060005, 0.00100007, 0.00119996, 0.00140009, 0.00119996, 0.00100002, 0.00079999, 0.00699997]), 'mean_test_score': array([ 0.09985735, 0.60912981, 0.2339515 , 0.64051355, 0.62339515, 0.66761769, 0.67332382, 0.65192582, 0.62482168, 0.63766049, 0.57631954, 0.60057061, 0.54778887, 0.58059914]), 'mean_train_score': array([ 0.0998579 , 0.68866596, 0.23680409, 0.74715815, 0.69901865, 0.81170009, 0.85983567, 0.86768396, 0.94116088, 0.91297004, 0.98429903, 0.94972342, 0.98964857, 0.97539203]), 'param_C': masked_array(data = [0.001 0.001 0.01 0.01 0.1 0.1 1 1 10 10 100 100 1000 1000], mask = [False False False False False False False False False False False False False False], fill_value = ?), 'param_penalty': masked_array(data = ['l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2' 'l1' 'l2'], mask = [False False False False False False False False False False False False False False], fill_value = ?), 'params': ({'C': 0.001, 'penalty': 'l1'}, {'C': 0.001, 'penalty': 'l2'}, {'C': 0.01, 'penalty': 'l1'}, {'C': 0.01, 'penalty': 'l2'}, {'C': 0.1, 'penalty': 'l1'}, {'C': 0.1, 'penalty': 'l2'}, {'C': 1, 'penalty': 'l1'}, {'C': 1, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l1'}, {'C': 10, 'penalty': 'l2'}, {'C': 100, 'penalty': 'l1'}, {'C': 100, 'penalty': 'l2'}, {'C': 1000, 'penalty': 'l1'}, {'C': 1000, 'penalty': 'l2'}), 'rank_test_score': array([14, 8, 13, 4, 7, 2, 1, 3, 6, 5, 11, 9, 12, 10]), 'split0_test_score': array([ 0.09859155, 0.61971831, 0.21830986, 0.62676056, 0.6056338 , 0.65492958, 0.66901408, 0.63380282, 0.59859155, 0.59859155, 0.58450704, 0.57746479, 0.55633803, 0.57042254]), 'split0_train_score': array([ 
0.10017889, 0.6940966 , 0.23613596, 0.76386404, 0.70483005, 0.81395349, 0.84973166, 0.86225403, 0.94454383, 0.90697674, 0.98211091, 0.95348837, 0.98926655, 0.97674419]), 'split1_test_score': array([ 0.09929078, 0.60992908, 0.23404255, 0.68085106, 0.63829787, 0.66666667, 0.64539007, 0.65957447, 0.65248227, 0.65957447, 0.57446809, 0.63120567, 0.5248227 , 0.60283688]), 'split1_train_score': array([ 0.1 , 0.6875 , 0.225 , 0.74642857, 0.70178571, 0.81428571, 0.8625 , 0.87678571, 0.94285714, 0.91071429, 0.97678571, 0.95 , 0.98214286, 0.96785714]), 'split2_test_score': array([ 0.1 , 0.6 , 0.21428571, 0.64285714, 0.63571429, 0.65714286, 0.65714286, 0.63571429, 0.62142857, 0.67142857, 0.58571429, 0.58571429, 0.57142857, 0.61428571]), 'split2_train_score': array([ 0.09982175, 0.70053476, 0.24064171, 0.73975045, 0.70409982, 0.80926916, 0.87344029, 0.86987522, 0.93582888, 0.90909091, 0.99108734, 0.95187166, 0.99286988, 0.98217469]), 'split3_test_score': array([ 0.1 , 0.61428571, 0.25714286, 0.63571429, 0.61428571, 0.67142857, 0.68571429, 0.67142857, 0.61428571, 0.62142857, 0.58571429, 0.60714286, 0.55714286, 0.55714286]), 'split3_train_score': array([ 0.09982175, 0.67736185, 0.2513369 , 0.73796791, 0.70231729, 0.80926916, 0.85204991, 0.85561497, 0.94474153, 0.91622103, 0.98039216, 0.95187166, 0.98573975, 0.97682709]), 'split4_test_score': array([ 0.10144928, 0.60144928, 0.24637681, 0.61594203, 0.62318841, 0.6884058 , 0.71014493, 0.65942029, 0.63768116, 0.63768116, 0.55072464, 0.60144928, 0.52898551, 0.55797101]), 'split4_train_score': array([ 0.09946714, 0.68383659, 0.23090586, 0.74777975, 0.68206039, 0.81172291, 0.86145648, 0.87388988, 0.93783304, 0.92184725, 0.99111901, 0.94138544, 0.9982238 , 0.97335702]), 'std_fit_time': array([ 2.57683653e-03, 1.08922055e-02, 4.11827777e-03, 8.56509030e-03, 4.31736540e-03, 1.75886413e-02, 7.20000664e-03, 2.18577453e-02, 3.05847666e-01, 3.42380435e-02, 1.12247750e+01, 8.47834657e-02, 5.65952800e+00, 2.16019833e-01]), 'std_score_time': 
array([ 4.89920847e-04, 7.48379193e-04, 0.00000000e+00, 6.32485093e-04, 4.89940316e-04, 4.00042545e-04, 4.89940316e-04, 6.32409703e-04, 1.46967817e-03, 4.89901382e-04, 7.48328219e-04, 1.16800773e-07, 3.99994861e-04, 5.54976776e-03]), 'std_test_score': array([ 0.00094665, 0.00751057, 0.01626677, 0.0221393 , 0.01246855, 0.01194485, 0.02259572, 0.01475829, 0.018716 , 0.02619427, 0.01335989, 0.01870232, 0.01787686, 0.02361698]), 'std_train_score': array([ 0.00023602, 0.00803524, 0.00894988, 0.00915795, 0.00855232, 0.00217175, 0.00845601, 0.00776275, 0.00365096, 0.00539313, 0.00581541, 0.00431285, 0.00557982, 0.00470761])}
# Report the best cross-validated score, then the parameters that achieved it.
# If the best value sits on the edge of the candidate range, extend the range
# towards larger or smaller values until a turning point shows up.
for best_item in (grid.best_score_, grid.best_params_):
    print(best_item)
# Persist the tuning results and draw the cross-validation accuracy curves.
results = grid.cv_results_

# Write every candidate and its scores to disk.
pd.DataFrame(results).to_csv('LogisticGridSearchCV_Otto.csv')

# The candidates are enumerated with C as the outer loop and the penalty as
# the inner loop, so reshaping to (len(Cs), len(penaltys)) yields one row per
# C value and one column per penalty.
number_C = len(Cs)
number_penaltys = len(penaltys)
grid_shape = (number_C, number_penaltys)

test_means = results['mean_test_score']
test_scores = np.asarray(test_means).reshape(grid_shape)
test_stds = np.asarray(results['std_test_score']).reshape(grid_shape)

# Train scores are only present when the search recorded them; skip the train
# curves instead of failing with a KeyError when they are missing.
has_train_scores = 'mean_train_score' in results
if has_train_scores:
    train_means = results['mean_train_score']
    train_scores = np.asarray(train_means).reshape(grid_shape)
    train_stds = np.asarray(results['std_train_score']).reshape(grid_shape)

# C is plotted on a base-10 logarithmic axis.
x_axis = np.log10(Cs)
for i, penalty in enumerate(penaltys):
    pyplot.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i],
                    label=penalty + ' Test')
    if has_train_scores:
        pyplot.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i],
                        label=penalty + ' Train')

pyplot.legend()
pyplot.xlabel('log(C)')
pyplot.ylabel('accuracy')
pyplot.savefig('LogisticGridSearchCV_C.png')
pyplot.show()
# Tune C with the estimator's built-in cross-validation.
from sklearn.linear_model import LogisticRegressionCV

# Candidate regularisation values.
Cs = [1, 10, 100, 1000]

# For many samples (~70k) in high dimension (93 features) with an L1 penalty
# the saga solver (new in scikit-learn 0.19) would be an option; liblinear
# with a one-vs-rest scheme is used here.
cv_settings = {
    'Cs': Cs,
    'cv': 5,
    'penalty': 'l1',
    'solver': 'liblinear',
    'multi_class': 'ovr',
}
lr_cv = LogisticRegressionCV(**cv_settings)
lr_cv.fit(X_train, y_train)
# Cross-validation scores collected during the fit: a dict keyed by class
# index whose arrays hold one row per fold and one column per candidate C.
lr_cv.scores_
{0: array([[ 0.90140845, 0.88028169, 0.85211268, 0.83802817], [ 0.89361702, 0.87943262, 0.85106383, 0.84397163], [ 0.89285714, 0.89285714, 0.86428571, 0.85714286], [ 0.90714286, 0.85714286, 0.81428571, 0.81428571], [ 0.91304348, 0.85507246, 0.81884058, 0.80434783]]), 1: array([[ 0.8943662 , 0.88732394, 0.88028169, 0.86619718], [ 0.92198582, 0.90780142, 0.85815603, 0.83687943], [ 0.90714286, 0.91428571, 0.9 , 0.87142857], [ 0.87857143, 0.85 , 0.85 , 0.85714286], [ 0.89855072, 0.89855072, 0.85507246, 0.86956522]]), 2: array([[ 0.88028169, 0.84507042, 0.86619718, 0.83802817], [ 0.87234043, 0.86524823, 0.83687943, 0.82269504], [ 0.9 , 0.9 , 0.85714286, 0.85 ], [ 0.90714286, 0.87857143, 0.82857143, 0.81428571], [ 0.9057971 , 0.86956522, 0.87681159, 0.88405797]]), 3: array([[ 0.93661972, 0.8943662 , 0.88028169, 0.85211268], [ 0.92198582, 0.90780142, 0.90780142, 0.87943262], [ 0.91428571, 0.89285714, 0.9 , 0.9 ], [ 0.91428571, 0.90714286, 0.87857143, 0.85714286], [ 0.95652174, 0.89855072, 0.85507246, 0.84782609]]), 4: array([[ 0.98591549, 0.97887324, 0.98591549, 0.98591549], [ 1. , 1. , 1. , 0.9929078 ], [ 0.97857143, 0.98571429, 0.99285714, 0.99285714], [ 1. , 1. 
, 0.99285714, 0.97857143], [ 0.99275362, 0.99275362, 0.99275362, 0.99275362]]), 5: array([[ 0.97887324, 0.94366197, 0.94366197, 0.93661972], [ 0.95744681, 0.96453901, 0.95035461, 0.93617021], [ 0.95714286, 0.90714286, 0.89285714, 0.87857143], [ 0.96428571, 0.95714286, 0.95 , 0.94285714], [ 0.98550725, 0.96376812, 0.94927536, 0.94927536]]), 6: array([[ 0.8943662 , 0.85915493, 0.82394366, 0.80985915], [ 0.91489362, 0.87234043, 0.84397163, 0.85815603], [ 0.94285714, 0.91428571, 0.9 , 0.9 ], [ 0.92857143, 0.93571429, 0.90714286, 0.9 ], [ 0.88405797, 0.86956522, 0.85507246, 0.84057971]]), 7: array([[ 0.92253521, 0.91549296, 0.92253521, 0.9084507 ], [ 0.92907801, 0.91489362, 0.91489362, 0.90780142], [ 0.95714286, 0.94285714, 0.91428571, 0.91428571], [ 0.93571429, 0.92857143, 0.93571429, 0.92142857], [ 0.94927536, 0.91304348, 0.91304348, 0.9057971 ]]), 8: array([[ 0.92957746, 0.87323944, 0.84507042, 0.83802817], [ 0.90780142, 0.88652482, 0.86524823, 0.85106383], [ 0.88571429, 0.83571429, 0.85 , 0.82857143], [ 0.93571429, 0.9 , 0.84285714, 0.84285714], [ 0.9057971 , 0.84782609, 0.84782609, 0.84782609]])}