Data description: handwritten digit recognition; the data are the same as those used in the neural network section.
1. Split the full training data into a training set and a validation set;
2. For each candidate explained-variance ratio (the fraction of total variance kept by PCA), train a model on the training split, then evaluate it on the validation set and record the result;
3. Pick the explained-variance ratio whose model scores highest on the validation set (an equivalent Pipeline/GridSearchCV sketch follows this list);
4. Retrain the model on the full training data (the data set before the split) using that best ratio;
5. Predict on the test set.
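The steps above can also be expressed more compactly with scikit-learn's Pipeline and GridSearchCV. The sketch below is only an illustration of that alternative, not the code used in the rest of this notebook; the 80/20 split, the search range, and the variable names X_train / y_train are assumed to match the data loaded further down.

# Sketch of an equivalent search: one 80/20 validation split, a grid over the
# fraction of variance kept by PCA, and an automatic refit on the full data.
import numpy as np
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit

pipe = Pipeline([('pca', PCA()), ('svc', svm.SVC())])
param_grid = {'pca__n_components': np.linspace(0.65, 0.80, num=15)}
single_split = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)  # one validation split
search = GridSearchCV(pipe, param_grid, cv=single_split, refit=True)
# search.fit(X_train, y_train)                  # X_train / y_train are defined later in this notebook
# print(search.best_params_, search.best_score_)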
# Python 2.7
# Import the required packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import time
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Read the training data and the test data
train = pd.read_csv('./data/number_train.csv')
test = pd.read_csv('./data/number_test.csv')
y_train = train.label.values
X_train = train.drop('label', axis=1).values
X_test = test.values
# Display the first 75 (rows * cols) training images with their labels
rows = 5
cols = 15
fig1, ax1 = pyplot.subplots(rows, cols, figsize=(10, 5))
# Font for the subplot titles (the digit labels)
fontdict = {'fontsize': 20, 'fontweight': 6, 'verticalalignment': 'baseline', 'horizontalalignment': 'center'}
# Each sample has 28 * 28 = 784 features, so reshape it back to a 28 x 28 image
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_train[j * cols + i].reshape(28, 28))
        ax1[j][i].axis('off')
        ax1[j][i].set_title(y_train[j * cols + i], fontdict=fontdict)
# Keep a copy of the raw pixel values
X_train_source = X_train
# Scale pixel values from [0, 255] to [0, 1]
X_train = X_train / 255.0
X_test = X_test / 255.0
# Size of the full training data and of the test data
print('Training data shape (samples, features): {}'.format(X_train.shape))
print('Test data shape (samples, features): {}'.format(X_test.shape))
# Split the training data into a training part and a validation part; the best
# hyperparameter (the explained-variance ratio kept by PCA) is chosen on the validation part
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=0)
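The 80/20 split above leaves the per-digit class proportions to chance. As an optional refinement (an assumption, not part of the original code), train_test_split can be asked to stratify on the labels so that each digit appears with the same frequency in both parts:

# Optional variant (not used below): a stratified 80/20 split that preserves
# the per-digit class proportions in the training and validation parts.
X_tr_s, X_val_s, y_tr_s, y_val_s = train_test_split(
    X_train, y_train, train_size=0.8, random_state=0, stratify=y_train)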
# Sizes of the split training and validation parts
print "Training split shape (samples, features):", X_train_part.shape
print "Validation split shape (samples, features):", X_val.shape
# Train and evaluate the model for a single hyperparameter value (PCA keeping a
# fraction n of the total variance); returns the accuracy on the validation set
def n_component_analysis(n, X_train, y_train, X_val, y_val):
    start = time.time()
    pca = PCA(n_components=n)
    print "Explained-variance ratio kept by PCA:", n
    pca.fit(X_train)
    # Project the training and validation sets onto the principal components
    X_train_pca = pca.transform(X_train)
    X_val_pca = pca.transform(X_val)
    # Train an SVC on the reduced features
    print 'SVC begin'
    clf1 = svm.SVC()
    clf1.fit(X_train_pca, y_train)
    # Return the validation accuracy
    accuracy = clf1.score(X_val_pca, y_val)
    end = time.time()
    print "accuracy: {}, time elapsed: {}".format(accuracy, int(end - start))
    return accuracy
# Search range for the hyperparameter (the fraction of variance kept by PCA)
n_s = np.linspace(0.65, 0.80, num=15)
accuracy = []
for n in n_s:
    tmp = n_component_analysis(n, X_train_part, y_train_part, X_val, y_val)
    accuracy.append(tmp)
Explained-variance ratio kept by PCA: 0.65
SVC begin
accuracy: 0.94395280236, time elapsed: 3
Explained-variance ratio kept by PCA: 0.660714285714
SVC begin
accuracy: 0.946902654867, time elapsed: 3
Explained-variance ratio kept by PCA: 0.671428571429
SVC begin
accuracy: 0.94395280236, time elapsed: 3
Explained-variance ratio kept by PCA: 0.682142857143
SVC begin
accuracy: 0.955752212389, time elapsed: 3
Explained-variance ratio kept by PCA: 0.692857142857
SVC begin
accuracy: 0.952802359882, time elapsed: 4
Explained-variance ratio kept by PCA: 0.703571428571
SVC begin
accuracy: 0.958702064897, time elapsed: 4
Explained-variance ratio kept by PCA: 0.714285714286
SVC begin
accuracy: 0.952802359882, time elapsed: 3
Explained-variance ratio kept by PCA: 0.725
SVC begin
accuracy: 0.952802359882, time elapsed: 3
Explained-variance ratio kept by PCA: 0.735714285714
SVC begin
accuracy: 0.955752212389, time elapsed: 3
Explained-variance ratio kept by PCA: 0.746428571429
SVC begin
accuracy: 0.949852507375, time elapsed: 3
Explained-variance ratio kept by PCA: 0.757142857143
SVC begin
accuracy: 0.946902654867, time elapsed: 3
Explained-variance ratio kept by PCA: 0.767857142857
SVC begin
accuracy: 0.94395280236, time elapsed: 3
Explained-variance ratio kept by PCA: 0.778571428571
SVC begin
accuracy: 0.946902654867, time elapsed: 3
Explained-variance ratio kept by PCA: 0.789285714286
SVC begin
accuracy: 0.94395280236, time elapsed: 3
Explained-variance ratio kept by PCA: 0.8
SVC begin
accuracy: 0.941002949853, time elapsed: 3
# Plot validation accuracy against the fraction of variance kept by PCA
# and locate the best setting (the highest point on the curve)
pyplot.plot(n_s, np.array(accuracy), 'b-')
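Rather than reading the maximum off the plot, the best setting can also be taken directly from the recorded accuracies. A small sketch, assuming the n_s and accuracy lists from the loop above:

# Index of the highest validation accuracy and the corresponding variance ratio
best_idx = int(np.argmax(accuracy))
best_n = n_s[best_idx]
print('best n_components: {:.4f}, validation accuracy: {:.4f}'.format(best_n, accuracy[best_idx]))

In the run recorded above the maximum sits at roughly 0.70; the next cell uses 0.71, which keeps at least that fraction of the variance.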
# Best hyperparameter value chosen from the validation results
pca = PCA(n_components=0.71)
# Refit PCA on the full training data with this setting
pca.fit(X_train)
print 'Number of principal components actually used for training:', pca.n_components_
print 'Explained-variance ratio of each retained component:\n', pca.explained_variance_ratio_
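When n_components is a float between 0 and 1, PCA keeps the smallest number of leading components whose cumulative explained variance reaches that fraction. A quick check, assuming it is run after the pca.fit call above:

# The cumulative explained variance of the retained components should just reach 0.71
cumulative = np.cumsum(pca.explained_variance_ratio_)
print('components kept: {}'.format(pca.n_components_))
print('cumulative explained variance of the kept components: {:.4f}'.format(cumulative[-1]))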
# Project the full training data with the chosen PCA
X_train_pca = pca.transform(X_train)
# Project the test data with the same PCA
X_test_pca = pca.transform(X_test)
# Feature dimensionality after PCA
print X_train_pca.shape
print X_test_pca.shape
# Train an SVM classifier on the reduced training data
clf = svm.SVC()
clf.fit(X_train_pca, y_train)
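As an optional sanity check (not in the original notebook, and somewhat slow on the full training set), the fitted classifier can be scored on its own reduced training data; a large gap between this number and the validation accuracies above would hint at overfitting:

# Accuracy of the fitted SVC on the data it was trained on (optional, slow)
train_accuracy = clf.score(X_train_pca, y_train)
print('training accuracy: {:.4f}'.format(train_accuracy))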
# Predict on the test set
y_predict = clf.predict(X_test_pca)
# Generate the submission file
df = pd.DataFrame(y_predict)
df.columns = ['Label']
df.index += 1
df.index.name = 'Imageid'
df.to_csv('SVC_Minist_submission.csv', header=True)
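To confirm the submission file has the expected layout (an index column named Imageid plus a Label column), it can be read back in; a minimal check using the file name written above:

# Read the submission back and inspect its first rows and shape
submission = pd.read_csv('SVC_Minist_submission.csv')
print(submission.head())
print('rows: {}, columns: {}'.format(len(submission), list(submission.columns)))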
# Display the first 90 (rows * cols) test images with their predicted labels
rows = 6
cols = 15
fig1, ax1 = pyplot.subplots(rows, cols, figsize=(12, 8))
# Font for the subplot titles (the predicted digits)
fontdict = {'fontsize': 20, 'fontweight': 6, 'verticalalignment': 'baseline', 'horizontalalignment': 'center'}
# Each sample has 28 * 28 = 784 features, so reshape it back to a 28 x 28 image
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_test[j * cols + i].reshape(28, 28))
        ax1[j][i].axis('off')
        ax1[j][i].set_title(y_predict[j * cols + i], fontdict=fontdict)