Data description: handwritten digit recognition; the data is the same as that used for the neural network example.
1. Split the full training data into a training set and a validation set;
2. For each candidate explained variance ratio, train a model on the split-off training set, then evaluate it on the validation set and record the result;
3. Pick the explained variance ratio whose model achieves the highest accuracy on the validation set;
4. Retrain a model with that best explained variance ratio on the full training data (the data set before the split);
5. Predict on the test set. (The sketch right after this list shows the same search expressed with scikit-learn utilities; the notebook below implements the steps by hand.)
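For reference only, a hedged sketch of steps 1-4 using scikit-learn's Pipeline, GridSearchCV and ShuffleSplit. The 0.65-0.80 grid and the 80/20 split mirror the code further down; the variable names here are illustrative, and this is not what the notebook actually runs:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.decomposition import PCA
from sklearn import svm
import numpy as np

# Pipeline: PCA (keep enough components for the target variance ratio) followed by an SVC
pipe = Pipeline([('pca', PCA()), ('svc', svm.SVC())])
# One 80/20 train/validation split, matching the manual train_test_split used below
single_split = ShuffleSplit(n_splits=1, train_size=0.8, random_state=0)
search = GridSearchCV(pipe,
                      param_grid={'pca__n_components': np.linspace(0.65, 0.80, num=15)},
                      cv=single_split)
# search.fit(X_train, y_train)          # X_train / y_train are loaded further down
# print(search.best_params_)            # best explained variance ratio
# y_predict = search.predict(X_test)    # GridSearchCV refits on the full training data by default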
# python 2.7
# Import the required packages
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import time
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Read the training and test data
train = pd.read_csv('./data/number_train.csv')
test = pd.read_csv('./data/number_test.csv')
y_train = train.label.values
X_train = train.drop('label' , axis = 1).values
X_test = test.values

# Plot the first 75 training images (5 rows x 15 columns)
rows = 5
cols = 15
fig1 , ax1 = pyplot.subplots(rows ,cols ,figsize=(10 , 5))
# Font settings for the subplot titles (the labels)
fontdict = {'fontsize': 20,'fontweight' : 6,'verticalalignment': 'baseline','horizontalalignment': 'center'}
# 28*28 = 784 in reshape() is the dimensionality of each sample
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_train[j*cols+i].reshape(28 , 28))
        ax1[j][i].axis('off')
        ax1[j][i].set_title(y_train[j*cols+i] , fontdict = fontdict)
X_train_source = X_train

# Scale pixel values from [0 , 255] to [0 , 1]
X_train = X_train / 255.0
X_test = X_test / 255.0
# Size of the full training data and of the test data
print('Full training data shape (samples, features): {}'.format(X_train.shape))
print('Test data shape (samples, features): {}'.format(X_test.shape))
# Split the training data into a training set and a validation set; the best hyperparameter
# (the target PCA explained variance ratio) is chosen on the validation set
X_train_part , X_val , y_train_part , y_val = train_test_split(X_train , y_train , train_size = 0.8 , random_state = 0)
# Sizes of the training and validation sets after the split
print "Training set shape (samples, features):" , X_train_part.shape
print "Validation set shape (samples, features):" , X_val.shape

# Train and evaluate the model for one hyperparameter value (target explained variance ratio n),
# returning its accuracy on the validation set
def n_component_analysis(n , X_train , y_train , X_val , y_val):
    start = time.time()
    # When n_components is a float in (0, 1), PCA keeps just enough components
    # to explain that fraction of the total variance
    pca = PCA(n_components = n)
    print "Target PCA explained variance ratio:" , n
    pca.fit(X_train)
    # Project the training and validation sets onto the reduced space
    X_train_pca = pca.transform(X_train)
    X_val_pca = pca.transform(X_val)
    # Train an SVC on the reduced features
    print ('SVC begin')
    clf1 = svm.SVC()
    clf1.fit(X_train_pca , y_train)
    # Accuracy on the validation set
    accuracy = clf1.score(X_val_pca , y_val)
    end = time.time()
    print ("accuracy: {} , time elapsed: {}".format(accuracy , int(end-start)))
    return accuracy

# Search range for the hyperparameter (target explained variance ratio)
n_s = np.linspace(0.65 , 0.80 , num = 15)
accuracy = []
for n in n_s:
    tmp = n_component_analysis(n , X_train_part , y_train_part , X_val , y_val)
    accuracy.append(tmp)
Target PCA explained variance ratio: 0.65
SVC begin
accuracy: 0.94395280236 , time elapsed: 3
Target PCA explained variance ratio: 0.660714285714
SVC begin
accuracy: 0.946902654867 , time elapsed: 3
Target PCA explained variance ratio: 0.671428571429
SVC begin
accuracy: 0.94395280236 , time elapsed: 3
Target PCA explained variance ratio: 0.682142857143
SVC begin
accuracy: 0.955752212389 , time elapsed: 3
Target PCA explained variance ratio: 0.692857142857
SVC begin
accuracy: 0.952802359882 , time elapsed: 4
Target PCA explained variance ratio: 0.703571428571
SVC begin
accuracy: 0.958702064897 , time elapsed: 4
Target PCA explained variance ratio: 0.714285714286
SVC begin
accuracy: 0.952802359882 , time elapsed: 3
Target PCA explained variance ratio: 0.725
SVC begin
accuracy: 0.952802359882 , time elapsed: 3
Target PCA explained variance ratio: 0.735714285714
SVC begin
accuracy: 0.955752212389 , time elapsed: 3
Target PCA explained variance ratio: 0.746428571429
SVC begin
accuracy: 0.949852507375 , time elapsed: 3
Target PCA explained variance ratio: 0.757142857143
SVC begin
accuracy: 0.946902654867 , time elapsed: 3
Target PCA explained variance ratio: 0.767857142857
SVC begin
accuracy: 0.94395280236 , time elapsed: 3
Target PCA explained variance ratio: 0.778571428571
SVC begin
accuracy: 0.946902654867 , time elapsed: 3
Target PCA explained variance ratio: 0.789285714286
SVC begin
accuracy: 0.94395280236 , time elapsed: 3
Target PCA explained variance ratio: 0.8
SVC begin
accuracy: 0.941002949853 , time elapsed: 3
# Plot validation accuracy against the target explained variance ratio to locate the best value (highest score)
pyplot.plot(n_s , np.array(accuracy) , 'b-')
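The notebook reads the best value off the plot and rounds it to 0.71 below; picking it programmatically from the recorded accuracies is a one-line alternative (a hedged sketch reusing the n_s and accuracy lists built above):

best_n = n_s[int(np.argmax(accuracy))]
print("Best explained variance ratio on the validation set: {}".format(best_n))  # ~0.7036 for the run logged above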

# Best hyperparameter found on the validation set
pca = PCA(n_components = 0.71)
# Refit PCA with the best value on the full training data
pca.fit(X_train)

print 'Number of PCA components actually used for training:' , pca.n_components_
print 'Explained variance ratio of the retained components:\n' , pca.explained_variance_ratio_
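As a quick sanity check (not in the original notebook), the retained ratios should sum to at least the requested 0.71:

print("Cumulative explained variance: {}".format(pca.explained_variance_ratio_.sum()))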

# Project the full training data onto the reduced space fitted with the best hyperparameter
X_train_pca = pca.transform(X_train)
# Project the test data onto the same space
X_test_pca = pca.transform(X_test)
# Feature dimensionality after the reduction
print X_train_pca.shape
print X_test_pca.shape

# Train an SVM classifier on the reduced training data
clf = svm.SVC()
clf.fit(X_train_pca , y_train)

# Predict on the test set
y_predict = clf.predict(X_test_pca)
# Build the submission file with the test predictions
import pandas as pd
df = pd.DataFrame(y_predict)
df.columns=['Label']
df.index+=1
df.index.name = 'Imageid'
df.to_csv('SVC_Minist_submission.csv', header=True)

# Plot the first 90 test images with their predicted labels (6 rows x 15 columns)
rows = 6
cols = 15
fig1 , ax1 = pyplot.subplots(rows ,cols ,figsize=(12 , 8))
fontdict = {'fontsize': 20,'fontweight' : 6,'verticalalignment': 'baseline','horizontalalignment': 'center'}
# 28*28 = 784 in reshape() is the dimensionality of each sample
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_test[j*cols+i].reshape(28 , 28))
        ax1[j][i].axis('off')
        ax1[j][i].set_title(y_predict[j*cols+i] , fontdict = fontdict)