Notes:
# Python 2.7
# Import the required packages
import pandas as pd
import numpy as np
# from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
# Read the training and test data
train = pd.read_csv('./data/number_train.csv')
test = pd.read_csv('./data/number_test.csv')
y_train = train.label.values
X_train = train.drop('label', axis=1).values
X_test = test.values
# Plot the first 75 training images (5 rows x 15 columns)
rows = 5
cols = 15
fig1, ax1 = pyplot.subplots(rows, cols, figsize=(10, 5))
# Font settings for the label titles
fontdict = {'fontsize': 20, 'fontweight': 6, 'verticalalignment': 'baseline', 'horizontalalignment': 'center'}
# 28*28 = 784 is the dimensionality of each sample, hence the reshape
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_train[j*cols+i].reshape(28, 28))
        ax1[j][i].axis('off')
        ax1[j][i].set_title(y_train[j*cols+i], fontdict=fontdict)
# Keep copies of the raw pixel data for later error analysis
X_train_source = X_train
X_test_source = X_test
# Scale pixel values from [0, 255] to [0, 1]
X_train = X_train / 255.0
X_test = X_test / 255.0
# Shapes of the full training and test sets
print('Training set (n_samples, n_features): {}'.format(X_train.shape))
print('Test set (n_samples, n_features): {}'.format(X_test.shape))
# PCA dimensionality reduction: keep enough components to explain 85% of the variance
pca = PCA(n_components=0.85)
pca.fit(X_train)
# Number of principal components retained
pca.n_components_
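When n_components is a float in (0, 1), scikit-learn keeps the smallest number of components whose explained variance ratios sum to at least that fraction. A minimal sketch to verify this on the fitted pca object above (added here for illustration, not part of the original notebook):

# Illustrative check of the retained variance
cum_var = np.cumsum(pca.explained_variance_ratio_)
print('components kept: {}'.format(pca.n_components_))
print('cumulative explained variance: {:.4f}'.format(cum_var[-1]))  # expected to be >= 0.85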
# Project the training and test data onto the principal components
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
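To get a feel for how much image detail the 85%-variance projection preserves, the compressed samples can be mapped back to pixel space with inverse_transform. This reconstruction step is an added sketch, not part of the original pipeline:

# Illustrative: reconstruct a few digits from their PCA representation
X_train_restored = pca.inverse_transform(X_train_pca[:5])
fig, ax = pyplot.subplots(1, 5, figsize=(10, 2))
for i in range(5):
    ax[i].imshow(X_train_restored[i].reshape(28, 28))
    ax[i].axis('off')
    ax[i].set_title(y_train[i])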
# Data shapes before and after dimensionality reduction
print(X_train.shape)
print(X_test.shape)
print(X_train_pca.shape)
print(X_test_pca.shape)
# Split the training data into a training part and a validation part;
# model performance is evaluated on the validation part
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train_pca, y_train, train_size=0.8, random_state=0)
# The same split on the un-reduced data, kept for error analysis after training
X_train_part_source, X_val_source, y_train_part_source, y_val_source = train_test_split(X_train, y_train, train_size=0.8, random_state=0)
# Only X_val_source is needed later; discard the other copies
X_train_part_source = ''
y_train_part_source = ''
y_val_source = ''
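The error analysis at the end relies on the two calls to train_test_split producing the same row permutation, which holds because both use the same random_state and the same number of samples, so X_val_source[i] is the raw image behind X_val[i]. A small, optional sanity check (illustrative only):

# Illustrative: the two splits share the same shuffle, so the validation labels must match
_, _, _, y_val_check = train_test_split(X_train, y_train, train_size=0.8, random_state=0)
assert np.array_equal(y_val, y_val_check)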
# Shapes of the training part and the validation part
print(X_train_part.shape)
print(X_val.shape)
# Train a multi-layer perceptron with two hidden layers (1000 and 100 units)
clf_MLP = MLPClassifier(hidden_layer_sizes=(1000, 100), random_state=1)
clf_MLP.fit(X_train_part, y_train_part)
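All other hyperparameters are left at their scikit-learn defaults (adam solver, relu activation, max_iter=200, as shown in the report below). If training stops before convergence, a variant along the following lines could be tried; the specific values here are assumptions for illustration, not part of the original notebook:

# Hypothetical variant: more iterations plus early stopping on an internal validation split
clf_MLP_alt = MLPClassifier(hidden_layer_sizes=(1000, 100), max_iter=400,
                            early_stopping=True, validation_fraction=0.1,
                            random_state=1)
clf_MLP_alt.fit(X_train_part, y_train_part)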
# Evaluate on the validation set
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Accuracy and confusion matrix
y_predict = clf_MLP.predict(X_val)
val_accuracy = accuracy_score(y_val, y_predict)
print("Validation Accuracy: %.2f%%" % (val_accuracy * 100.0))
print("Classification report for classifier %s:\n%s\n" % (clf_MLP, metrics.classification_report(y_val, y_predict)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_val, y_predict))
Validation Accuracy: 92.63%
Classification report for classifier MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(1000, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False):
             precision    recall  f1-score   support

          0       0.92      0.96      0.94        25
          1       0.95      0.97      0.96        39
          2       0.88      0.91      0.89        23
          3       1.00      0.91      0.96        35
          4       0.93      0.96      0.95        28
          5       0.70      0.88      0.78        16
          6       1.00      0.92      0.96        52
          7       0.93      0.93      0.93        27
          8       0.88      0.88      0.88        33
          9       0.93      0.92      0.93        61

avg / total       0.93      0.93      0.93       339

Confusion matrix:
[[24  0  1  0  0  0  0  0  0  0]
 [ 0 38  0  0  0  0  0  0  1  0]
 [ 0  1 21  0  0  0  0  1  0  0]
 [ 0  0  0 32  0  1  0  0  0  2]
 [ 0  0  0  0 27  0  0  0  0  1]
 [ 0  0  1  0  0 14  0  0  1  0]
 [ 0  1  1  0  0  1 48  0  1  0]
 [ 0  0  0  0  0  0  0 25  1  1]
 [ 1  0  0  0  0  2  0  1 29  0]
 [ 1  0  0  0  2  2  0  0  0 56]]
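seaborn is imported at the top but never used; a natural use for it is a heatmap of the confusion matrix above. This is an added sketch, not part of the original code:

# Illustrative: visualize the confusion matrix as a heatmap
conf_mat = metrics.confusion_matrix(y_val, y_predict)
pyplot.figure(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
pyplot.xlabel('Predicted label')
pyplot.ylabel('True label')
pyplot.show()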
# Predict on the test set and write the submission file
y_predict_test = clf_MLP.predict(X_test_pca)
df = pd.DataFrame(y_predict_test)
df.columns = ['Label']
df.index += 1
df.index.name = 'Imageid'
df.to_csv('MLP_MNIST_submission.csv', header=True)
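A quick, optional sanity check of the submission file written above (the expected column layout is inferred from the code, not from a documented competition spec):

# Illustrative: read the file back and inspect the first rows
submission = pd.read_csv('MLP_MNIST_submission.csv')
print(submission.head())
print(submission.shape)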
# Plot the first 90 test images with their predicted labels (6 rows x 15 columns)
rows = 6
cols = 15
fig1, ax1 = pyplot.subplots(rows, cols, figsize=(12, 8))
fontdict = {'fontsize': 20, 'fontweight': 6, 'verticalalignment': 'baseline', 'horizontalalignment': 'center'}
# 28*28 = 784 is the dimensionality of each sample, hence the reshape
for j in range(rows):
    for i in range(cols):
        ax1[j][i].imshow(X_test_source[j*cols+i].reshape(28, 28))
        ax1[j][i].axis('off')
        # Use the test-set predictions (not the validation predictions) as titles
        ax1[j][i].set_title(y_predict_test[j*cols+i], fontdict=fontdict)
# Build a comparison list: 0 if the prediction is correct, 1 if it is wrong
comp = [0 if y1 == y2 else 1 for y1, y2 in zip(y_val, y_predict)]
# Collect the indices of the misclassified validation samples
wrong_index = []
for i, value in enumerate(comp):
    if value:
        wrong_index.append(i)
print("Total number of misclassified samples in the validation set: {}".format(len(wrong_index)))

# Show the misclassified samples (true label -> predicted label)
print("Examples of misclassified samples (true -> predicted):")
pyplot.figure(figsize=(8, 6))
for plot_index, image_index in enumerate(wrong_index):
    image = X_val_source[image_index]
    # Show at most 25 misclassified samples
    if plot_index == 25:
        break
    pyplot.subplot(5, 5, plot_index+1)
    pyplot.axis('off')
    pyplot.imshow(image.reshape(28, 28), cmap=pyplot.cm.gray_r, interpolation='nearest')
    info = "{right}->{wrong}".format(right=y_val[image_index], wrong=y_predict[image_index])
    pyplot.title(info, fontsize=16)
pyplot.show()