# 目标:熟悉各种聚类算法的调用,并用评价指标选择合适的超参数 数据:同神经网络
# (Goal: practice calling various clustering algorithms and use evaluation
# metrics to choose suitable hyperparameters. Data: same as the neural-network exercise.)
# Originally targeted python 2.7 inside a Jupyter notebook.
# Import the required packages (one per line, grouped stdlib / third-party).
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# %matplotlib inline  # IPython magic — valid only inside Jupyter, so kept as a comment
# Read the training data and keep the first n_trains samples.
train = pd.read_csv('./data/number_train.csv')
n_trains = 1000
y_train = train.label.values[:n_trains]
X_train = train.drop("label", axis=1).values[:n_trains]
# Scale pixel values from [0, 255] down to [0, 1].
# BUG FIX: in the original, this statement had been swallowed into the
# trailing comment of the previous line, so the data was never normalized.
X_train = X_train / 255.0

# Shape of the raw input: (n_samples, n_features)
print('训练样本的规模及每一样本的维数为:{}'.format(X_train.shape))

# PCA dimensionality reduction; n_components=0.71 keeps enough components
# to explain 71% of the variance (value chosen in an earlier PCA experiment).
pca = PCA(n_components=0.71)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
# Feature dimensionality after the reduction
print(X_train_pca.shape)

# Split the training data into a training part and a validation part;
# the validation part is used to pick the best hyperparameter (cluster count K).
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train, train_size=0.8, random_state=0)
# Sample counts after the split
print(X_train_part.shape)
print(X_val.shape)

# Evaluate one hyperparameter point (cluster count K) on the validation set.
def K_cluster_analysis(K, X_train, y_train, X_val, y_val):
    """Fit MiniBatchKMeans with K clusters and score the clustering.

    Parameters
    ----------
    K : int
        Number of clusters.
    X_train, y_train : arrays
        Training features and (unused here) labels.
    X_val, y_val : arrays
        Validation features and ground-truth labels.

    Returns
    -------
    (float, float)
        Internal score on the training data and V-measure on the
        validation labels; for both, higher means a better clustering.
    """
    start = time.time()
    print("K-means begin with clusters: {}".format(K))
    # Fit K-means on the training split.
    mb_kmeans = MiniBatchKMeans(n_clusters=K)
    mb_kmeans.fit(X_train)
    # Predict cluster assignments; compute the training assignment once
    # instead of calling predict(X_train) twice.
    y_train_pred = mb_kmeans.predict(X_train)
    y_val_pred = mb_kmeans.predict(X_val)
    # Internal metric needing no labels. Common choices are the silhouette
    # coefficient and the Calinski-Harabasz index (higher is better).
    # NOTE(review): the variable is named CH_score for historical reasons,
    # but it actually holds the SILHOUETTE score; the Calinski-Harabasz
    # alternative would be metrics.calinski_harabasz_score(X_train, y_train_pred)
    # (spelled calinski_harabaz_score in old sklearn versions).
    CH_score = metrics.silhouette_score(X_train, y_train_pred)
    # External metric: V-measure against the validation ground truth.
    V_score = metrics.v_measure_score(y_val, y_val_pred)
    end = time.time()
    print("CH_score: {}, time elaps:{}".format(CH_score, int(end - start)))
    print("V_score: {}".format(V_score))
    return CH_score, V_score

# Hyperparameter search range for the cluster count K.
Ks = [10, 20, 30, 40, 50, 60]
CH_scores = []
V_scores = []
for K in Ks:
    ch, v = K_cluster_analysis(K, X_train_part, y_train_part, X_val, y_val)
    CH_scores.append(ch)
    V_scores.append(v)

# Plot model performance for each K; the best K is the one with the
# highest score. (The original comment said "PCA dimensionality", but the
# x-axis here is the cluster count K.)
# BUG FIX: the first plt.plot call had been swallowed into a trailing
# comment in the original, so the CH/silhouette curve was never drawn.
plt.plot(Ks, np.array(CH_scores), 'b-')   # internal (silhouette) score
plt.plot(Ks, np.array(V_scores), 'g-')    # external V-measure score

# Visualize the clustering result: one color per cluster, each point drawn
# as the digit's true label at its first two PCA coordinates.
colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524c90', '#ee4c90']
n_clusters = 10
mb_kmeans = MiniBatchKMeans(n_clusters=n_clusters)
mb_kmeans.fit(X_train_pca)
y_train_pred = mb_kmeans.labels_          # cluster assignment per sample
cents = mb_kmeans.cluster_centers_        # cluster centroids
fontdict = {'weight': 'bold', 'size': 9}
for i in range(n_clusters):
    # np.nonzero returns the indices of samples assigned to cluster i.
    index = np.nonzero(y_train_pred == i)[0]
    # First two PCA components of those samples.
    x1 = X_train_pca[index, 0]
    x2 = X_train_pca[index, 1]
    y_i = y_train[index]                  # true digit labels of the cluster
    # Draw at most 20 labels per cluster. (The original iterated every
    # sample and tested `j < 20` inside the loop; min() gives the same
    # output without the wasted iterations.)
    for j in range(min(len(x1), 20)):
        plt.text(x1[j], x2[j], str(int(y_i[j])), color=colors[i], fontdict=fontdict)
plt.axis([-5, 10, -6, 6])
plt.show()
