600字范文 > K-means算法在手写体数字图像数据上的使用示例-代码详解

K-means算法在手写体数字图像数据上的使用示例-代码详解

时间：2024-05-05 10:52:01

对应书上第84页的代码。

# coding: utf-8
# K-means demonstration script, in three parts:
#   1. Cluster the optdigits handwritten-digit data and evaluate with ARI.
#   2. Compare silhouette coefficients for several cluster counts on a small
#      hand-written 2-D point set.
#   3. Choose k with the elbow method on three synthetic uniform clusters.
#
# The statements below were reconstructed from a listing whose line breaks
# had been lost. Two Python 3 incompatibilities were fixed along the way:
# the `print` statement became a function call, and `zip(...)` is turned into
# a list before being handed to `np.array`.

# numpy / pandas / matplotlib for math, data loading and plotting;
# scikit-learn for K-means and the clustering metrics; scipy for distances.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ---------------------------------------------------------------------------
# Part 1: K-means on the handwritten-digit data, evaluated with ARI.
# ---------------------------------------------------------------------------

# Read the training and test sets with pandas; the files have no header row.
digits_train = pd.read_csv('optdigits.tra', header=None)
digits_test = pd.read_csv('optdigits.tes', header=None)

# Split each set into the 64 pixel-feature columns (0..63) and the single
# digit-target column (64).
X_train = digits_train[np.arange(64)]  # the 64 attribute values
y_train = digits_train[64]  # the single result value
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

# Initialise K-means with 10 cluster centres and fit it on the training set.
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)

# Assign every test image to its nearest cluster centre.
y_pred = kmeans.predict(X_test)

# In[2]:
# Evaluate the clustering with the Adjusted Rand Index (ARI).
# (Was a Python 2 `print` statement, which is a SyntaxError on Python 3.)
print(metrics.adjusted_rand_score(y_test, y_pred))

# ---------------------------------------------------------------------------
# Part 2: silhouette coefficient for different numbers of clusters.
# ---------------------------------------------------------------------------

# Split the figure into 3 rows x 2 columns = 6 subplots; draw in subplot 1.
plt.subplot(3, 2, 1)

# The raw data points.
x1 = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
x2 = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
# Pair the coordinates into an array of (x1, x2) points, one row per point.
# `zip` is lazy on Python 3, so it must be materialised with `list` first;
# otherwise `np.array` wraps the iterator in a 0-d object array and the
# reshape fails.
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)

# Plot the distribution of the raw points in subplot 1.
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.title('Instances')
plt.scatter(x1, x2)

colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']

# Numbers of clusters to try.
clusters = [2, 3, 4, 5, 8]
# Starts at 1 because subplot 1 is already used by the raw-data plot, so the
# first clustering result goes into subplot 2.
subplot_counter = 1
sc_scores = []
for t in clusters:
    # t is the cluster count for this iteration.
    print('t=', t)
    subplot_counter += 1
    print('subplot_counter=', subplot_counter)
    plt.subplot(3, 2, subplot_counter)
    # Partition the point set X into t clusters.
    kmeans_model = KMeans(n_clusters=t).fit(X)
    for i, l in enumerate(kmeans_model.labels_):
        # i selects the point; its cluster label l selects colour and marker.
        plt.plot(x1[i], x2[i], color=colors[l], marker=markers[l], ls='None')
    plt.xlim([0, 10])
    plt.ylim([0, 10])
    # Silhouette coefficient of this clustering.
    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)

    # Show the cluster count and its silhouette coefficient as the title.
    plt.title('K = %s, silhouette coefficient= %0.03f' % (t, sc_score))

# Plot silhouette coefficient against number of clusters
# (the curve in figure 2-12 on page 88 of the book).
plt.figure()
plt.plot(clusters, sc_scores, '*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')

plt.show()

# ---------------------------------------------------------------------------
# Part 3: selecting k with the elbow method.
# ---------------------------------------------------------------------------

# Draw three clusters from uniform distributions, 10 samples per cluster;
# each array has shape (2, 10): one row per coordinate.
cluster1 = np.random.uniform(0.5, 1.5, (2, 10))
cluster2 = np.random.uniform(5.5, 6.5, (2, 10))
cluster3 = np.random.uniform(3.0, 4.0, (2, 10))

# Concatenate the three arrays column-wise and transpose so that each of the
# 30 samples is one row, then plot them.
X = np.hstack((cluster1, cluster2, cluster3)).T
print('X=', X)
plt.scatter(X[:, 0], X[:, 1])
print('X[:,0]=', X[:, 0])  # first column of X
print('X[:,1]=', X[:, 1])  # second column of X
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()

# Measure clustering quality for 9 different numbers of cluster centres and
# plot the result (this produces figure 2-15 of the book).
K = range(1, 10)
print("------------------------------")
print("K=", K)
meandistortions = []
for k in K:
    # Fit K-means with k clusters.
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)
    # Mean Euclidean distance from each sample to its nearest cluster centre.
    meandistortions.append(
        sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))
        / X.shape[0]
    )

plt.plot(K, meandistortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Average Dispersion')
plt.title('Selecting k with the Elbow Method')
plt.show()

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。