import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
#% matplotlib inline
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt


def select_n_cluster(X, cluster_Num):
    """Sweep KMeans cluster counts and record SSE and silhouette scores.

    For every cluster count ``k`` in ``range(2, cluster_Num)`` (the upper
    bound is exclusive) a KMeans model is fitted on ``X``. The function then:

    * prints the average silhouette score for each ``k``;
    * saves an elbow plot (SSE versus ``k``) to ``figure.png``;
    * writes the ``(k, silhouette)`` pairs to
      ``cluster_num_silhouette_<cluster_Num>.csv``.

    Args:
        X: Feature matrix passed to ``KMeans.fit_predict`` and
            ``silhouette_score`` (one row per sample).
        cluster_Num: Exclusive upper bound of the cluster counts to try.
            It is also embedded in the output CSV file name.

    Returns:
        None. Results are reported via stdout, ``figure.png`` and the CSV.
    """
    range_n_clusters = list(range(2, cluster_Num))
    sse = []
    cluster_num_silhouette_list = []

    for n_clusters in range_n_clusters:
        clusterer = KMeans(init='k-means++', n_clusters=n_clusters, random_state=0)
        # fit_predict() already fits the model, so inertia_ is available
        # right away; a second fit() with the same random_state would only
        # repeat the identical computation.
        cluster_labels = clusterer.fit_predict(X)

        # SSE (within-cluster sum of squared errors).
        sse.append(clusterer.inertia_)

        # Mean silhouette coefficient (range -1 to 1) over all samples.
        silhouette_avg = silhouette_score(X, cluster_labels)
        print('For n_clusters =', n_clusters,'The average silhouette_score is :', silhouette_avg)
        cluster_num_silhouette_list.append([n_clusters, silhouette_avg])

    # Elbow plot. A dedicated figure is used and closed afterwards so that
    # repeated calls do not draw on top of a previous curve or leak figures.
    fig, ax = plt.subplots()
    ax.plot(range_n_clusters, sse, marker='o')
    ax.set_xlabel('Number of clusters')
    ax.set_ylabel('SSE')
    # Save the plot as an image; it is not shown interactively.
    fig.savefig('figure.png')
    plt.close(fig)

    df2 = pd.DataFrame(cluster_num_silhouette_list, columns=['cluster', 'silhouette'])
    df2.to_csv("cluster_num_silhouette_" + str(cluster_Num) + ".csv", index=False, encoding="utf-8-sig")


if __name__ == '__main__':
    # Load the dataset; the file has no header row, so columns are labelled
    # with integers starting at 0.
    cust_df = pd.read_csv("datasets_2times.csv", header=None)
    print(cust_df)

    # Columns 1..50 are the features (column 0 is not used). Selecting by an
    # explicit label list raises KeyError if any expected column is missing.
    feature_columns = list(range(1, 51))
    # Rows are samples and columns are features, so no transpose is needed.
    cust_array = cust_df[feature_columns].to_numpy(dtype=np.float32)
    print(cust_array)

    # The number of rows is used as the exclusive upper bound of the sweep.
    select_n_cluster(cust_array, len(cust_df))