# Source code written for items 1) and 2): k-means_optimal_clusters.py.txt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
#% matplotlib inline
from IPython.core.pylabtools import figsize
import matplotlib.pyplot as plt
def select_n_cluster(X,cluster_Num):
    """Evaluate k-means for every cluster count from 2 up to cluster_Num - 1.

    For each candidate cluster count the function fits KMeans on X, records
    the SSE (the fitted model's ``inertia_``) and the mean silhouette score,
    and prints the score. Afterwards it saves an elbow plot (SSE versus
    cluster count) to 'figure.png' and writes the silhouette table to
    'cluster_num_silhouette_<cluster_Num>.csv'.

    Args:
        X: Samples to cluster; passed unchanged to KMeans and
            silhouette_score.
        cluster_Num: Exclusive upper bound of the candidate cluster counts.

    Returns:
        None. Results are delivered through stdout and the two output files.
    """
    # Candidate cluster counts: 2 up to (but not including) cluster_Num.
    range_n_clusters = list(range(2, cluster_Num))
    sse = []
    cluster_num_silhouette_list = []
    for n_clusters in range_n_clusters:
        clusterer = KMeans(init='k-means++', n_clusters=n_clusters, random_state=0)
        # One fit provides both the labels and the inertia. Fitting a second
        # time with the same fixed random_state would only repeat the work.
        cluster_labels = clusterer.fit_predict(X)
        # SSE (within-cluster sum of squared errors)
        sse.append(clusterer.inertia_)
        # Mean silhouette coefficient (range -1 to 1)
        silhouette_avg = silhouette_score(X, cluster_labels)
        print('For n_clusters =', n_clusters,'The average silhouette_score is :', silhouette_avg)
        cluster_num_silhouette_list.append([n_clusters, silhouette_avg])
    # Elbow plot: SSE against the number of clusters
    plt.plot(range_n_clusters, sse, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('SSE')
    # Save the plot as an image file
    plt.savefig('figure.png')
    #plt.show()
    df2 = pd.DataFrame(cluster_num_silhouette_list, columns=['cluster', 'silhouette'])
    # utf-8-sig writes a BOM at the start of the CSV file
    df2.to_csv(f"cluster_num_silhouette_{cluster_Num}.csv", index=False, encoding="utf-8-sig")
if __name__ == '__main__':
    # Load the dataset; header=None means the columns are labelled 0, 1, 2, ...
    cust_df = pd.read_csv("datasets_2times.csv",header=None)
    print(cust_df)
    # Columns 1..50 are the features (column 0 is not used). Selecting with an
    # explicit label list raises KeyError if any of these columns is missing.
    feature_columns = list(range(1, 51))
    # Rows are samples and columns are features, so no transpose is needed.
    cust_array = cust_df[feature_columns].to_numpy(dtype=np.float32)
    print(cust_array)
    # The row count is passed as the exclusive upper bound of cluster counts.
    select_n_cluster(cust_array,len(cust_df))