"""Compute the cosine similarity of every pair of labelled vectors in a CSV.

The input CSV (``FNAME``) is read without a header row. Column 0 of each row
is used as the row's name and columns 1..49 as its vector. One output row
``name1, name2, cosine similarity`` is written per unordered pair of input
rows.
"""
import random
from itertools import combinations

import numpy as np
import pandas as pd

FNAME = "label_average_cosine_50_cluter_2times_25only.csv"
# Only used to build the output file name (original note here: "149 item names").
CLUSTER_NUM = 25
# Columns of the input CSV that hold the vector components (column 0 is the name).
VECTOR_COLUMNS = slice(1, 50)


def cos_sim(v1, v2):
    """Return the cosine similarity of vectors *v1* and *v2*.

    The norms are not checked for zero, so a zero vector is not handled
    specially.
    """
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def rand_ints_nodup(a, b, k):
    """Return *k* distinct random integers drawn from the closed range [a, b].

    Values are returned in the order they were first drawn.

    Raises:
        ValueError: if the range holds fewer than *k* integers. Without this
            check the rejection loop below could never finish.
    """
    if k > max(b - a + 1, 0):
        raise ValueError(
            f"cannot draw {k} distinct integers from [{a}, {b}]"
        )
    picked = []
    seen = set()  # O(1) duplicate check; `picked` keeps the draw order
    while len(picked) < k:
        n = random.randint(a, b)
        if n not in seen:
            seen.add(n)
            picked.append(n)
    return picked


def load_vectors(path):
    """Read the CSV at *path* and return ``(names, vectors)``.

    ``names`` lists column 0 of every row in file order. ``vectors`` maps each
    name to a NumPy array built from that row's ``VECTOR_COLUMNS``. If a name
    occurs more than once, the last row with that name wins in ``vectors``
    while ``names`` keeps every occurrence.
    """
    df = pd.read_csv(path, delimiter=",", header=None)
    names = []
    vectors = {}
    for i in range(len(df)):
        name = df.iat[i, 0]
        names.append(name)
        vectors[name] = np.array(df.iloc[i, VECTOR_COLUMNS].tolist())
    return names, vectors


def pair_similarities(names, vectors):
    """Return ``[name1, name2, cos_sim]`` for every unordered pair in *names*.

    Pairs are taken by position (first index < second index), in the same
    order a nested ``for i / for j`` loop would first meet them.
    """
    return [
        [first, second, cos_sim(vectors[first], vectors[second])]
        for first, second in combinations(names, 2)
    ]


def main():
    """Load the vectors, print them, and write all pairwise similarities."""
    print("CSV list reading...")
    names, vectors = load_vectors(FNAME)
    print(vectors)
    print(names)

    rows = pair_similarities(names, vectors)
    result = pd.DataFrame(rows, columns=['name1', 'name2', 'コサイン類似度'])
    result.to_csv(
        "cos_sim_all_for_" + str(CLUSTER_NUM) + ".csv",
        index=False,
        encoding="utf-8-sig",
    )


if __name__ == "__main__":
    main()