# Source code
#%% imports
import pyclustering
from pyclustering.cluster import xmeans
import numpy as np
import pylab
from gensim.models import Word2Vec
import pandas as pd
import pandas.io.common
import numpy as np
import codecs
import random
import os
import time
import re
import sys
from janome.tokenizer import Tokenizer
# Cosine-similarity threshold: two names are grouped when their similarity is >= SH.
SH = 0.6
# Path of the trained Word2Vec model file.
F_MODEL_NAME = "gs_cbow_50.model"
# Input CSV; its first column holds the names to cluster.
FNAME = "datasets_3times.csv"
# 1 = normalise every vector to unit length before clustering.
NORMAL_FLG = 1
# Output CSVs: clusters before and after the member-removal pass.
FNAME_SAVE = "3times_normal_50_{}.csv".format(SH)
FNAME_SAVE2 = "3times_normal_50_{}_2.csv".format(SH)
# Load the gensim Word2Vec model stored at F_MODEL_NAME. This runs at module
# level, and word_to_vec reads the resulting module-level `word2vec_model`.
print("Word2Vec reading...")
word2vec_model =Word2Vec.load(F_MODEL_NAME)
# Convert Japanese text into a vector
# The Janome tokenizer is created on first use and then reused, instead of
# being rebuilt for every call.
_TOKENIZER = None


def word_to_vec(item):
    """Return the sum of the Word2Vec vectors of the tokens in *item*.

    *item* is split into surface forms with Janome; tokens that are missing
    from the model vocabulary are skipped.

    Returns:
        The summed vector, or the int ``0`` when no token of *item* is in the
        vocabulary (the result of summing nothing). The script's main section
        tests for that int to drop such names, so it is kept as-is.
    """
    global _TOKENIZER
    if _TOKENIZER is None:
        _TOKENIZER = Tokenizer()
    # Look vectors up through `.wv`, the same object used for the membership
    # test; indexing the model object directly is not supported by newer
    # gensim releases.
    keyed_vectors = word2vec_model.wv
    surfaces = [token.surface for token in _TOKENIZER.tokenize(item)]
    return sum(
        keyed_vectors[surface]
        for surface in surfaces
        if surface in keyed_vectors
    )
# Normalise a vector
def normalization(x):
    """Return *x* scaled to unit Euclidean length as a NumPy array.

    A vector whose norm is 0 has no direction; it is returned as an all-zero
    float array rather than being divided by zero, which would silently
    produce NaNs.
    """
    y = np.array(x)
    d = np.linalg.norm(y)
    if d == 0:
        return np.zeros(y.shape, dtype=float)
    return y / d
# Cosine similarity of two vectors
def cos_similarity(v1, v2):
    """Return the dot product of *v1* and *v2* divided by the product of their norms."""
    inner = np.dot(v1, v2)
    scale = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner / scale
# Attach a cluster label number to every member
def clustering_to_clusters(list_clusters, dict_n):
    """Flatten clusters into a table with one row per (cluster, member) pair.

    Each row holds the member name, then the components of its vector taken
    from *dict_n* (NumPy arrays in this script), then the index of the cluster
    within *list_clusters*. A name that belongs to several clusters gets one
    row per cluster.

    The resulting DataFrame is printed and returned.
    """
    rows = []
    for label, members in enumerate(list_clusters):
        for name in members:
            rows.append([name, *dict_n[name], label])
    frame = pd.DataFrame(rows)
    print(frame)
    return frame
# Overlap rate between clusters
def overlap_rate(cluster_list):
    """Return a square DataFrame of pairwise overlap rates between clusters.

    Entry (i, j) is 1 when the two clusters compare equal; otherwise it is
    ``2 * m / (len(i) + len(j))`` where ``m`` counts the elements of cluster
    i that also occur in cluster j. The table is printed before being
    returned.
    """
    matrix = []
    for left in cluster_list:
        row = []
        for right in cluster_list:
            if left == right:
                row.append(1)
                continue
            shared = sum(1 for item in left if item in right)
            row.append((2 * shared) / (len(left) + len(right)))
        matrix.append(row)
    table = pd.DataFrame(matrix)
    print(table)
    return table
# Centre of a group
def cluster_average(group_list, dict_n):
    """Return the mean of the vectors that *dict_n* holds for *group_list*.

    A single-member group returns that member's stored vector object itself
    (no copy). Larger groups are summed member by member and divided by the
    group size.
    """
    if len(group_list) == 1:
        return dict_n[group_list[0]]
    total = sum(dict_n[name] for name in group_list)
    return total / len(group_list)
#array = np.array([[1, 1,1,1]])
#array2 = np.array([[2, 2,2,2]])
#list_a=[]
#list_a.append(array)
#list_a.append(array2)
#print(cluster_average(list_a))
# For every word, record its highest similarity over all clusters that contain
# it (consulted by del_cluster when removing members)
dic_max = {}


def max_cosine_all(word_list, cluster_list_all, dict_n):
    """Store in the module-level ``dic_max`` the best cluster similarity per word.

    For each word in *word_list*, every cluster in *cluster_list_all* that
    contains the word is scored with the cosine similarity between the word's
    vector (from *dict_n*) and the cluster centre. A cluster consisting only
    of the word itself sets the running value to 1. Words contained in no
    cluster are recorded as 0. Each recorded value is also printed.

    Returns:
        None; results are written to ``dic_max``.
    """
    # Cluster centres do not depend on the word being scored, so each one is
    # computed at most once (on first use) instead of once per member word.
    centres = {}
    for word in word_list:
        best = 0
        for index, cluster in enumerate(cluster_list_all):
            if word not in cluster:
                continue
            if len(cluster) == 1 and word == cluster[0]:
                best = 1  # the word is its own cluster
                continue
            if index not in centres:
                centres[index] = cluster_average(cluster, dict_n)
            score = cos_similarity(dict_n[word], centres[index])
            if score > best:
                best = score
        dic_max[word] = best
        print(best)
# Remove members from clusters
def del_cluster(cluster_list, dict_n):
    """Return the clusters with each word kept only where it fits best.

    Single-member clusters are copied unchanged. In a larger cluster a word
    is kept only if its cosine similarity to the cluster centre is not 1 and
    equals the value stored for it in the module-level ``dic_max``, so
    ``max_cosine_all`` must have been run on the same clusters first.
    Clusters left without members are dropped.
    """
    new_list = []
    for cluster in cluster_list:
        if not cluster:
            continue  # an empty cluster keeps nothing and is dropped
        if len(cluster) == 1:
            new_list.append([cluster[0]])
            continue
        # The centre is the same for every member: compute it once per cluster.
        centre = cluster_average(cluster, dict_n)
        kept = []
        for word in cluster:
            cos = cos_similarity(dict_n[word], centre)
            # Exact equality is intended: dic_max holds the value produced by
            # this very computation for the word's best cluster.
            if cos != 1 and cos == dic_max[word]:
                kept.append(word)
        if kept:
            new_list.append(kept)
    return new_list
if __name__ == "__main__":
    # Script flow: read names -> vectorise -> threshold clustering ->
    # overlap table -> labelled clusters -> member removal -> labelled clusters.
    print(" CSV of item data reading...")
    df = pd.read_csv(FNAME, sep=",", header=None)  # load the input file

    # The names to cluster are in the first column.
    list_name = df.iloc[:, 0].tolist()
    print(len(list_name))

    dict_n = {}   # name -> vector
    df_vec = []   # vectors, parallel to df_name
    df_name = []  # names that could be vectorised
    for name in list_name:
        # Tokenise and vectorise once per name (this is the expensive step).
        raw_vec = word_to_vec(name)
        if isinstance(raw_vec, int):
            # word_to_vec returns the int 0 when no token is in the vocabulary.
            continue
        if NORMAL_FLG == 1:
            vec = normalization(raw_vec)  # normalised
        else:
            vec = np.array(raw_vec)  # not normalised
        df_vec.append(vec)
        df_name.append(name)
        dict_n[name] = vec

    # One candidate cluster per name: the name plus every other name whose
    # cosine similarity to it reaches the threshold SH.
    clusters_words = []
    for name in df_name:
        words = [name]
        for other in df_name:
            if other == name:
                continue
            if cos_similarity(dict_n[name], dict_n[other]) >= SH:
                words.append(other)
        clusters_words.append(sorted(words))
        print(len(words))

    # Drop duplicate clusters. dict.fromkeys keeps first-seen order (Python
    # 3.7+), so cluster label numbers are reproducible between runs; a set
    # would order them by randomised string hashes.
    clusters_words = [
        list(cluster) for cluster in dict.fromkeys(map(tuple, clusters_words))
    ]
    # NOTE(review): the cluster list was printed twice in the original; both
    # prints are kept so the console output is unchanged.
    print(clusters_words)
    print(clusters_words)
    print(len(clusters_words))

    # Split into multi-member clusters and single-member leftovers (reporting only).
    clusters_other = []
    clusters_list = []
    for cluster in clusters_words:
        if len(cluster) > 1:
            clusters_list.append(cluster)
        else:
            clusters_other.append(cluster)
    print(clusters_list)
    print(len(clusters_list))
    print(clusters_other)
    print(len(clusters_other))

    df_rate = overlap_rate(clusters_words)
    df_rate.to_csv("overlap_rate_100_" + str(SH) + ".csv", index=True, encoding="utf-8-sig")

    # Labelled clusters before member removal
    df_cluster = clustering_to_clusters(clusters_words, dict_n)
    df_cluster.to_csv(FNAME_SAVE, index=False, encoding="utf-8-sig", header=False)

    max_cosine_all(df_name, clusters_words, dict_n)
    new_list = del_cluster(clusters_words, dict_n)
    print(new_list)
    print(len(new_list))

    # Labelled clusters after member removal
    df_cluster = clustering_to_clusters(new_list, dict_n)
    df_cluster.to_csv(FNAME_SAVE2, index=False, encoding="utf-8-sig", header=False)