# -*- coding: utf-8 -*-
"""Tally column headers across a list of CSV files.

Reads CSV file paths from ``prep2_csv_list.csv`` (one per line), detects each
file's text encoding with the project-local ``getenc`` helper, parses it with
pandas, and counts (a) how many times each column header occurs and (b) how
many distinct directories it occurs in. Headers seen in more than two distinct
directories are written to ``times_all_626_2times_re.csv``. Unreadable or
unparseable files are skipped and counted as discarded.
"""
from gensim.models import Word2Vec  # noqa: F401  (kept from original; unused in this script)
import pandas as pd
import numpy as np  # noqa: F401  (kept from original)
import codecs
import random  # noqa: F401  (kept from original)
import os
import getenc  # project-local encoding detector -- TODO confirm getEncode() contract
import time
import re
import sys  # noqa: F401  (kept from original)

from progressbar import *  # Percentage, Bar, Timer, ETA, FileTransferSpeed, ProgressBar

# --- Patterns used by is_item_name(), compiled once at import time. ---------
# Headers consisting only of roman numerals / digits / punctuation symbols.
# NOTE: the original class contained 'ー-−', a reversed character range
# (U+30FC..U+2212) that raises re.error; all hyphen-like characters are now
# listed literally, with the one literal '-' kept at the end of the class.
_SYMBOLS_RE = re.compile('^[Ⅰ-Ⅹ0-90-9%%×,,..::\"’〜~―ー−‐○〇…〒-]+$')
# pandas auto-generated headers such as "Unnamed:3" (whitespace is stripped by
# the caller first) or "H25". The original used the character class
# '[Unnamed:|H]+', which wrongly matched any mix of those letters; a real
# alternation is what was intended.
_UNNAMED_RE = re.compile(r'^(?:Unnamed:|H)\d+$')
# Duplicate-column suffixes, e.g. 事業所数.1, 従業者数.1
_DUPED_RE = re.compile('[.]+[0-90-9]+$')
# Quantity / range expressions, e.g. 1〜4人, 65歳以上, 20歳代. The original put
# multi-character words inside a character class ('[人|歳|年度|...]'), which
# matched the individual characters (and a literal '|') instead of the words;
# a grouped alternation restores the intended meaning.
_RANGE_RE = re.compile('[0-90-9]+(?:人|歳|年度|年|月|時|歳代|の男性|の女性)+(以上)?$')


def is_item_name(item):
    """Return True when *item* looks like a meaningful column header.

    Rejects headers that are pure symbols/numbers, pandas "Unnamed:N" style
    placeholders, duplicate-column suffixes (名前.1) and numeric range
    expressions (1〜4人 / 65歳以上). Non-string input returns False.
    """
    try:
        if _SYMBOLS_RE.search(item):   # symbols / digits only
            return False
        if _UNNAMED_RE.search(item):   # Unnamed:N / HNN
            return False
        if _DUPED_RE.search(item):     # 事業所数.1 style duplicates
            return False
        if _RANGE_RE.search(item):     # 1〜4人 style ranges
            return False
        return True
    except TypeError:
        # item was not a string (e.g. a numeric column label)
        return False


def main():
    start = time.time()

    print("CSV list reading...")
    csv_list_files = "prep2_csv_list.csv"  # ken_list.txt
    with codecs.open(csv_list_files, 'r', 'utf8') as f:
        csv_list = f.readlines()
    len_csv = len(csv_list)
    print('The number of csv list:', len_csv)

    widgets = ['Progress of making dataset: ', Percentage(), ' ', Bar('#'), ' ',
               Timer(), ' ', ETA(), ' ', FileTransferSpeed()]
    pbar = ProgressBar(widgets=widgets, maxval=len_csv).start()

    k = 0                    # number of CSV files successfully processed
    j = 0                    # number of CSV files discarded
    exclusion_list = []      # paths of discarded files
    name_counts = {}         # header -> occurrence count ('dict' shadowed the builtin)
    dict_path = {}           # header -> list of distinct directory names
    # Extract the 3rd path component: 'a/b/<captured>/...'
    pat = re.compile('[^/]+/[^/]+/([^/]+)/')

    for n in range(len_csv):
        fname = csv_list[n].strip()  # drop surrounding whitespace/newline
        if not os.path.exists(fname):
            j += 1
            exclusion_list.append(fname)
            pbar.update(n + 1)
            continue
        moji_code = getenc.getEncode(fname)
        if moji_code is None:
            j += 1
            exclusion_list.append(fname)
            pbar.update(n + 1)
            continue
        with codecs.open(fname, 'rb', moji_code) as f:
            try:
                df = pd.read_csv(f, delimiter=",")
                name_list = list(df.columns.values)
                res = pat.match(fname)
                path = res.group(1)
                # Tally each acceptable header and the directory it came from.
                for name in name_list:
                    name = ''.join(name.split())  # remove all whitespace
                    if not is_item_name(name) or name == '':
                        continue
                    name_counts[name] = name_counts.get(name, 0) + 1
                    if name in dict_path:
                        if path not in dict_path[name]:
                            dict_path[name].append(path)
                    else:
                        dict_path[name] = [path]
                k += 1
                if k % 100 == 0:
                    print(k)
            # The original caught these via pandas.io.common.*; they are
            # defined in pandas.errors (the io.common aliases were removed).
            except pd.errors.EmptyDataError:
                j += 1
                exclusion_list.append(fname)
                print("ERROR: {} is empty".format(fname))
            except pd.errors.AbstractMethodError:
                j += 1
                print("ERROR: {} is AbstractMethodError".format(fname))
                exclusion_list.append(fname)
            except pd.errors.DtypeWarning:
                j += 1
                print("ERROR: {} is DtypeWarning".format(fname))
                exclusion_list.append(fname)
            except pd.errors.ParserError:
                j += 1
                print("ERROR: {} is ParserError".format(fname))
                exclusion_list.append(fname)
            except pd.errors.ParserWarning:
                j += 1
                print("ERROR: {} is ParserWarning".format(fname))
                exclusion_list.append(fname)
            except Exception as e:
                # Last-resort catch: log and discard the file, keep going.
                j += 1
                print("%s 's error is %s" % (format(fname), format(e)))
                exclusion_list.append(fname)
        pbar.update(n + 1)  # original created the bar but never advanced it
    pbar.finish()
    print("done1")

    # Sort headers by descending occurrence count.
    dic2 = sorted(name_counts.items(), key=lambda i: -i[1])

    re_list = []
    itemname_list = []
    k = 0
    print("done2")
    for item_name, times in dic2:
        paths = dict_path[item_name]
        # Keep only headers seen in more than 2 distinct directories.
        if len(paths) > 2:
            k += 1
            re_list.append([item_name, times, len(paths), paths])
            itemname_list.append(item_name)
    print("done3")

    df2 = pd.DataFrame(re_list, columns=['item name', 'times', 'path_times', 'path'])
    df2.to_csv("times_all_626_2times_re.csv", index=False, encoding="utf-8-sig")

    print("done")
    print('The number of discarded csv:', j)
    process_time = format(time.time() - start)
    print('Execution time is %s s' % process_time)


if __name__ == "__main__":
    main()