#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 2020-01-17: also write the source path of every collected item.
"""Collect (cell value, column name, source path) rows from a list of CSV files.

The script reads a list of CSV paths, loads each file with pandas, keeps the
string cells and column names that pass the regular-expression filters below,
and writes three files: the collected rows, the distinct column names, and
the paths of the files that were skipped or failed.
"""
from gensim.models import Word2Vec
import pandas as pd
import pandas.io.common
import numpy as np
import codecs
import random
import os
import getenc
import time
import re
import sys
from progressbar import *

start = time.time()

# NOTE(review): the patterns below are reproduced verbatim from the previous
# revision.  Several character classes contain an unescaped '-' between two
# literals (e.g. right after 'ー', or '︰-@'), which `re` parses as a range --
# confirm that every class compiles and covers the intended characters.

# Symbols, Latin letters, digits and similar characters only.
_PAT_SYMBOLS_ALNUM = '^[A-ZA-Za-za-zⅠ-Ⅹ0-90-9%.・・.::\"’′″°〜~―ー-−‐_//○〇*()-×●★△&…〒〃㎡  ]+$'
# Digits, circled numbers, dashes and punctuation only.
_PAT_DIGITS_PUNCT = '^[0-90-9①-⑳-―ー-−‐()(),,.△><%  ]+$'
# Contains a URL.
_PAT_URL = "(https?|ftp|http)(:\/\/[-_.!~*\'()a-zA-Z0-9;\/?:\@&=+\$,%#]+)"
# Contains one of the "special characters 2" (labelled "yen mark" originally).
_PAT_SPECIAL = '[︰-@~_「」[\a-zA-Z//-―-−‐←→↑┌↓?\\\]'

# Patterns that disqualify a cell value; tried in this order.
_TEXT_REJECT_PATTERNS = (
    _PAT_SYMBOLS_ALNUM,
    _PAT_DIGITS_PUNCT,
    _PAT_URL,
    "[\s\S]{30}",  # 30 characters or more
    _PAT_SPECIAL,
)

# Patterns that disqualify a column name; tried in this order.
_ITEM_NAME_REJECT_PATTERNS = (
    _PAT_SYMBOLS_ALNUM,
    _PAT_DIGITS_PUNCT,
    '^[0-90-9年月日円通平成度第回銭末日現在№(),.]+$',  # special characters 1
    _PAT_SPECIAL,
    '[a-zA-Z:{}]',  # special characters 3: contains Latin letters
    '([0-90-9]{2,4})(-|ー|−)([0-90-9]{2,4})',  # special characters 4: street-number-like
    '^[0-90-9①-⑳]',  # special characters 5: starts with a digit / circled number
    '^[ぁ-ゟ]+$',  # hiragana only (added 2019-12-17)
    "[\s\S]{15}",  # 15 characters or more
)

# Characters deleted from values and names before they are stored.
# The previous class listed its members separated by '|', wrote the vertical
# tab as a backslash-v followed by 't', and embedded a raw line-separator
# character, so it also deleted literal '|' and 't'.  '|' is kept here so the
# output stays compatible; 't' is dropped because anything containing ASCII
# letters is already rejected by is_text / is_item_name; every whitespace
# character is now covered by the explicit \s class.
_NOISE_CHARS = r'[\s"|]'
# Numbering and dashes at the end of a column name (at most 10 characters).
_TRAILING_NUMBERING = '[0-90-9①-⑳.-―ー-−‐]{0,10}$'

# Upper bound on how many entries of the CSV list are processed.
MAX_CSV_FILES = 29995


def _matches_any(patterns, item):
    """Return True as soon as one of *patterns* is found in *item*."""
    return any(re.search(pattern, item) for pattern in patterns)


def is_text(item):
    """Return True when *item* passes every cell-value filter.

    Returns False when any pattern in ``_TEXT_REJECT_PATTERNS`` matches, or
    when *item* is not something ``re.search`` accepts (``TypeError``).
    """
    try:
        return not _matches_any(_TEXT_REJECT_PATTERNS, item)
    except TypeError:
        return False


def is_item_name(item):
    """Return True when *item* passes every column-name filter.

    Returns False when any pattern in ``_ITEM_NAME_REJECT_PATTERNS`` matches,
    or when *item* is not something ``re.search`` accepts (``TypeError``).
    """
    try:
        return not _matches_any(_ITEM_NAME_REJECT_PATTERNS, item)
    except TypeError:
        return False


def _clean_cell(text):
    """Delete whitespace, double quotes and '|' from *text*."""
    text = re.sub(_NOISE_CHARS, '', text)
    # NOTE(review): as written this replaces ',' with ',' (a no-op);
    # presumably it was meant to normalise a full-width comma -- confirm.
    return re.sub(',', ',', text)


def _clean_column_name(name):
    """Clean *name* like a cell value, then strip trailing numbering."""
    return re.sub(_TRAILING_NUMBERING, '', _clean_cell(name))


def _iter_pairs(frame):
    """Yield ``(value, column_name)`` for every cell of *frame* worth keeping.

    A column is considered only if its raw name passes ``is_item_name`` and
    its cleaned name is non-empty.  The name is validated and cleaned once per
    column; previously the loop variable was overwritten inside the value
    loop, so later values were checked against the already-cleaned name.
    """
    for col in frame.columns:
        if not is_item_name(col):
            continue
        name = _clean_column_name(col)
        if not name:
            continue
        for data in frame[col]:
            # Numbers and missing values are skipped; any other non-string
            # was already rejected by is_text through its TypeError branch.
            if not isinstance(data, str) or not is_text(data):
                continue
            value = _clean_cell(data)
            if value:
                yield value, name


print("CSV list reading...")
csv_list_files = "prep2_csv_list.csv"
with codecs.open(csv_list_files, 'r', 'utf8') as f:
    csv_list = f.readlines()

# Never index past the end of the list when it is shorter than the cap.
len_csv = min(MAX_CSV_FILES, len(csv_list))
print('The number of csv list:', len_csv)

k = 0                # files that pandas managed to parse
item_name = []       # distinct cleaned column names, in first-seen order
exclusion_list = []  # paths that were missing, undecodable or failed
re_list = []         # collected rows: [value, column name, path]

# Set mirrors of the lists above for O(1) duplicate checks.  The previous
# check compared [data, col] against three-element rows and never matched,
# so every duplicate was stored again.
seen_pairs = set()
seen_names = set()

WIDGETS = ['Progress of making dataset: ', Percentage(), ' ', Bar('#'), ' ',
           Timer(), ' ', ETA(), ' ', FileTransferSpeed()]
pbar = ProgressBar(widgets=WIDGETS, maxval=len_csv).start()

for n, line in enumerate(csv_list[:len_csv]):
    fname = line.strip()
    moji_code = getenc.getEncode(fname) if os.path.exists(fname) else None
    if moji_code is None:
        # Missing file, or getenc could not determine an encoding.
        exclusion_list.append(fname)
    else:
        with codecs.open(fname, 'rb', moji_code) as f:
            try:
                df = pd.read_csv(f, delimiter=",")
                k = k + 1
                for data, col in _iter_pairs(df):
                    if (data, col) in seen_pairs:
                        continue
                    seen_pairs.add((data, col))
                    re_list.append([data, col, fname])
                    if col not in seen_names:
                        seen_names.add(col)
                        item_name.append(col)
            # The exception classes are taken from pandas.errors, their
            # public location, instead of pandas.io.common.
            except pd.errors.EmptyDataError:
                exclusion_list.append(fname)
                print("ERROR: {} is empty".format(fname))
            except pd.errors.AbstractMethodError:
                print("ERROR: {} is AbstractMethodError".format(fname))
                exclusion_list.append(fname)
            except pd.errors.DtypeWarning:
                print("ERROR: {} is DtypeWarning".format(fname))
                exclusion_list.append(fname)
            except pd.errors.ParserError:
                print("ERROR: {} is ParserError".format(fname))
                exclusion_list.append(fname)
            except pd.errors.ParserWarning:
                print("ERROR: {} is ParserWarning".format(fname))
                exclusion_list.append(fname)
            except Exception as e:
                # Best effort: report any other failure and move on to the
                # next file.  Rows collected before the failure are kept.
                print("%s 's error is %s" % (format(fname), format(e)))
                exclusion_list.append(fname)
    pbar.update(n)
pbar.finish()

# NOTE(review): rows are [cell value, column name, path], so the 'item'
# column holds the cell value and 'item_data' the column name; the headers
# look swapped but are left unchanged so existing consumers keep working.
df2 = pd.DataFrame(re_list, columns=['item', 'item_data', 'path'])
df2.to_csv("pre2_csv_items_all_2020117.csv", index=False, encoding="utf-8-sig")

item_name_list = pd.DataFrame(item_name, columns=['item_name'])
item_name_list.to_csv("items_name_all_2020117.csv", index=False, encoding="utf-8-sig")

df3 = pd.DataFrame(exclusion_list, columns=['path'])
df3.to_csv("exclusion_list_all_2020117.txt", index=False, encoding="utf-8-sig")

print("done")
print('The number of processed csv :', k)
print('The number of discarded csv:', len_csv - k)
process_time = format(time.time() - start)
print('Execution time is %s s' % process_time)