# Source code
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
import pandas as pd
import pandas.io.common
import numpy as np
import codecs
import random
import os
import getenc
import time
import re
import sys
import shutil
# Lift pandas' display limits so printed DataFrames are never truncated:
# every row is shown, and every column is shown instead of being elided
# with an ellipsis.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
from progressbar import *
# Interactive triage of the CSV files named in ``csv_list_files``.
#
# For each of the first ``review_limit`` listed paths the script shows a
# 10-row preview and asks the reviewer for an integer verdict on stdin:
#   0 = OK, 1 = NG (completely unusable), 2 = partly bad (missing/NaN data).
# Files that do not exist, whose encoding cannot be detected, or that cannot
# be parsed/displayed are recorded with flag 1 without prompting.  Every file
# with a non-zero verdict from the reviewer is copied into ``tmp_dir``.
# All verdicts are finally written to "drop_list_all.txt".
start = time.time()

# Text file holding one CSV path per line.
csv_list_files = "prep2_csv_list.csv"
# Maximum number of listed files reviewed in a single run.
review_limit = 20
# Directory receiving a copy of every file the reviewer flags as not OK.
tmp_dir = './tmp/'


def _show_preview(n, fname, encoding):
    """Print the index, the path and the first 10 rows of *fname*.

    Returns True when the file was parsed and displayed, and False when
    anything went wrong; in that case the reason is printed instead.
    Opening the file happens inside the ``try`` so that an unknown codec
    name or an unreadable file flags this one file rather than aborting
    the whole review session.
    """
    try:
        with codecs.open(fname, 'rb', encoding) as stream:
            preview = pd.read_csv(stream, delimiter=",", nrows=10)
        print(n)
        print(fname)
        print(preview)
    # The pandas exception classes are looked up in ``pandas.errors``,
    # their public location.
    except pd.errors.EmptyDataError:
        print("ERROR: {} is empty".format(fname))
    except pd.errors.AbstractMethodError:
        print("ERROR: {} is AbstractMethodError".format(fname))
    except pd.errors.DtypeWarning:
        print("ERROR: {} is DtypeWarning".format(fname))
    except pd.errors.ParserError:
        print("ERROR: {} is ParserError".format(fname))
    except pd.errors.ParserWarning:
        print("ERROR: {} is ParserWarning".format(fname))
    except Exception as e:
        # Best effort: any other failure (decode error, console encoding
        # error while printing, ...) just marks this file as unusable.
        print("%s 's error is %s" % (fname, e))
    else:
        return True
    return False


def _ask_flag():
    """Read the reviewer's verdict from stdin and return it as an int.

    Re-prompts until the input parses as an integer, so a typo is never
    silently recorded as a verdict.  EOFError from ``input`` propagates.
    """
    while True:
        answer = input()
        try:
            return int(answer)
        except ValueError:
            print("Please enter an integer (0: OK, 1: NG, 2: partly bad).")


def _backup_copy(fname):
    """Copy *fname* into ``tmp_dir`` under a flattened file name.

    "./" becomes "@" and every remaining "/" becomes "--", e.g.
    "./a/b.csv" is stored as "@a--b.csv".  A failed copy is reported but
    does not change the verdict already given for the file.
    """
    dst = fname.replace("./", "@").replace("/", "--")
    try:
        os.makedirs(tmp_dir, exist_ok=True)
        shutil.copy(fname, tmp_dir + dst)
    except OSError as e:
        print("%s 's error is %s" % (fname, e))


print("CSV list reading...")
# 'utf-8-sig' reads plain UTF-8 as well and drops a leading BOM, which would
# otherwise become part of the first path.
with codecs.open(csv_list_files, 'r', 'utf-8-sig') as f:
    csv_list = f.readlines()
len_csv = len(csv_list)
print('The number of csv list:', len_csv)

# Rows of [index, path, flag]; one row per reviewed list entry.
drop_list = []
for n in range(min(review_limit, len_csv)):
    fname = csv_list[n].strip()  # drop the newline and surrounding blanks
    flag = 1  # default: missing file, unknown encoding or unreadable CSV
    if os.path.exists(fname):
        moji_code = getenc.getEncode(fname)
        if moji_code is not None and _show_preview(n, fname, moji_code):
            flag = _ask_flag()
            if flag != 0:
                # Keep a copy of every file the reviewer rejected.
                _backup_copy(fname)
    drop_list.append([n, fname, flag])

df = pd.DataFrame(drop_list, columns=['index', 'path', 'flag'])
df.to_csv("drop_list_all.txt", index=False, encoding="utf-8-sig")