import json

import pandas as pd

def print_csv(src_csv, tgt_csv, chunk_size=500000):
    """Convert a patent CSV file into a JSON-lines file.

    The CSV is read in chunks of ``chunk_size`` rows.  For every row whose
    ``classification`` and ``abs`` columns are both present, one line of the
    form ``{"id": <classification>, "text": <abs>}`` is written to the
    target file (UTF-8, non-ASCII characters kept as-is).  Rows with a
    missing value in either column are skipped.

    Args:
        src_csv: Path of the source CSV; it must contain the columns
            ``classification`` and ``abs``.
        tgt_csv: Path of the output file; it is overwritten if it exists.
        chunk_size: Number of CSV rows loaded into memory per chunk.
    """
    # Open the reader first so that a missing/unreadable source does not
    # truncate an existing target file.
    reader = pd.read_csv(src_csv, iterator=True)
    sum_line = 0  # total number of CSV rows seen so far (skipped ones included)
    try:
        with open(tgt_csv, 'w', encoding='utf-8') as g:
            while True:
                # Progress report, emitted whenever the running total sits on
                # a chunk boundary (including the initial 0).
                if sum_line % chunk_size == 0:
                    print("第%d行" % sum_line)
                try:
                    lines = reader.get_chunk(chunk_size)
                except StopIteration:
                    print("Iteration is stopped")
                    break
                # 'classification' holds the IPC classification code and
                # 'abs' the patent abstract; walk both columns in lockstep
                # instead of computing row labels by hand.
                for num, text in zip(lines['classification'], lines['abs']):
                    sum_line += 1
                    if pd.isna(num) or pd.isna(text):
                        continue  # skip rows with an empty field
                    dic = {'id': num, 'text': text}
                    g.write(json.dumps(dic, ensure_ascii=False) + '\n')
    finally:
        # Release the handle pandas holds on the source file.
        reader.close()

if __name__ == '__main__':
    # Script entry point: convert 'patent.csv' into the JSON-lines file
    # 'new_patent.json', both resolved relative to the working directory.
    print_csv(src_csv='patent.csv', tgt_csv='new_patent.json')

# 推荐内容 ("Recommended content" -- leftover web-page text, not part of the script)