@@ -48,32 +48,34 @@ def read_file(filename):
 
 
 def add_paper(file_path):
     papers.create_index("corpusid", unique=True)
     # Read the paper file and store it in the database
-    data_list = read_file(file_path)
+    # data_list = read_file(file_path)
     # Bulk-insert the data
     inserted_ids = 0
 
     try:
         sub_list = []
-        for line in data_list:
-            sub_list.append(line)
-        if len(sub_list) == 2000:
+        with open(file_path, 'r') as f:
+            for line in f:
+                line_dict = json.loads(line)
+                sub_list.append(line_dict)
+                if len(sub_list) == 2000:
+                    result = papers.insert_many(sub_list, ordered=False)
+                    inserted_ids += len(result.inserted_ids)
+                    sub_list = []
+
+        if sub_list:
             result = papers.insert_many(sub_list, ordered=False)
             inserted_ids += len(result.inserted_ids)
             sub_list = []
-        if sub_list:
-            result = papers.insert_many(sub_list, ordered=False)
-            inserted_ids += len(result.inserted_ids)
-            sub_list = []
-
-        print('-------process', inserted_ids, '/', len(data_list))
+        print('-------process', inserted_ids, '/', '7318795')
     except pymongo.errors.BulkWriteError as e:
         inserted_ids = e.details['nInserted']
     finally:
         # Print the insertion summary
         print("Total to insert: {0}, inserted: {1}, existing: {2}".format(
-            len(data_list), inserted_ids, papers.count_documents({})))
+            7318795, inserted_ids, papers.count_documents({})))
 
 
 def crawl_data():