# MiVOLO / utils.py
import json
import os
import random
from time import sleep


def create_dir(dir_path):
    """Create the directory if it does not already exist."""
    os.makedirs(dir_path, exist_ok=True)


def skip_captcha():
    # Note: despite its name, this currently only logs crawl progress.
    print('Crawling links...')


def sleeps(a, b):
    """Sleep for a random number of seconds drawn uniformly from [a, b)."""
    if a > 0 and b > a:
        sleep((b - a) * random.random() + a)
    else:
        print('Invalid params!')


def save_to_file(data_list, file_path='./output/items.jsonl'):
    """Write each item in data_list as one JSON object per line (JSONL)."""
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        create_dir(parent_dir)  # ensure the output directory exists
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            # The file is UTF-8, so non-ASCII text can be written as-is.
            json.dump(data, jsonl_file, ensure_ascii=False)
            jsonl_file.write('\n')


def rm_duplicates_by_key(jsonl_path='./output/items.jsonl', key_to_check='id',
                         failist_path='./output/duplicate_id.txt'):
    """Deduplicate a JSONL file in place, keyed on key_to_check.

    Duplicate key values are written to failist_path.
    """
    print('Removing duplicates...')
    if not os.path.exists(jsonl_path):
        print('jsonl file does not exist')
        return
    seen_keys = set()
    unique_data = []
    duplicates = set()
    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            data = json.loads(line)
            # Use the value of the given key as the deduplication identifier.
            key_value = data.get(key_to_check)
            # If that identifier has been seen before, the record is a duplicate.
            if key_value in seen_keys:
                duplicates.add(key_value)
            else:
                seen_keys.add(key_value)
                unique_data.append(data)
    save_to_file(unique_data, file_path=jsonl_path)
    save_to_file(duplicates, file_path=failist_path)
    print('Duplicates removed!')
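

# A minimal usage sketch: write a few sample records, pause briefly, then
# deduplicate the file by 'id'. The sample records below are illustrative
# assumptions; the './output' paths match the defaults used above.
if __name__ == '__main__':
    sample = [
        {'id': 1, 'title': 'first'},
        {'id': 2, 'title': 'second'},
        {'id': 1, 'title': 'first (duplicate)'},
    ]
    save_to_file(sample)
    sleeps(0.1, 0.3)  # randomized pause, as a crawler would use between requests
    rm_duplicates_by_key(jsonl_path='./output/items.jsonl', key_to_check='id')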