import os
import json
import random
from time import sleep


def create_dir(dir_path):
    """Create *dir_path* (including parents) if it does not already exist."""
    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(dir_path, exist_ok=True)


def skip_captcha():
    """Announce that link crawling is in progress.

    NOTE(review): despite the name, this does not interact with any
    captcha from what is visible here -- it only prints a status
    message ("crawling links..." in Chinese).
    """
    print('爬取链接中...')


def sleeps(a, b):
    """Sleep for a random duration drawn uniformly from [a, b) seconds.

    Invalid bounds (a < 0 or b <= a) are reported with a printed
    message rather than an exception, matching the original contract.
    """
    # a == 0 is a valid lower bound; the original `a > 0` rejected it
    # by mistake.
    if a >= 0 and b > a:
        sleep((b - a) * random.random() + a)
    else:
        print('Invalid params!')


def save_to_file(data_list, file_path='./output/items.jsonl', ensure_ascii=None):
    """Write each item of *data_list* as one JSON line to *file_path*.

    The file is overwritten. *ensure_ascii* controls JSON escaping of
    non-ASCII characters; the default (None) preserves the historical
    behaviour: raw UTF-8 for the default items file, ASCII-escaped
    output for any other path.
    """
    if ensure_ascii is None:
        # Historical quirk kept for backward compatibility: escaping
        # was tied to whether the caller passed a non-default path.
        ensure_ascii = file_path != './output/items.jsonl'
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            json.dump(data, jsonl_file, ensure_ascii=ensure_ascii)
            jsonl_file.write('\n')


def rm_duplicates_by_key(jsonl_path='./output/items.jsonl',
                         key_to_check='id',
                         failist_path='./output/duplicate_id.txt'):
    """Rewrite *jsonl_path* keeping only the first record for each key.

    Records are duplicates when they share the same value under
    *key_to_check*. Each duplicated key value is written once, as one
    JSON value per line, to *failist_path*.
    """
    print('Removing duplicates...')
    if not os.path.exists(jsonl_path):
        print('jsonl not exist')
        return

    seen_keys = set()
    unique_data = []
    # Keep duplicates in a list (with a companion set for O(1) lookup)
    # so the failist has deterministic first-seen order; the original
    # iterated a set, whose order varies between runs.
    duplicate_keys = []
    duplicate_seen = set()
    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            # The value under key_to_check identifies a record; records
            # missing the key all map to None and deduplicate together.
            key_value = data.get(key_to_check)
            if key_value in seen_keys:
                if key_value not in duplicate_seen:
                    duplicate_seen.add(key_value)
                    duplicate_keys.append(key_value)
            else:
                seen_keys.add(key_value)
                unique_data.append(data)

    save_to_file(unique_data, file_path=jsonl_path)
    save_to_file(duplicate_keys, file_path=failist_path)
    print('Duplicates removed!')