|
import os |
|
import json |
|
import random |
|
from time import sleep |
|
|
|
|
|
def create_dir(dir_path):
    """Create *dir_path* (and any missing parents) if it does not exist.

    Args:
        dir_path: Directory path to create.

    Uses ``exist_ok=True`` instead of the exists()/makedirs() pair, which
    is racy: another process could create the directory between the check
    and the call, raising FileExistsError.
    """
    os.makedirs(dir_path, exist_ok=True)
|
|
|
|
|
def skip_captcha():
    """Captcha-skip placeholder: only logs crawling progress to stdout."""
    progress_message = '爬取链接中...'
    print(progress_message)
|
|
|
|
|
def sleeps(a, b):
    """Sleep for a random duration drawn uniformly from [a, b) seconds.

    Args:
        a: Lower bound in seconds; must be >= 0.
        b: Upper bound in seconds; must be >= a.

    Invalid bounds print a warning instead of raising (best-effort helper
    used for crawl throttling).
    """
    # `a >= 0` / `b >= a` (not the original `a > 0` / `b > a`): a zero
    # lower bound and a == b are perfectly valid sleep intervals.
    if a >= 0 and b >= a:
        sleep((b - a) * random.random() + a)
    else:
        # Fixed typo in the original message ('parms' -> 'params').
        print('Invalid params!')
|
|
|
|
|
def save_to_file(data_list, file_path='./output/items.jsonl'):
    """Write each item of *data_list* to *file_path* as one JSON line.

    The target file is overwritten. The parent directory is created if it
    does not exist (the original crashed when './output' was missing).

    Args:
        data_list: Iterable of JSON-serializable items.
        file_path: Destination JSONL file path.
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            # Always keep non-ASCII text (e.g. Chinese) readable. The
            # original set ensure_ascii by comparing file_path against a
            # hard-coded default path, which escaped non-ASCII for every
            # other destination — an accidental coupling, not a feature.
            json.dump(data, jsonl_file, ensure_ascii=False)
            jsonl_file.write('\n')
|
|
|
|
|
def rm_duplicates_by_key(jsonl_path='./output/items.jsonl', key_to_check='id', failist_path='./output/duplicate_id.txt'):
    """Rewrite *jsonl_path* in place, keeping only the first record per key.

    Records are compared by the value of *key_to_check* (missing key
    counts as ``None``). The duplicated key values are written to
    *failist_path*, one JSON value per line, in first-seen order — the
    original dumped a ``set``, whose iteration order is nondeterministic.

    Args:
        jsonl_path: JSONL file to deduplicate in place.
        key_to_check: Record key whose value identifies duplicates.
        failist_path: Destination for the duplicated key values.
    """
    print('Removing duplicates...')
    if not os.path.exists(jsonl_path):
        print('jsonl not exist')
        return

    seen_keys = set()
    unique_data = []
    duplicate_keys = []   # first-seen order for deterministic output
    reported_dups = set() # guards duplicate_keys against repeats

    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            # Tolerate blank/whitespace-only lines instead of crashing
            # in json.loads (the original raised JSONDecodeError).
            if not line.strip():
                continue
            data = json.loads(line)
            key_value = data.get(key_to_check)
            if key_value in seen_keys:
                if key_value not in reported_dups:
                    reported_dups.add(key_value)
                    duplicate_keys.append(key_value)
            else:
                seen_keys.add(key_value)
                unique_data.append(data)

    save_to_file(unique_data, file_path=jsonl_path)
    save_to_file(duplicate_keys, file_path=failist_path)
    print('Duplicates removed!')
|
|