import os
import json
import random
from time import sleep


def create_dir(dir_path):
    """Create *dir_path* (including parents) if it does not already exist."""
    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(dir_path, exist_ok=True)


def skip_captcha():
    """Announce that link crawling is in progress.

    NOTE(review): despite the name, this does not interact with any
    captcha from what is visible here -- it only prints a status
    message ("crawling links..." in Chinese).
    """
    print('爬取链接中...')


def sleeps(a, b):
    """Sleep for a random duration drawn uniformly from [a, b) seconds.

    Invalid bounds (a < 0 or b <= a) are reported with a printed
    message rather than an exception, matching the original contract.
    """
    # a == 0 is a valid lower bound; the original `a > 0` rejected it
    # by mistake.
    if a >= 0 and b > a:
        sleep((b - a) * random.random() + a)
    else:
        print('Invalid params!')


def save_to_file(data_list, file_path='./output/items.jsonl', ensure_ascii=None):
    """Write each item of *data_list* as one JSON line to *file_path*.

    The file is overwritten. *ensure_ascii* controls JSON escaping of
    non-ASCII characters; the default (None) preserves the historical
    behaviour: raw UTF-8 for the default items file, ASCII-escaped
    output for any other path.
    """
    if ensure_ascii is None:
        # Historical quirk kept for backward compatibility: escaping
        # was tied to whether the caller passed a non-default path.
        ensure_ascii = file_path != './output/items.jsonl'
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for data in data_list:
            json.dump(data, jsonl_file, ensure_ascii=ensure_ascii)
            jsonl_file.write('\n')


def rm_duplicates_by_key(jsonl_path='./output/items.jsonl',
                         key_to_check='id',
                         failist_path='./output/duplicate_id.txt'):
    """Rewrite *jsonl_path* keeping only the first record for each key.

    Records are duplicates when they share the same value under
    *key_to_check*. Each duplicated key value is written once, as one
    JSON value per line, to *failist_path*.
    """
    print('Removing duplicates...')
    if not os.path.exists(jsonl_path):
        print('jsonl not exist')
        return

    seen_keys = set()
    unique_data = []
    # Keep duplicates in a list (with a companion set for O(1) lookup)
    # so the failist has deterministic first-seen order; the original
    # iterated a set, whose order varies between runs.
    duplicate_keys = []
    duplicate_seen = set()
    with open(jsonl_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            # The value under key_to_check identifies a record; records
            # missing the key all map to None and deduplicate together.
            key_value = data.get(key_to_check)
            if key_value in seen_keys:
                if key_value not in duplicate_seen:
                    duplicate_seen.add(key_value)
                    duplicate_keys.append(key_value)
            else:
                seen_keys.add(key_value)
                unique_data.append(data)

    save_to_file(unique_data, file_path=jsonl_path)
    save_to_file(duplicate_keys, file_path=failist_path)
    print('Duplicates removed!')