import argparse
import json
import os
from pathlib import Path

import gradio as gr
from hbutils.system import TemporaryDirectory
from huggingface_hub import hf_hub_url, hf_hub_download
from waifuc.action import HeadCountAction, AlignMinSizeAction, CCIPAction, ThreeStageSplitAction, ModeConvertAction, ClassFilterAction, PersonSplitAction, TaggingAction, RatingFilterAction, NoMonochromeAction, RandomFilenameAction, FirstNSelectAction, FilterSimilarAction, FileExtAction
from waifuc.export import SaveExporter, TextualInversionExporter
from waifuc.source import DanbooruSource, PixivSearchSource, ZerochanSource, LocalSource, GcharAutoSource

from cyberharem.dataset.crawler import crawl_dataset_to_huggingface
from cyberharem.utils import get_hf_client, get_hf_fs
from cyberharem.utils import download_file as cyber_download_file

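# Note: this script reads the Hugging Face token from the HF_TOKEN environment
# variable (see the hf_hub_download calls below) and forces ONNX inference onto
# the CPU by setting ONNX_MODE when is_cpu is passed as True.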

def start_func(chars, is_cpu, udghs, game_index=None):
    """Crawl character datasets and upload them to Hugging Face.

    When ``udghs`` is False, ``chars`` is treated as a comma-separated list of
    character names. When ``udghs`` is True, the game directories in the
    ``deepghs/game_characters`` dataset are enumerated and every character
    listed there is crawled (optionally only the game at ``game_index``).
    """
    if not udghs:
        if is_cpu:
            os.environ['ONNX_MODE'] = 'CPUExecutionProvider'
        char_list = chars.split(',')
        for ch in char_list:
            crawl_dataset_to_huggingface(ch)
            print(ch + " done")
        return str(chars) + " upload complete"
    else:
        dgrepo = 'deepghs/game_characters'
        if is_cpu:
            os.environ['ONNX_MODE'] = 'CPUExecutionProvider'
        with TemporaryDirectory() as jsondir:
            print("Downloading jsons..")
            hf_fs = get_hf_fs()
            # One pixiv_characters.json per game directory in the dataset repo.
            _dgdatas = list(hf_fs.glob(f'datasets/{dgrepo}/*/pixiv_characters.json'))
            if game_index:
                # Crawl only the game selected by its 1-based index.
                name = _dgdatas[game_index - 1]
                game_dir = os.path.basename(os.path.dirname(name))
                os.makedirs(game_dir, exist_ok=True)
                js = hf_hub_download(
                    repo_id=dgrepo, repo_type='dataset',
                    filename=Path(os.path.join(game_dir, 'pixiv_characters.json')).as_posix(),
                    token=os.environ['HF_TOKEN'],
                )
                with open(js, 'r', encoding='utf-8') as f:
                    jt = json.load(f)
                    chs = jt['characters']
                    for jp in chs:
                        jp = jp['jpname']
                        print(jp, 'start...')
                        crawl_dataset_to_huggingface(jp)
                        print(jp + " done")
            else:
                # No index given: crawl every game in the dataset repository.
                for name in _dgdatas:
                    game_dir = os.path.basename(os.path.dirname(name))
                    os.makedirs(game_dir, exist_ok=True)
                    js = hf_hub_download(
                        repo_id=dgrepo, repo_type='dataset',
                        filename=Path(os.path.join(game_dir, 'pixiv_characters.json')).as_posix(),
                        token=os.environ['HF_TOKEN'],
                    )
                    with open(js, 'r', encoding='utf-8') as f:
                        jt = json.load(f)
                        chs = jt['characters']
                        for jp in chs:
                            jp = jp['jpname']
                            print(jp, 'start...')
                            # Record which character is currently being crawled.
                            with open(os.path.join(game_dir, 'log.txt'), 'w') as log_f:
                                print(f'{jp} is in crawl.', file=log_f)
                            crawl_dataset_to_huggingface(jp)
                            print(jp + " done")
            return "done"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--char', type=str, default=None,
                        help='comma-separated list of character names')
    parser.add_argument('--index', type=int, default=None,
                        help='1-based index of a single game in deepghs/game_characters')
    args = parser.parse_args()
    # Always run on CPU; if no character list is given, fall back to crawling
    # the deepghs/game_characters game list instead.
    start_func(args.char, True, not args.char, args.index)
    print("all done")


if __name__ == "__main__":
    main()
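
# Illustrative invocations (the script filename and character names below are
# placeholders, not values taken from this repository):
#
#   # Crawl two explicitly named characters and upload their datasets:
#   python crawl.py --char "character_a,character_b"
#
#   # Crawl every character of the 3rd game listed in deepghs/game_characters:
#   python crawl.py --index 3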