"""Crawl datasets for Azur Lane characters into ``CyberHarem/*`` repositories.

Characters are ordered by the value returned from ``_get_pixiv_posts``
(highest first) and every third one, starting at offset 2, is handed to
``crawl_dataset_to_huggingface``.
"""
import re
from concurrent.futures import ThreadPoolExecutor

from ditk import logging
from gchar.games.azurlane import Character
from gchar.resources.pixiv import get_pixiv_posts
from tqdm.auto import tqdm
from waifuc.utils import task_ctx

from cyberharem.dataset import crawl_dataset_to_huggingface
from cyberharem.utils import get_hf_fs


def _cht(ch: Character):
    """Build the repository short name ``<name>_<game>`` for a character.

    The first non-empty of the English, Chinese and Japanese names is used;
    every run of non-word characters or underscores collapses to a single
    ``_`` and leading/trailing underscores are stripped.
    """
    name = str(ch.enname or ch.cnname or ch.jpname)
    short_name = re.sub(r'[\W_]+', '_', name).strip('_')
    return f'{short_name}_{ch.__game_name__}'


def _get_pixiv_posts(ch: Character):
    """Return the first element of ``get_pixiv_posts(ch)``, or 0 when it is None.

    NOTE(review): presumably the first element is the total post count --
    confirm against gchar.
    """
    v = get_pixiv_posts(ch)
    return 0 if v is None else v[0]


hf_fs = get_hf_fs()
# NOTE: alternative selection that was used previously:
# all_chs = [ch for ch in Character.all(contains_extra=False) if not ch.is_extra and ch.accessible and ch.index >= 153]
all_chs = Character.all(contains_extra=False)
# Highest value first, then keep every third entry starting at offset 2
# (presumably one shard of a three-way split -- TODO confirm).
all_chs = sorted(all_chs, key=lambda x: -_get_pixiv_posts(x))[2::3]
pg = tqdm(total=len(all_chs))
crawled_ids = set()


def _crawl(char_):
    """Crawl one character into ``CyberHarem/<short name>`` unless it is skipped.

    A character is skipped when its gender does not compare equal to
    ``'female'``, when its index is already in ``crawled_ids``, or when
    ``datasets/<repo>/dataset-raw.zip`` already exists on ``hf_fs``.

    The progress bar advances exactly once per call, including when the
    crawl fails. Any exception is logged and re-raised.
    """
    try:
        repo = f'CyberHarem/{_cht(char_)}'

        is_female = char_.gender == 'female'
        if not is_female:
            logging.info(f'{char_!r} is not female, skipped!')
            return

        # Check the in-memory set first so the remote lookup is only made
        # for characters not handled in this run.
        if char_.index in crawled_ids or \
                hf_fs.exists(f'datasets/{repo}/dataset-raw.zip'):
            logging.info(f'{char_!r} already crawled, skipped.')
            return

        with task_ctx(repo):
            crawl_dataset_to_huggingface(
                char_,
                repository=repo,
                limit=200
            )
        crawled_ids.add(char_.index)
    except Exception as err:
        logging.error(repr(err))
        raise
    finally:
        # Always tick, so a failed crawl does not leave the bar short of total.
        pg.update()


if __name__ == '__main__':
    logging.try_init_root(logging.INFO)
    # The context manager waits for all submitted tasks and shuts the pool
    # down even if submission itself fails. The returned futures are not
    # inspected, so the log line emitted by _crawl is the only failure report.
    # To debug synchronously, call _crawl(ch) directly instead of submitting.
    with ThreadPoolExecutor(max_workers=1) as tp:
        for ch in all_chs:
            tp.submit(_crawl, ch)
    pg.close()