#!/usr/bin/env python3
import json

from bs4 import BeautifulSoup
from huggingface_hub import (
    HfApi,
    get_discussion_details,
    get_repo_discussions,
    hf_hub_download,
)

# Space whose discussions are scraped for prompts and generated images.
# repo_id = "stabilityai/stable-diffusion"
repo_id = "dalle-mini/dalle-mini"
# Dataset repo that stores the accumulated prompt/image pairs.
dataset_repo_id = "triple-t/dummy"
cache_dir = "/home/patrick_huggingface_co/image_cache"
file_name = "_".join(repo_id.split("/")) + ".json"

api = HfApi()
interval = 100

print("retrieve images 0...")
discussions_list = list(get_repo_discussions(repo_id=repo_id, repo_type="space"))

# Download the previously collected data so new results are appended to it.
path = hf_hub_download(
    repo_id=dataset_repo_id, filename=file_name, cache_dir=cache_dir, repo_type="dataset"
)
with open(path, "r") as f:
    prev_all_data = json.load(f)

# Process the discussions in chunks of `interval`, uploading after each chunk.
for start in range(0, 10000, interval):
    print("retrieve images 1...")
    all_data = []
    try:
        for offset, disc in enumerate(discussions_list[start:start + interval]):
            disc = get_discussion_details(
                repo_id=repo_id, repo_type="space", discussion_num=disc.num
            )
            # The first event of the discussion holds the raw markdown containing the images.
            page = BeautifulSoup(
                disc.events[0]._event["data"]["latest"]["raw"], "html.parser"
            )
            image_urls = [link.get("src") for link in page.find_all("img")]

            # Skip discussions without any images.
            if not image_urls:
                continue

            all_data.append({
                "discussion_number": start + offset,
                "data": {
                    "prompt": disc.title,
                    "images": image_urls,
                },
            })

        prev_all_data += all_data

        # Persist the updated collection locally, then push it to the dataset repo.
        with open(path, "w") as f:
            f.write(json.dumps(prev_all_data, sort_keys=True, indent=4))

        api.upload_file(
            path_or_fileobj=path,
            path_in_repo=file_name,
            repo_id=dataset_repo_id,
            repo_type="dataset",
        )
    except Exception as e:
        print(f"Screw it: {e}")