#!/usr/bin/env python3
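"""Scrape image URLs from the discussion threads of a Hugging Face Space and
archive them, together with each discussion's title (the prompt), as a JSON
file in a dataset repo on the Hub."""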
import json

import huggingface_hub
from bs4 import BeautifulSoup
from huggingface_hub import HfApi, get_repo_discussions, hf_hub_download
# Space whose discussions we scrape; an earlier run targeted
# "stabilityai/stable-diffusion" instead.
repo_id = "dalle-mini/dalle-mini"
dataset_repo_id = "triple-t/dummy"  # dataset repo that archives the scraped data
cache_dir = "/home/patrick_huggingface_co/image_cache"  # local download cache
file_name = "_".join(repo_id.split("/")) + ".json"  # e.g. "dalle-mini_dalle-mini.json"
api = HfApi()
interval = 100  # number of discussions processed per batch

print("Fetching the list of discussions...")
discussions_list = list(get_repo_discussions(repo_id=repo_id, repo_type="space"))

# Download the previously scraped data so that new results can be appended.
path = hf_hub_download(repo_id=dataset_repo_id, filename=file_name, cache_dir=cache_dir, repo_type="dataset")
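# Note: hf_hub_download raises if file_name does not yet exist in the dataset
# repo. A minimal first-run fallback could look like the following (a sketch,
# not part of the original script; assumes write access to cache_dir):
#
#     import os
#     from huggingface_hub.utils import EntryNotFoundError
#     try:
#         path = hf_hub_download(repo_id=dataset_repo_id, filename=file_name, cache_dir=cache_dir, repo_type="dataset")
#     except EntryNotFoundError:
#         path = os.path.join(cache_dir, file_name)
#         with open(path, "w") as f:
#             json.dump([], f)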
with open(path, "r") as f:
    prev_all_data = json.load(f)
# Process the discussions in batches so that each batch's results are
# persisted and uploaded even if a later batch fails.
for start in range(0, len(discussions_list), interval):
    print(f"Processing discussions {start} to {start + interval}...")
    all_data = []
    try:
        for num, disc in enumerate(discussions_list[start: start + interval], start=start):
            disc = huggingface_hub.get_discussion_details(repo_id=repo_id, repo_type="space", discussion_num=disc.num)
            # The raw markdown of the opening post is only reachable through a
            # private attribute; parse it for embedded <img> tags.
            page = BeautifulSoup(disc.events[0]._event["data"]["latest"]["raw"], "html.parser")
            image_urls = [link.get("src") for link in page.find_all("img")]
            # Skip discussions whose opening post embeds no images.
            if not image_urls:
                continue
            all_data.append({
                "discussion_number": num,
                "data": {
                    "prompt": disc.title,
                    "images": image_urls,
                },
            })
        # Append the batch to the cached copy and rewrite the local file.
        prev_all_data += all_data
        with open(path, "w") as f:
            f.write(json.dumps(prev_all_data, sort_keys=True, indent=4))
        # Push the updated file back to the dataset repo.
        api.upload_file(
            path_or_fileobj=path,
            path_in_repo=file_name,
            repo_id=dataset_repo_id,
            repo_type="dataset",
        )
    except Exception as e:
        # Swallow errors so the remaining batches are still processed.
        print(f"Screw it: {e}")
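# The resulting JSON file is a list of records shaped like the following
# (illustrative values, not copied from a real run):
#
#     [
#         {
#             "discussion_number": 0,
#             "data": {
#                 "prompt": "an astronaut riding a horse",
#                 "images": ["https://..."],
#             },
#         },
#     ]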