patrickvonplaten committed on
Commit
e466e6b
1 Parent(s): 86356d5
Files changed (1)
  1. upload.py +42 -38
upload.py CHANGED
@@ -6,49 +6,53 @@ from huggingface_hub import get_repo_discussions
 from bs4 import BeautifulSoup
 import json
 
-# repo_id = "stabilityai/stable-diffusion"
-repo_id = "huggingface-projects/diffuse-the-rest"
+repo_id = "stabilityai/stable-diffusion"
+repo_id = "dalle-mini/dalle-mini"
 dataset_repo_id = "triple-t/dummy"
-path = "/home/patrick/image_cache"
+path = "/home/patrick_huggingface_co/image_cache"
+file_name = "_".join(repo_id.split("/")) + ".json"
+api = HfApi()
 
+interval = 100
 print("retrieve images 0...")
 discussions_list = list(get_repo_discussions(repo_id=repo_id, repo_type="space"))
-print("retrieve images 1...")
-all_data = []
-for i, disc in enumerate(discussions_list[:5]):
-    disc = huggingface_hub.get_discussion_details(repo_id=repo_id, repo_type="space", discussion_num=disc.num)
-    page = BeautifulSoup(disc.events[0]._event["data"]["latest"]["raw"])
-    image_urls = [link.get('src') for link in page.findAll('img')]
-    data = {
-        "discussion_number": i,
-        "data": {
-            "prompt": disc.title,
-            "images": image_urls,
-        }
-    }
-    if not image_urls:
-        continue
-    else:
-        all_data.append(data)
-
-
-file_name = "_".join(repo_id.split("/")) + ".json"
-api = HfApi()
 
 path = hf_hub_download(repo_id=dataset_repo_id, filename=file_name, cache_dir=path, repo_type="dataset")
 
 with open(path, "r") as f:
-    data = json.load(path)
-
-data += all_data
-import ipdb; ipdb.set_trace()
-
-with open(path, "w") as f:
-    f.write(json.dumps(all_data, sort_keys=True, indent=4))
-
-api.upload_file(
-    path_or_fileobj=path,
-    path_in_repo=file_name,
-    repo_id=dataset_repo_id,
-    repo_type="dataset",
-)
+    prev_all_data = json.load(f)
+
+
+for i in range(0, 10000, interval):
+    print("retrieve images 1...")
+    all_data = []
+    try:
+        for i, disc in enumerate(discussions_list[i: i + interval]):
+            disc = huggingface_hub.get_discussion_details(repo_id=repo_id, repo_type="space", discussion_num=disc.num)
+            page = BeautifulSoup(disc.events[0]._event["data"]["latest"]["raw"])
+            image_urls = [link.get('src') for link in page.findAll('img')]
+            data = {
+                "discussion_number": i,
+                "data": {
+                    "prompt": disc.title,
+                    "images": image_urls,
+                }
+            }
+            if not image_urls:
+                continue
+            else:
+                all_data.append(data)
+
+        prev_all_data += all_data
+
+        with open(path, "w") as f:
+            f.write(json.dumps(prev_all_data, sort_keys=True, indent=4))
+
+        api.upload_file(
+            path_or_fileobj=path,
+            path_in_repo=file_name,
+            repo_id=dataset_repo_id,
+            repo_type="dataset",
+        )
+    except:
+        print("Screw it")