radames committed on
Commit
03258aa
1 Parent(s): f69954f

fix regex stuck

Browse files
Files changed (1) hide show
  1. app.py +136 -96
app.py CHANGED
@@ -24,9 +24,9 @@ import boto3
24
  from datetime import datetime
25
  from db import Database
26
 
27
- AWS_ACCESS_KEY_ID = os.getenv('MY_AWS_ACCESS_KEY_ID')
28
- AWS_SECRET_KEY = os.getenv('MY_AWS_SECRET_KEY')
29
- AWS_S3_BUCKET_NAME = os.getenv('MY_AWS_S3_BUCKET_NAME')
30
 
31
 
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -35,13 +35,17 @@ S3_DATA_FOLDER = Path("sd-multiplayer-data")
35
 
36
  DB_FOLDER = Path("diffusers-gallery-data")
37
 
38
- CLASSIFIER_URL = "https://radames-aesthetic-style-nsfw-classifier.hf.space/run/inference"
 
 
39
  ASSETS_URL = "https://d26smi9133w0oo.cloudfront.net/diffusers-gallery/"
40
 
41
 
42
- s3 = boto3.client(service_name='s3',
43
- aws_access_key_id=AWS_ACCESS_KEY_ID,
44
- aws_secret_access_key=AWS_SECRET_KEY)
 
 
45
 
46
 
47
  repo = Repository(
@@ -54,24 +58,31 @@ repo.git_pull()
54
 
55
  database = Database(DB_FOLDER)
56
 
57
- REGEX_YAML_BLOCK = re.compile(
58
- r"^(\s*---[\r\n]+)([\S\s]*?)([\r\n]+---(\r\n|\n|$))")
59
-
60
 
61
  async def upload_resize_image_url(session, image_url):
62
  print(f"Uploading image {image_url}")
63
  try:
64
  async with session.get(image_url) as response:
65
- if response.status == 200 and (response.headers['content-type'].startswith('image') or response.headers['content-type'].startswith('application')):
66
- image = Image.open(BytesIO(await response.read())).convert('RGB')
 
 
 
67
  # resize image proportional
68
  image = ImageOps.fit(image, (400, 400), Image.LANCZOS)
69
  image_bytes = BytesIO()
70
  image.save(image_bytes, format="JPEG")
71
  image_bytes.seek(0)
72
- fname = f'{uuid.uuid4()}.jpg'
73
- s3.upload_fileobj(Fileobj=image_bytes, Bucket=AWS_S3_BUCKET_NAME, Key="diffusers-gallery/" + fname,
74
- ExtraArgs={"ContentType": "image/jpeg", "CacheControl": "max-age=31536000"})
 
 
 
 
 
 
 
75
  return fname
76
  except Exception as e:
77
  print(f"Error uploading image {image_url}: {e}")
@@ -80,41 +91,46 @@ async def upload_resize_image_url(session, image_url):
80
 
81
  def fetch_models(page=0):
82
  response = requests.get(
83
- f'https://huggingface.co/models-json?pipeline_tag=text-to-image&p={page}')
 
84
  data = response.json()
85
  return {
86
- "models": [model for model in data['models'] if not model['private']],
87
- "numItemsPerPage": data['numItemsPerPage'],
88
- "numTotalItems": data['numTotalItems'],
89
- "pageIndex": data['pageIndex']
90
  }
91
 
92
 
93
  def fetch_model_card(model_id):
94
- response = requests.get(
95
- f'https://huggingface.co/{model_id}/raw/main/README.md')
96
  return response.text
97
 
98
 
99
- def get_yaml_data(text_content):
100
- match = REGEX_YAML_BLOCK.search(text_content)
101
- if match:
102
- yaml_block = match.group(2)
103
- data_dict = yaml.safe_load(yaml_block)
104
- else:
105
- data_dict = {}
106
- return data_dict
107
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  async def find_image_in_model_card(text):
110
- image_regex = re.compile(r'https?://\S+(?:png|jpg|jpeg|webp)')
111
  urls = re.findall(image_regex, text)
112
  if not urls:
113
  return []
114
 
115
  async with aiohttp.ClientSession() as session:
116
- tasks = [asyncio.ensure_future(upload_resize_image_url(
117
- session, image_url)) for image_url in urls[0:3]]
 
 
118
  return await asyncio.gather(*tasks)
119
 
120
 
@@ -123,17 +139,21 @@ def run_classifier(images):
123
  if len(images) > 0:
124
  # classifying only the first image
125
  images_urls = [ASSETS_URL + images[0]]
126
- response = requests.post(CLASSIFIER_URL, json={"data": [
127
- {"urls": images_urls}, # json urls: list of images urls
128
- False, # enable/disable gallery image output
129
- None, # single image input
130
- None, # files input
131
- ]}).json()
 
 
 
 
 
132
 
133
  # data response is array data:[[{img0}, {img1}, {img2}...], Label, Gallery],
134
- class_data = response['data'][0][0]
135
- class_data_parsed = {row['label']: round(
136
- row['score'], 3) for row in class_data}
137
 
138
  # update row data with classificator data
139
  return class_data_parsed
@@ -143,10 +163,11 @@ def run_classifier(images):
143
 
144
  async def get_all_new_models():
145
  initial = fetch_models(0)
146
- num_pages = ceil(initial['numTotalItems'] / initial['numItemsPerPage'])
147
 
148
  print(
149
- f"Total items: {initial['numTotalItems']} - Items per page: {initial['numItemsPerPage']}")
 
150
  print(f"Found {num_pages} pages")
151
 
152
  # fetch all models
@@ -154,7 +175,7 @@ async def get_all_new_models():
154
  for page in tqdm(range(0, num_pages)):
155
  print(f"Fetching page {page} of {num_pages}")
156
  page_models = fetch_models(page)
157
- new_models += page_models['models']
158
  return new_models
159
 
160
 
@@ -169,24 +190,28 @@ async def sync_data():
169
  # with open(DB_FOLDER / "models.json", "r") as f:
170
  # new_models = json.load(f)
171
 
172
- new_models_ids = [model['id'] for model in all_models]
173
 
174
  # get existing models
175
  with database.get_db() as db:
176
  cursor = db.cursor()
177
  cursor.execute("SELECT id FROM models")
178
- existing_models = [row['id'] for row in cursor.fetchall()]
179
  models_ids_to_add = list(set(new_models_ids) - set(existing_models))
180
  # find all models id to add from new_models
181
- models = [model for model in all_models if model['id'] in models_ids_to_add]
182
 
183
  print(f"Found {len(models)} new models")
184
  for model in tqdm(models):
185
- model_id = model['id']
186
- likes = model['likes']
187
- downloads = model['downloads']
 
 
188
  model_card = fetch_model_card(model_id)
 
189
  model_card_data = get_yaml_data(model_card)
 
190
  images = await find_image_in_model_card(model_card)
191
 
192
  classifier = run_classifier(images)
@@ -194,58 +219,69 @@ async def sync_data():
194
  # update model row with image and classifier data
195
  with database.get_db() as db:
196
  cursor = db.cursor()
197
- cursor.execute("INSERT INTO models(id, data, likes, downloads) VALUES (?, ?, ?, ?)",
198
- [model_id,
199
- json.dumps({
200
- **model,
201
- "meta": model_card_data,
202
- "images": images,
203
- "class": classifier
204
- }),
205
- likes,
206
- downloads
207
- ])
 
 
 
 
 
208
  db.commit()
209
- print("Try to update images again")
210
  with database.get_db() as db:
211
  cursor = db.cursor()
212
- cursor.execute(
213
- "SELECT * from models")
214
  to_all_models = list(cursor.fetchall())
215
  models_no_images = []
216
  for model in to_all_models:
217
- model_data = json.loads(model['data'])
218
- images = model_data['images']
219
  filtered_images = [x for x in images if x is not None]
220
  if len(filtered_images) == 0:
221
  models_no_images.append(model)
222
 
223
  for model in tqdm(models_no_images):
224
- model_id = model['id']
225
- model_data = json.loads(model['data'])
 
226
  model_card = fetch_model_card(model_id)
 
227
  model_card_data = get_yaml_data(model_card)
 
228
  images = await find_image_in_model_card(model_card)
229
  classifier = run_classifier(images)
230
- model_data['images'] = images
231
- model_data['class'] = classifier
232
- model_data['meta'] = model_card_data
233
  # update model row with image and classifier data
234
  with database.get_db() as db:
235
  cursor = db.cursor()
236
- cursor.execute("UPDATE models SET data = ? WHERE id = ?",
237
- [json.dumps(model_data), model_id])
 
 
238
  db.commit()
239
 
240
  print("Update likes and downloads")
241
  for model in tqdm(all_models):
242
- model_id = model['id']
243
- likes = model['likes']
244
- downloads = model['downloads']
245
  with database.get_db() as db:
246
  cursor = db.cursor()
247
- cursor.execute("UPDATE models SET likes = ?, downloads = ? WHERE id = ?",
248
- [likes, downloads, model_id])
 
 
249
  db.commit()
250
 
251
  print("Updating DB repository")
@@ -288,8 +324,10 @@ class Style(str, Enum):
288
  nsfw = "nsfw"
289
 
290
 
291
- @ app.get("/api/models")
292
- def get_page(page: int = 1, sort: Sort = Sort.trending, style: Style = Style.all, tag: str = None):
 
 
293
  page = page if page > 0 else 1
294
  if sort == Sort.trending:
295
  sort_query = "likes / MYPOWER((JULIANDAY('now') - JULIANDAY(datetime(json_extract(data, '$.lastModified')))) + 2, 2) DESC"
@@ -311,7 +349,8 @@ def get_page(page: int = 1, sort: Sort = Sort.trending, style: Style = Style.all
311
 
312
  with database.get_db() as db:
313
  cursor = db.cursor()
314
- cursor.execute(f"""
 
315
  SELECT *,
316
  COUNT(*) OVER() AS total,
317
  isNFSW
@@ -329,36 +368,37 @@ def get_page(page: int = 1, sort: Sort = Sort.trending, style: Style = Style.all
329
  ))
330
  ORDER BY {sort_query}
331
  LIMIT {MAX_PAGE_SIZE} OFFSET {(page - 1) * MAX_PAGE_SIZE};
332
- """, (tag, tag, tag, tag))
 
 
333
  results = cursor.fetchall()
334
- total = results[0]['total'] if results else 0
335
  total_pages = (total + MAX_PAGE_SIZE - 1) // MAX_PAGE_SIZE
336
  models_data = []
337
  for result in results:
338
- data = json.loads(result['data'])
339
- images = data['images']
340
  filtered_images = [x for x in images if x is not None]
341
  # clean nulls
342
- data['images'] = filtered_images
343
  # update downloads and likes from db table
344
- data['downloads'] = result['downloads']
345
- data['likes'] = result['likes']
346
- data['isNFSW'] = bool(result['isNFSW'])
347
  models_data.append(data)
348
 
349
- return {
350
- "models": models_data,
351
- "totalPages": total_pages
352
- }
353
 
354
 
355
  @app.get("/")
356
  def read_root():
357
  # return html page from string
358
- return HTMLResponse("""
 
359
  <p>Just a bot to sync data from diffusers gallery please go to
360
  <a href="https://huggingface.co/spaces/huggingface-projects/diffusers-gallery" target="_blank" rel="noopener noreferrer">https://huggingface.co/spaces/huggingface-projects/diffusers-gallery</a>
361
- </p>""")
 
362
 
363
 
364
  @app.on_event("startup")
 
24
  from datetime import datetime
25
  from db import Database
26
 
27
+ AWS_ACCESS_KEY_ID = os.getenv("MY_AWS_ACCESS_KEY_ID")
28
+ AWS_SECRET_KEY = os.getenv("MY_AWS_SECRET_KEY")
29
+ AWS_S3_BUCKET_NAME = os.getenv("MY_AWS_S3_BUCKET_NAME")
30
 
31
 
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
35
 
36
  DB_FOLDER = Path("diffusers-gallery-data")
37
 
38
+ CLASSIFIER_URL = (
39
+ "https://radames-aesthetic-style-nsfw-classifier.hf.space/run/inference"
40
+ )
41
  ASSETS_URL = "https://d26smi9133w0oo.cloudfront.net/diffusers-gallery/"
42
 
43
 
44
+ s3 = boto3.client(
45
+ service_name="s3",
46
+ aws_access_key_id=AWS_ACCESS_KEY_ID,
47
+ aws_secret_access_key=AWS_SECRET_KEY,
48
+ )
49
 
50
 
51
  repo = Repository(
 
58
 
59
  database = Database(DB_FOLDER)
60
 
 
 
 
61
 
62
  async def upload_resize_image_url(session, image_url):
63
  print(f"Uploading image {image_url}")
64
  try:
65
  async with session.get(image_url) as response:
66
+ if response.status == 200 and (
67
+ response.headers["content-type"].startswith("image")
68
+ or response.headers["content-type"].startswith("application")
69
+ ):
70
+ image = Image.open(BytesIO(await response.read())).convert("RGB")
71
  # resize image proportional
72
  image = ImageOps.fit(image, (400, 400), Image.LANCZOS)
73
  image_bytes = BytesIO()
74
  image.save(image_bytes, format="JPEG")
75
  image_bytes.seek(0)
76
+ fname = f"{uuid.uuid4()}.jpg"
77
+ s3.upload_fileobj(
78
+ Fileobj=image_bytes,
79
+ Bucket=AWS_S3_BUCKET_NAME,
80
+ Key="diffusers-gallery/" + fname,
81
+ ExtraArgs={
82
+ "ContentType": "image/jpeg",
83
+ "CacheControl": "max-age=31536000",
84
+ },
85
+ )
86
  return fname
87
  except Exception as e:
88
  print(f"Error uploading image {image_url}: {e}")
 
91
 
92
  def fetch_models(page=0):
93
  response = requests.get(
94
+ f"https://huggingface.co/models-json?pipeline_tag=text-to-image&p={page}"
95
+ )
96
  data = response.json()
97
  return {
98
+ "models": [model for model in data["models"] if not model["private"]],
99
+ "numItemsPerPage": data["numItemsPerPage"],
100
+ "numTotalItems": data["numTotalItems"],
101
+ "pageIndex": data["pageIndex"],
102
  }
103
 
104
 
105
  def fetch_model_card(model_id):
106
+ response = requests.get(f"https://huggingface.co/{model_id}/raw/main/README.md")
 
107
  return response.text
108
 
109
 
110
+ REGEX = re.compile(r'---(.*?)---', re.DOTALL)
 
 
 
 
 
 
 
111
 
112
+ def get_yaml_data(text_content):
113
+ matches = REGEX.findall(text_content)
114
+ yaml_block = matches[0].strip() if matches else None
115
+ if yaml_block:
116
+ try:
117
+ data_dict = yaml.safe_load(yaml_block)
118
+ return data_dict
119
+ except yaml.YAMLError as exc:
120
+ print(exc)
121
+ return {}
122
 
123
  async def find_image_in_model_card(text):
124
+ image_regex = re.compile(r"https?://\S+(?:png|jpg|jpeg|webp)")
125
  urls = re.findall(image_regex, text)
126
  if not urls:
127
  return []
128
 
129
  async with aiohttp.ClientSession() as session:
130
+ tasks = [
131
+ asyncio.ensure_future(upload_resize_image_url(session, image_url))
132
+ for image_url in urls[0:3]
133
+ ]
134
  return await asyncio.gather(*tasks)
135
 
136
 
 
139
  if len(images) > 0:
140
  # classifying only the first image
141
  images_urls = [ASSETS_URL + images[0]]
142
+ response = requests.post(
143
+ CLASSIFIER_URL,
144
+ json={
145
+ "data": [
146
+ {"urls": images_urls}, # json urls: list of images urls
147
+ False, # enable/disable gallery image output
148
+ None, # single image input
149
+ None, # files input
150
+ ]
151
+ },
152
+ ).json()
153
 
154
  # data response is array data:[[{img0}, {img1}, {img2}...], Label, Gallery],
155
+ class_data = response["data"][0][0]
156
+ class_data_parsed = {row["label"]: round(row["score"], 3) for row in class_data}
 
157
 
158
  # update row data with classificator data
159
  return class_data_parsed
 
163
 
164
  async def get_all_new_models():
165
  initial = fetch_models(0)
166
+ num_pages = ceil(initial["numTotalItems"] / initial["numItemsPerPage"])
167
 
168
  print(
169
+ f"Total items: {initial['numTotalItems']} - Items per page: {initial['numItemsPerPage']}"
170
+ )
171
  print(f"Found {num_pages} pages")
172
 
173
  # fetch all models
 
175
  for page in tqdm(range(0, num_pages)):
176
  print(f"Fetching page {page} of {num_pages}")
177
  page_models = fetch_models(page)
178
+ new_models += page_models["models"]
179
  return new_models
180
 
181
 
 
190
  # with open(DB_FOLDER / "models.json", "r") as f:
191
  # new_models = json.load(f)
192
 
193
+ new_models_ids = [model["id"] for model in all_models]
194
 
195
  # get existing models
196
  with database.get_db() as db:
197
  cursor = db.cursor()
198
  cursor.execute("SELECT id FROM models")
199
+ existing_models = [row["id"] for row in cursor.fetchall()]
200
  models_ids_to_add = list(set(new_models_ids) - set(existing_models))
201
  # find all models id to add from new_models
202
+ models = [model for model in all_models if model["id"] in models_ids_to_add]
203
 
204
  print(f"Found {len(models)} new models")
205
  for model in tqdm(models):
206
+ model_id = model["id"]
207
+ print(f"\n\nFetching model {model_id}")
208
+ likes = model["likes"]
209
+ downloads = model["downloads"]
210
+ print("Fetching model card")
211
  model_card = fetch_model_card(model_id)
212
+ print("Parsing model card")
213
  model_card_data = get_yaml_data(model_card)
214
+ print("Finding images in model card")
215
  images = await find_image_in_model_card(model_card)
216
 
217
  classifier = run_classifier(images)
 
219
  # update model row with image and classifier data
220
  with database.get_db() as db:
221
  cursor = db.cursor()
222
+ cursor.execute(
223
+ "INSERT INTO models(id, data, likes, downloads) VALUES (?, ?, ?, ?)",
224
+ [
225
+ model_id,
226
+ json.dumps(
227
+ {
228
+ **model,
229
+ "meta": model_card_data,
230
+ "images": images,
231
+ "class": classifier,
232
+ }
233
+ ),
234
+ likes,
235
+ downloads,
236
+ ],
237
+ )
238
  db.commit()
239
+ print("\n\n\n\nTry to update images again\n\n\n")
240
  with database.get_db() as db:
241
  cursor = db.cursor()
242
+ cursor.execute("SELECT * from models")
 
243
  to_all_models = list(cursor.fetchall())
244
  models_no_images = []
245
  for model in to_all_models:
246
+ model_data = json.loads(model["data"])
247
+ images = model_data["images"]
248
  filtered_images = [x for x in images if x is not None]
249
  if len(filtered_images) == 0:
250
  models_no_images.append(model)
251
 
252
  for model in tqdm(models_no_images):
253
+ model_id = model["id"]
254
+ model_data = json.loads(model["data"])
255
+ print(f"\n\nFetching model {model_id}")
256
  model_card = fetch_model_card(model_id)
257
+ print("Parsing model card")
258
  model_card_data = get_yaml_data(model_card)
259
+ print("Finding images in model card")
260
  images = await find_image_in_model_card(model_card)
261
  classifier = run_classifier(images)
262
+ model_data["images"] = images
263
+ model_data["class"] = classifier
264
+ model_data["meta"] = model_card_data
265
  # update model row with image and classifier data
266
  with database.get_db() as db:
267
  cursor = db.cursor()
268
+ cursor.execute(
269
+ "UPDATE models SET data = ? WHERE id = ?",
270
+ [json.dumps(model_data), model_id],
271
+ )
272
  db.commit()
273
 
274
  print("Update likes and downloads")
275
  for model in tqdm(all_models):
276
+ model_id = model["id"]
277
+ likes = model["likes"]
278
+ downloads = model["downloads"]
279
  with database.get_db() as db:
280
  cursor = db.cursor()
281
+ cursor.execute(
282
+ "UPDATE models SET likes = ?, downloads = ? WHERE id = ?",
283
+ [likes, downloads, model_id],
284
+ )
285
  db.commit()
286
 
287
  print("Updating DB repository")
 
324
  nsfw = "nsfw"
325
 
326
 
327
+ @app.get("/api/models")
328
+ def get_page(
329
+ page: int = 1, sort: Sort = Sort.trending, style: Style = Style.all, tag: str = None
330
+ ):
331
  page = page if page > 0 else 1
332
  if sort == Sort.trending:
333
  sort_query = "likes / MYPOWER((JULIANDAY('now') - JULIANDAY(datetime(json_extract(data, '$.lastModified')))) + 2, 2) DESC"
 
349
 
350
  with database.get_db() as db:
351
  cursor = db.cursor()
352
+ cursor.execute(
353
+ f"""
354
  SELECT *,
355
  COUNT(*) OVER() AS total,
356
  isNFSW
 
368
  ))
369
  ORDER BY {sort_query}
370
  LIMIT {MAX_PAGE_SIZE} OFFSET {(page - 1) * MAX_PAGE_SIZE};
371
+ """,
372
+ (tag, tag, tag, tag),
373
+ )
374
  results = cursor.fetchall()
375
+ total = results[0]["total"] if results else 0
376
  total_pages = (total + MAX_PAGE_SIZE - 1) // MAX_PAGE_SIZE
377
  models_data = []
378
  for result in results:
379
+ data = json.loads(result["data"])
380
+ images = data["images"]
381
  filtered_images = [x for x in images if x is not None]
382
  # clean nulls
383
+ data["images"] = filtered_images
384
  # update downloads and likes from db table
385
+ data["downloads"] = result["downloads"]
386
+ data["likes"] = result["likes"]
387
+ data["isNFSW"] = bool(result["isNFSW"])
388
  models_data.append(data)
389
 
390
+ return {"models": models_data, "totalPages": total_pages}
 
 
 
391
 
392
 
393
  @app.get("/")
394
  def read_root():
395
  # return html page from string
396
+ return HTMLResponse(
397
+ """
398
  <p>Just a bot to sync data from diffusers gallery please go to
399
  <a href="https://huggingface.co/spaces/huggingface-projects/diffusers-gallery" target="_blank" rel="noopener noreferrer">https://huggingface.co/spaces/huggingface-projects/diffusers-gallery</a>
400
+ </p>"""
401
+ )
402
 
403
 
404
  @app.on_event("startup")