jsulz HF staff committed on
Commit
502fa70
·
1 Parent(s): d38ed41

fixing minor issues

Browse files
Files changed (1) hide show
  1. app.py +82 -44
app.py CHANGED
@@ -4,6 +4,7 @@ import numpy as np
4
  import plotly.express as px
5
  from datasets import load_dataset
6
 
 
7
  def load_transform_data():
8
  """
9
  Load and transform data from a parquet file.
@@ -11,9 +12,9 @@ def load_transform_data():
11
  Returns:
12
  pandas.DataFrame: Transformed dataframe.
13
  """
14
- spaces_dataset = 'jsulz/space-stats'
15
  dataset = load_dataset(spaces_dataset)
16
- df = dataset['train'].to_pandas()
17
  # combine the sdk and tags columns, one of which is a string and the other is an array of strings
18
  df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
19
  df["licenses"] = df["license"].apply(
@@ -25,7 +26,7 @@ def load_transform_data():
25
  )
26
 
27
  # Fill the NaN values with an empty string
28
- df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])
29
 
30
  # where the custom_domains column is not null, use that as the url, otherwise, use the host column
31
  df["url"] = np.where(
@@ -37,9 +38,9 @@ def load_transform_data():
37
  # Build up a pretty url that's clickable with the emoji
38
  df["url"] = df[["url", "emoji"]].apply(
39
  lambda x: (
40
- f"<a target='_blank' href=https://huggingface.co/spaces/{x.iloc[0]}>{str(x.iloc[1]) + " " + x.iloc[0]}</a>"
41
- if x.iloc[0] is not None and "/" in x.iloc[0]
42
- else f"<a target='_blank' href=https://{x.iloc[0][0]}>{str(x.iloc[1]) + " " + x.iloc[0][0]}</a>"
43
  ),
44
  axis=1,
45
  )
@@ -145,9 +146,7 @@ def filtered_df(
145
  }
146
  )
147
  if filtered_devmode:
148
- _df = _df[
149
- _df["devMode"] == filtered_devmode
150
- ]
151
 
152
  return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
153
 
@@ -158,7 +157,7 @@ def count_items(items):
158
  Parameters:
159
  items (dataframe column): A dataframe column containing a list of items.
160
  Returns:
161
- tuple: A tuple containing two dictionaries. The first dictionary contains the count of each item,
162
  and the second dictionary contains the count of each author.
163
  """
164
  items = np.concatenate([arr for arr in items.values if arr is not None])
@@ -169,14 +168,15 @@ def count_items(items):
169
  item_count[item] += 1
170
  else:
171
  item_count[item] = 1
172
- author = item.split('/')[0]
173
  if author in item_author_count:
174
  item_author_count[author] += 1
175
  else:
176
  item_author_count[author] = 1
177
-
178
  return item_count, item_author_count
179
 
 
180
  def flatten_column(_df, column):
181
  """
182
  Flattens a column in a DataFrame.
@@ -203,7 +203,7 @@ with gr.Blocks(fill_width=True) as demo:
203
  # The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
204
  # The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date.
205
  df = df.sort_values("created_at")
206
- df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
207
  fig1 = px.line(
208
  df,
209
  x="created_at",
@@ -216,16 +216,29 @@ with gr.Blocks(fill_width=True) as demo:
216
 
217
  with gr.Row():
218
  # Create a pie chart showing the distribution of spaces by SDK
219
- fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
 
 
 
 
 
220
  gr.Plot(fig2)
221
 
222
  # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
223
- emoji_counts = df['emoji'].value_counts().head(10).reset_index()
224
- fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
 
 
 
 
 
 
225
  gr.Plot(fig3)
226
 
227
  # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
228
- author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
 
 
229
  fig4 = px.scatter(
230
  author_likes,
231
  x="id",
@@ -238,7 +251,13 @@ with gr.Blocks(fill_width=True) as demo:
238
  gr.Plot(fig4)
239
 
240
  # Create a scatter plot showing the relationship between total likes and the number of spaces for the 20 most-liked emojis
241
- emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
 
 
 
 
 
 
242
  fig10 = px.scatter(
243
  emoji_likes,
244
  x="id",
@@ -251,8 +270,8 @@ with gr.Blocks(fill_width=True) as demo:
251
  gr.Plot(fig10)
252
 
253
  # Create a bar chart of hardware in use
254
- hardware = df['hardware'].value_counts().reset_index()
255
- hardware.columns = ['Hardware', 'Number of Spaces']
256
  fig5 = px.bar(
257
  hardware,
258
  x="Hardware",
@@ -268,8 +287,10 @@ with gr.Blocks(fill_width=True) as demo:
268
  fig5.update_layout(yaxis_type="log")
269
  gr.Plot(fig5)
270
 
271
- model_count, model_author_count = count_items(df['models'])
272
- model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
 
 
273
  fig8 = px.bar(
274
  model_author_count.sort_values("Number of Spaces", ascending=False).head(
275
  20
@@ -281,7 +302,9 @@ with gr.Blocks(fill_width=True) as demo:
281
  template="plotly_dark",
282
  )
283
  gr.Plot(fig8)
284
- model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
 
 
285
  # then make a bar chart
286
  fig6 = px.bar(
287
  model_count.sort_values("Number of Spaces", ascending=False).head(20),
@@ -293,9 +316,13 @@ with gr.Blocks(fill_width=True) as demo:
293
  )
294
  gr.Plot(fig6)
295
 
296
- dataset_count, dataset_author_count = count_items(df['datasets'])
297
- dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
298
- dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
 
 
 
 
299
  fig9 = px.bar(
300
  dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
301
  20
@@ -323,26 +350,30 @@ with gr.Blocks(fill_width=True) as demo:
323
 
324
  with gr.Row():
325
  # Get the most duplicated spaces
326
- duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
 
 
327
  duplicated_spaces["duplicated_from"] = duplicated_spaces[
328
  "duplicated_from"
329
  ].apply(
330
  lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
331
  )
332
  duplicated_spaces.columns = ["Space", "Number of Duplicates"]
333
- gr.DataFrame(duplicated_spaces, datatype="html" )
334
 
335
  # Get the most liked spaces
336
- liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
 
 
337
  liked_spaces["id"] = liked_spaces["id"].apply(
338
  lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
339
  )
340
- liked_spaces.columns = ['Space', 'Number of Likes']
341
  gr.DataFrame(liked_spaces, datatype="html")
342
 
343
  with gr.Row():
344
  # Create a dataframe with the top 20 authors and the number of spaces they have created
345
- author_counts = df['author'].value_counts().head(20).reset_index()
346
  author_counts["author"] = author_counts["author"].apply(
347
  lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
348
  )
@@ -350,22 +381,25 @@ with gr.Blocks(fill_width=True) as demo:
350
  gr.DataFrame(author_counts, datatype="html")
351
 
352
  # create a dataframe where we groupby author and sum their likes
353
- author_likes = df.groupby('author').agg({'likes': 'sum'}).reset_index()
354
- author_likes = author_likes.sort_values(by='likes', ascending=False).head(20)
 
 
355
  author_likes["author"] = author_likes["author"].apply(
356
  lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
357
  )
358
  author_likes.columns = ["Author", "Number of Likes"]
359
  gr.DataFrame(author_likes, datatype="html")
360
 
361
-
362
  with gr.Tab(label="Spaces Search"):
363
- df = df[df['stage'] == 'RUNNING']
364
 
365
  # Layout
366
  with gr.Row():
367
  emoji = gr.Dropdown(
368
- df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
 
 
369
  ) # Dropdown to select the emoji
370
  likes = gr.Slider(
371
  minimum=df["likes"].min(),
@@ -375,7 +409,9 @@ with gr.Blocks(fill_width=True) as demo:
375
  ) # Slider to filter by likes
376
  with gr.Row():
377
  author = gr.Dropdown(
378
- df["author"].unique().tolist(), label="Search by Author", multiselect=True
 
 
379
  )
380
  # get the list of unique strings in the sdk_tags column
381
  sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
@@ -405,15 +441,17 @@ with gr.Blocks(fill_width=True) as demo:
405
  )
406
 
407
  devmode = gr.Checkbox(label="Show Dev Mode Spaces")
408
- clear = gr.ClearButton(components=[
 
409
  emoji,
410
  author,
411
  hardware,
412
  sdk_tags,
413
  models,
414
  datasets,
415
- space_license
416
- ])
 
417
 
418
  df = pd.DataFrame(
419
  df[
@@ -432,7 +470,7 @@ with gr.Blocks(fill_width=True) as demo:
432
  "r_models",
433
  "r_datasets",
434
  "r_licenses",
435
- 'devMode'
436
  ]
437
  ]
438
  )
@@ -450,9 +488,9 @@ with gr.Blocks(fill_width=True) as demo:
450
  devmode,
451
  ],
452
  datatype="html",
453
- wrap=True,
454
- column_widths=["25%", "5%", "25%", "25%", "20%"]
455
  )
456
 
457
 
458
- demo.launch()
 
4
  import plotly.express as px
5
  from datasets import load_dataset
6
 
7
+
8
  def load_transform_data():
9
  """
10
  Load and transform data from a parquet file.
 
12
  Returns:
13
  pandas.DataFrame: Transformed dataframe.
14
  """
15
+ spaces_dataset = "jsulz/space-stats"
16
  dataset = load_dataset(spaces_dataset)
17
+ df = dataset["train"].to_pandas()
18
  # combine the sdk and tags columns, one of which is a string and the other is an array of strings
19
  df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
20
  df["licenses"] = df["license"].apply(
 
26
  )
27
 
28
  # Fill the NaN values with an empty string
29
+ df["emoji"] = np.where(df["emoji"].isnull(), "", df["emoji"])
30
 
31
  # where the custom_domains column is not null, use that as the url, otherwise, use the host column
32
  df["url"] = np.where(
 
38
  # Build up a pretty url that's clickable with the emoji
39
  df["url"] = df[["url", "emoji"]].apply(
40
  lambda x: (
41
+ f'<a target="_blank" href=https://huggingface.co/spaces/{x.iloc[0]}>{str(x.iloc[1]) + " " + x.iloc[0]}</a>'
42
+ if x.iloc[0] is not None
43
+ else f'<a target="_blank" href=https://{x.iloc[0][0]}>{str(x.iloc[1]) + " " + x.iloc[0][0]}</a>'
44
  ),
45
  axis=1,
46
  )
 
146
  }
147
  )
148
  if filtered_devmode:
149
+ _df = _df[_df["devMode"] == filtered_devmode]
 
 
150
 
151
  return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
152
 
 
157
  Parameters:
158
  items (dataframe column): A dataframe column containing a list of items.
159
  Returns:
160
+ tuple: A tuple containing two dictionaries. The first dictionary contains the count of each item,
161
  and the second dictionary contains the count of each author.
162
  """
163
  items = np.concatenate([arr for arr in items.values if arr is not None])
 
168
  item_count[item] += 1
169
  else:
170
  item_count[item] = 1
171
+ author = item.split("/")[0]
172
  if author in item_author_count:
173
  item_author_count[author] += 1
174
  else:
175
  item_author_count[author] = 1
176
+
177
  return item_count, item_author_count
178
 
179
+
180
  def flatten_column(_df, column):
181
  """
182
  Flattens a column in a DataFrame.
 
203
  # The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
204
  # The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date.
205
  df = df.sort_values("created_at")
206
+ df["cumulative_spaces"] = df["created_at"].rank(method="first").astype(int)
207
  fig1 = px.line(
208
  df,
209
  x="created_at",
 
216
 
217
  with gr.Row():
218
  # Create a pie chart showing the distribution of spaces by SDK
219
+ fig2 = px.pie(
220
+ df,
221
+ names="sdk",
222
+ title="Distribution of Spaces by SDK",
223
+ template="plotly_dark",
224
+ )
225
  gr.Plot(fig2)
226
 
227
  # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
228
+ emoji_counts = df["emoji"].value_counts().head(10).reset_index()
229
+ fig3 = px.pie(
230
+ emoji_counts,
231
+ names="emoji",
232
+ values="count",
233
+ title="Distribution of Spaces by Emoji",
234
+ template="plotly_dark",
235
+ )
236
  gr.Plot(fig3)
237
 
238
  # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
239
+ author_likes = (
240
+ df.groupby("author").agg({"likes": "sum", "id": "count"}).reset_index()
241
+ )
242
  fig4 = px.scatter(
243
  author_likes,
244
  x="id",
 
251
  gr.Plot(fig4)
252
 
253
  # Create a scatter plot showing the relationship between total likes and the number of spaces for the 20 most-liked emojis
254
+ emoji_likes = (
255
+ df.groupby("emoji")
256
+ .agg({"likes": "sum", "id": "count"})
257
+ .sort_values(by="likes", ascending=False)
258
+ .head(20)
259
+ .reset_index()
260
+ )
261
  fig10 = px.scatter(
262
  emoji_likes,
263
  x="id",
 
270
  gr.Plot(fig10)
271
 
272
  # Create a bar chart of hardware in use
273
+ hardware = df["hardware"].value_counts().reset_index()
274
+ hardware.columns = ["Hardware", "Number of Spaces"]
275
  fig5 = px.bar(
276
  hardware,
277
  x="Hardware",
 
287
  fig5.update_layout(yaxis_type="log")
288
  gr.Plot(fig5)
289
 
290
+ model_count, model_author_count = count_items(df["models"])
291
+ model_author_count = pd.DataFrame(
292
+ model_author_count.items(), columns=["Model Author", "Number of Spaces"]
293
+ )
294
  fig8 = px.bar(
295
  model_author_count.sort_values("Number of Spaces", ascending=False).head(
296
  20
 
302
  template="plotly_dark",
303
  )
304
  gr.Plot(fig8)
305
+ model_count = pd.DataFrame(
306
+ model_count.items(), columns=["Model", "Number of Spaces"]
307
+ )
308
  # then make a bar chart
309
  fig6 = px.bar(
310
  model_count.sort_values("Number of Spaces", ascending=False).head(20),
 
316
  )
317
  gr.Plot(fig6)
318
 
319
+ dataset_count, dataset_author_count = count_items(df["datasets"])
320
+ dataset_count = pd.DataFrame(
321
+ dataset_count.items(), columns=["Datasets", "Number of Spaces"]
322
+ )
323
+ dataset_author_count = pd.DataFrame(
324
+ dataset_author_count.items(), columns=["Dataset Author", "Number of Spaces"]
325
+ )
326
  fig9 = px.bar(
327
  dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
328
  20
 
350
 
351
  with gr.Row():
352
  # Get the most duplicated spaces
353
+ duplicated_spaces = (
354
+ df["duplicated_from"].value_counts().head(20).reset_index()
355
+ )
356
  duplicated_spaces["duplicated_from"] = duplicated_spaces[
357
  "duplicated_from"
358
  ].apply(
359
  lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
360
  )
361
  duplicated_spaces.columns = ["Space", "Number of Duplicates"]
362
+ gr.DataFrame(duplicated_spaces, datatype="html")
363
 
364
  # Get the most liked spaces
365
+ liked_spaces = (
366
+ df[["id", "likes"]].sort_values(by="likes", ascending=False).head(20)
367
+ )
368
  liked_spaces["id"] = liked_spaces["id"].apply(
369
  lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
370
  )
371
+ liked_spaces.columns = ["Space", "Number of Likes"]
372
  gr.DataFrame(liked_spaces, datatype="html")
373
 
374
  with gr.Row():
375
  # Create a dataframe with the top 20 authors and the number of spaces they have created
376
+ author_counts = df["author"].value_counts().head(20).reset_index()
377
  author_counts["author"] = author_counts["author"].apply(
378
  lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
379
  )
 
381
  gr.DataFrame(author_counts, datatype="html")
382
 
383
  # create a dataframe where we groupby author and sum their likes
384
+ author_likes = df.groupby("author").agg({"likes": "sum"}).reset_index()
385
+ author_likes = author_likes.sort_values(by="likes", ascending=False).head(
386
+ 20
387
+ )
388
  author_likes["author"] = author_likes["author"].apply(
389
  lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
390
  )
391
  author_likes.columns = ["Author", "Number of Likes"]
392
  gr.DataFrame(author_likes, datatype="html")
393
 
 
394
  with gr.Tab(label="Spaces Search"):
395
+ df = df[df["stage"] == "RUNNING"]
396
 
397
  # Layout
398
  with gr.Row():
399
  emoji = gr.Dropdown(
400
+ df["emoji"].unique().tolist(),
401
+ label="Search by Emoji 🤗",
402
+ multiselect=True,
403
  ) # Dropdown to select the emoji
404
  likes = gr.Slider(
405
  minimum=df["likes"].min(),
 
409
  ) # Slider to filter by likes
410
  with gr.Row():
411
  author = gr.Dropdown(
412
+ df["author"].unique().tolist(),
413
+ label="Search by Author",
414
+ multiselect=True,
415
  )
416
  # get the list of unique strings in the sdk_tags column
417
  sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
 
441
  )
442
 
443
  devmode = gr.Checkbox(label="Show Dev Mode Spaces")
444
+ clear = gr.ClearButton(
445
+ components=[
446
  emoji,
447
  author,
448
  hardware,
449
  sdk_tags,
450
  models,
451
  datasets,
452
+ space_license,
453
+ ]
454
+ )
455
 
456
  df = pd.DataFrame(
457
  df[
 
470
  "r_models",
471
  "r_datasets",
472
  "r_licenses",
473
+ "devMode",
474
  ]
475
  ]
476
  )
 
488
  devmode,
489
  ],
490
  datatype="html",
491
+ wrap=True,
492
+ column_widths=["25%", "5%", "25%", "25%", "20%"],
493
  )
494
 
495
 
496
+ demo.launch(share=True)