jsulz HF staff committed on
Commit
cf4323e
1 Parent(s): 87f778f

doing some cleanup

Browse files
Files changed (1) hide show
  1. app.py +315 -180
app.py CHANGED
@@ -2,43 +2,121 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
- # Load the spaces.parquet file as a dataframe and do some pre cleaning steps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
 
 
7
 
8
- """
9
- Todos:
10
- Clean up existing filtering code
11
- """
 
 
12
 
 
 
 
 
 
 
 
 
 
13
 
14
- def filtered_df(emoji, likes, author, hardware, tags, models, datasets, space_licenses):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  _df = df
16
- # if emoji is not none, filter the dataframe with it
17
- if emoji:
18
- _df = _df[_df["emoji"].isin(emoji)]
19
- # if likes is not none, filter the dataframe with it
20
- if likes:
21
- _df = _df[_df["likes"] >= likes]
22
- if author:
23
- _df = _df[_df["author"].isin(author)]
24
- if hardware:
25
- _df = _df[_df["hardware"].isin(hardware)]
26
- # check to see if the array of sdk_tags contains any of the selected tags
27
- if tags:
28
- _df = _df[_df["sdk_tags"].apply(lambda x: any(tag in x for tag in tags))]
29
- if models:
30
  _df = _df[
31
  _df["models"].apply(
32
  lambda x: (
33
- any(model in x for model in models) if x is not None else False
 
 
34
  )
35
  )
36
  ]
37
- if datasets:
38
  _df = _df[
39
  _df["datasets"].apply(
40
  lambda x: (
41
- any(dataset in x for dataset in datasets)
42
  if x is not None
43
  else False
44
  )
@@ -58,212 +136,269 @@ def filtered_df(emoji, likes, author, hardware, tags, models, datasets, space_li
58
  # rename the columns names to make them more readable
59
  _df = _df.rename(
60
  columns={
61
- 'url': 'URL',
62
- 'likes': 'Likes',
63
  "r_models": "Models",
64
  "r_datasets": "Datasets",
65
  "r_licenses": "Licenses",
66
  }
67
  )
68
 
69
- return _df[["URL", "Likes", "Models", "Datasets", "Licenses" ]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  with gr.Blocks(fill_width=True) as demo:
 
73
  with gr.Tab(label="Spaces Overview"):
74
 
75
- # The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
76
  # The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date .
77
- df = pd.read_parquet("spaces.parquet")
78
  df = df.sort_values("created_at")
79
  df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
80
- fig1 = px.line(df, x='created_at', y='cumulative_spaces', title='Growth of Spaces Over Time', labels={'created_at': 'Date', 'cumulative_spaces': 'Number of Spaces'}, template='plotly_dark')
 
 
 
 
 
 
 
81
  gr.Plot(fig1)
82
 
83
- # Create a pie charge showing the distribution of spaces by SDK
84
- fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
85
- gr.Plot(fig2)
86
-
87
- # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
88
- emoji_counts = df['emoji'].value_counts().head(10).reset_index()
89
- fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
90
- gr.Plot(fig3)
91
 
92
- # Create a dataframe with the top 10 authors and the number of spaces they have created
93
- author_counts = df['author'].value_counts().head(20).reset_index()
94
- author_counts.columns = ['Author', 'Number of Spaces']
95
- gr.DataFrame(author_counts)
96
 
97
  # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
98
  author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
99
- fig4 = px.scatter(author_likes, x='id', y='likes', title='Relationship between Number of Spaces Created and Number of Likes', labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'}, hover_data={'author': True}, template='plotly_dark')
 
 
 
 
 
 
 
 
100
  gr.Plot(fig4)
101
 
102
  # Create a scatter plot showing, per emoji, the relationship between total likes and the number of spaces using that emoji (top 20 emojis by likes)
103
  emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
104
- fig10 = px.scatter(emoji_likes, x='id', y='likes', title='Relationship between Number of Spaces Created and Number of Likes', labels={'id': 'Number of Spaces Created', 'likes': 'Number of Likes'}, hover_data={'emoji': True}, template='plotly_dark')
 
 
 
 
 
 
 
 
105
  gr.Plot(fig10)
106
 
107
  # Create a bar chart of hardware in use
108
  hardware = df['hardware'].value_counts().reset_index()
109
  hardware.columns = ['Hardware', 'Number of Spaces']
110
- fig5 = px.bar(hardware, x='Hardware', y='Number of Spaces', title='Hardware in Use', labels={'Hardware': 'Hardware', 'Number of Spaces': 'Number of Spaces (log scale)'}, color='Hardware', template='plotly_dark')
111
- fig5.update_layout(yaxis_type='log')
 
 
 
 
 
 
 
 
 
 
 
112
  gr.Plot(fig5)
113
 
114
- models = np.concatenate([arr for arr in df['models'].values if arr is not None])
115
- model_count = {}
116
- model_author_count = {}
117
- for model in models:
118
- author = model.split('/')[0]
119
- if model in model_count:
120
- model_count[model] += 1
121
- else:
122
- model_count[model] = 1
123
- if author in model_author_count:
124
- model_author_count[author] += 1
125
- else:
126
- model_author_count[author] = 1
127
  model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
128
- fig8 = px.bar(model_author_count.sort_values('Number of Spaces', ascending=False).head(20), x='Model Author', y='Number of Spaces', title='Most Popular Model Authors', labels={'Model': 'Model', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
 
 
 
 
 
 
 
 
 
129
  gr.Plot(fig8)
130
  model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
131
  # then make a bar chart
132
- fig6 = px.bar(model_count.sort_values('Number of Spaces', ascending=False).head(20), x='Model', y='Number of Spaces', title='Most Used Models', labels={'Model': 'Model', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
 
 
 
 
 
 
 
133
  gr.Plot(fig6)
134
 
135
- datasets = np.concatenate([arr for arr in df['datasets'].values if arr is not None])
136
- dataset_count = {}
137
- dataset_author_count = {}
138
- for dataset in datasets:
139
- author = dataset.split('/')[0]
140
- if dataset in dataset_count:
141
- dataset_count[dataset] += 1
142
- else:
143
- dataset_count[dataset] = 1
144
- if author in dataset_author_count:
145
- dataset_author_count[author] += 1
146
- else:
147
- dataset_author_count[author] = 1
148
  dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
149
  dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
150
- fig9 = px.bar(dataset_author_count.sort_values('Number of Spaces', ascending=False).head(20), x='Dataset Author', y='Number of Spaces', title='Most Popular Dataset Authors', labels={'Dataset Author': 'Dataset Author', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
 
 
 
 
 
 
 
 
 
 
 
 
151
  gr.Plot(fig9)
152
  # then make a bar chart
153
- fig7 = px.bar(dataset_count.sort_values('Number of Spaces', ascending=False).head(20), x='Datasets', y='Number of Spaces', title='Most Used Datasets', labels={'Datasets': 'Datasets', 'Number of Spaces': 'Number of Spaces'}, template='plotly_dark')
154
- gr.Plot(fig7)
155
-
156
- # Get the most duplicated spaces
157
- duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
158
- duplicated_spaces.columns = ['Space', 'Number of Duplicates']
159
- gr.DataFrame(duplicated_spaces)
160
-
161
- # Get the most duplicated spaces
162
- liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
163
- liked_spaces.columns = ['Space', 'Number of Likes']
164
- gr.DataFrame(liked_spaces)
165
-
166
- # Get the spaces with the longest READMEs
167
- readme_sizes = df[['id', 'readme_size']].sort_values(by='readme_size', ascending=False).head(20)
168
- readme_sizes.columns = ['Space', 'Longest READMEs']
169
- gr.DataFrame(readme_sizes)
170
-
171
- with gr.Tab(label="Spaces Search"):
172
- df = pd.read_parquet("spaces.parquet")
173
- df = df[df["stage"] == "RUNNING"]
174
- # combine the sdk and tags columns, one of which is a string and the other is an array of strings
175
- # first convert the sdk column to an array of strings
176
- df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
177
- df["licenses"] = df["license"].apply(
178
- lambda x: np.array([str(x)]) if x is None else x
179
- )
180
- # then combine the sdk and tags columns so that their elements are together
181
- df["sdk_tags"] = df[["sdk", "tags"]].apply(
182
- lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
183
  )
 
184
 
185
- df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])
186
-
187
- # where the custom_domains column is not null, use that as the url, otherwise, use the host column
188
- df["url"] = np.where(
189
- df["custom_domains"].isnull(),
190
- df["id"],
191
- df["custom_domains"],
192
- )
193
- df["url"] = df[["url", "emoji"]].apply(
194
- lambda x: (
195
- f"<a target='_blank' href=https://huggingface.co/spaces/{x.iloc[0]}>{str(x.iloc[1]) + " " + x.iloc[0]}</a>"
196
- if x.iloc[0] is not None and "/" in x.iloc[0]
197
- else f"<a target='_blank' href=https://{x.iloc[0][0]}>{str(x.iloc[1]) + " " + x.iloc[0][0]}</a>"
198
- ),
199
- axis=1,
200
- )
201
 
202
- # Make all of this human readable
203
- df["r_models"] = [', '.join(models) if models is not None else '' for models in df["models"]]
204
- df["r_sdk_tags"] = [', '.join(sdk_tags) if sdk_tags is not None else '' for sdk_tags in df["sdk_tags"]]
205
- df["r_datasets"] = [', '.join(datasets) if datasets is not None else '' for datasets in df["datasets"]]
206
- df["r_licenses"] = [', '.join(licenses) if licenses is not None else '' for licenses in df["licenses"]]
207
-
208
-
209
- emoji = gr.Dropdown(
210
- df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
211
- ) # Dropdown to select the emoji
212
- likes = gr.Slider(
213
- minimum=df["likes"].min(),
214
- maximum=df["likes"].max(),
215
- step=1,
216
- label="Filter by Likes",
217
- ) # Slider to filter by likes
218
- hardware = gr.Dropdown(
219
- df["hardware"].unique().tolist(), label="Search by Hardware", multiselect=True
220
- )
221
- author = gr.Dropdown(
222
- df["author"].unique().tolist(), label="Search by Author", multiselect=True
223
- )
 
 
 
224
 
225
 
226
- # get the list of unique strings in the sdk_tags column
227
- sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
228
- # create a dropdown for the sdk_tags
229
- sdk_tags = gr.Dropdown(
230
- sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
231
- )
232
- # create a gradio checkbox group for hardware
233
- hardware = gr.CheckboxGroup(
234
- df["hardware"].unique().tolist(), label="Filter by Hardware"
235
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- licenses = np.unique(np.concatenate(df["licenses"].values))
238
- space_license = gr.CheckboxGroup(licenses.tolist(), label="Filter by license")
239
 
240
- # If the models column is none make it an array of "none" so that things don't break
241
- models_column_to_list = df["models"].apply(
242
- lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
243
- )
244
- # Now, flatten all arrays into one list
245
- models_flattened = np.concatenate(models_column_to_list.values)
246
- # Get unique strings
247
- unique_models = np.unique(models_flattened)
248
- models = gr.Dropdown(
249
- unique_models.tolist(),
250
- label="Search by Model",
251
- multiselect=True,
252
- )
253
-
254
- # Do the same for datasets that we did for models
255
- datasets_column_to_list = df["datasets"].apply(
256
- lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
257
- )
258
- flattened_datasets = np.concatenate(datasets_column_to_list.values)
259
- unique_datasets = np.unique(flattened_datasets)
260
- datasets = gr.Dropdown(
261
- unique_datasets.tolist(),
262
- label="Search by Dataset",
263
- multiselect=True,
264
- )
265
 
266
- devMode = gr.Checkbox(value=False, label="DevMode Enabled")
267
  clear = gr.ClearButton(components=[
268
  emoji,
269
  author,
 
2
  import pandas as pd
3
  import numpy as np
4
  import plotly.express as px
5
+ from datasets import load_dataset
6
+
7
def _space_link(target, emoji):
    """
    Build a clickable HTML anchor for a space, labelled with its emoji.

    Args:
        target: Either a space id (a string containing "/") or the row's
            custom_domains value, whose first element is used as the host
            (presumably a sequence of domain names; verify against the dataset).
        emoji (str): Emoji shown in front of the link text.

    Returns:
        str: An ``<a target='_blank' ...>`` element.
    """
    # The label is built with plain interpolation instead of nesting double
    # quotes inside a double-quoted f-string, which is a SyntaxError before
    # Python 3.12.
    if target is not None and "/" in target:
        return (
            f"<a target='_blank' href=https://huggingface.co/spaces/{target}>"
            f"{emoji} {target}</a>"
        )
    host = target[0]
    return f"<a target='_blank' href=https://{host}>{emoji} {host}</a>"


def load_transform_data():
    """
    Load the 'jsulz/space-stats' dataset and derive the columns the app displays.

    The 'train' split is converted to a pandas DataFrame and the following
    columns are added or rewritten:
        - "sdk": wrapped into a one-element array of strings.
        - "licenses": copy of "license", with None replaced by array(["None"]).
        - "sdk_tags": "sdk" and "tags" concatenated per row.
        - "emoji": missing values replaced by an empty string.
        - "url": HTML anchor pointing at the custom domain when one is set,
          otherwise at the space page on huggingface.co.
        - "r_models", "r_sdk_tags", "r_datasets", "r_licenses": comma-separated
          strings of the corresponding array columns (empty string for None).

    Returns:
        pandas.DataFrame: Transformed dataframe.
    """
    spaces_dataset = "jsulz/space-stats"
    dataset = load_dataset(spaces_dataset)
    df = dataset["train"].to_pandas()

    # "sdk" is a single string and "tags" is an array of strings; wrap the
    # former so both can be concatenated into one array per row.
    df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
    df["licenses"] = df["license"].apply(
        lambda x: np.array([str(x)]) if x is None else x
    )
    # Rows without tags (a scalar such as None) keep just the sdk entry;
    # np.concatenate would raise on a zero-dimensional operand.
    df["sdk_tags"] = df[["sdk", "tags"]].apply(
        lambda row: (
            row.iloc[0]
            if np.ndim(row.iloc[1]) == 0
            else np.concatenate((row.iloc[0], row.iloc[1]))
        ),
        axis=1,
    )

    # Fill the NaN values with an empty string
    df["emoji"] = df["emoji"].fillna("")

    # Where the custom_domains column is not null, use that as the url;
    # otherwise, use the id column.
    df["url"] = np.where(
        df["custom_domains"].isnull(),
        df["id"],
        df["custom_domains"],
    )

    # Build up a pretty url that's clickable with the emoji
    df["url"] = [
        _space_link(target, emoji) for target, emoji in zip(df["url"], df["emoji"])
    ]

    # Prep the models, sdk_tags, datasets, and licenses columns for display
    df["r_models"] = [
        ", ".join(models) if models is not None else "" for models in df["models"]
    ]
    df["r_sdk_tags"] = [
        ", ".join(sdk_tags) if sdk_tags is not None else ""
        for sdk_tags in df["sdk_tags"]
    ]
    df["r_datasets"] = [
        ", ".join(datasets) if datasets is not None else ""
        for datasets in df["datasets"]
    ]
    df["r_licenses"] = [
        ", ".join(licenses) if licenses is not None else ""
        for licenses in df["licenses"]
    ]
    return df
64
+
65
+
66
+ def filtered_df(
67
+ filtered_emojis,
68
+ filtered_likes,
69
+ filtered_author,
70
+ filtered_hardware,
71
+ filtered_tags,
72
+ filtered_models,
73
+ filtered_datasets,
74
+ space_licenses,
75
+ ):
76
+ """
77
+ Filter the dataframe based on the given criteria.
78
+
79
+ Args:
80
+ filtered_emojis (list): List of emojis to filter the dataframe by.
81
+ filtered_likes (int): Minimum number of likes to filter the dataframe by.
82
+ filtered_author (list): List of authors to filter the dataframe by.
83
+ filtered_hardware (list): List of hardware to filter the dataframe by.
84
+ filtered_tags (list): List of tags to filter the dataframe by.
85
+ filtered_models (list): List of models to filter the dataframe by.
86
+ filtered_datasets (list): List of datasets to filter the dataframe by.
87
+ space_licenses (list): List of licenses to filter the dataframe by.
88
+
89
+ Returns:
90
+ pandas.DataFrame: Filtered dataframe with the following columns: "URL", "Likes", "Models", "Datasets", "Licenses".
91
+ """
92
  _df = df
93
+ if filtered_emojis:
94
+ _df = _df[_df["emoji"].isin(filtered_emojis)]
95
+ if filtered_likes:
96
+ _df = _df[_df["likes"] >= filtered_likes]
97
+ if filtered_author:
98
+ _df = _df[_df["author"].isin(filtered_author)]
99
+ if filtered_hardware:
100
+ _df = _df[_df["hardware"].isin(filtered_hardware)]
101
+ if filtered_tags:
102
+ _df = _df[
103
+ _df["sdk_tags"].apply(lambda x: any(tag in x for tag in filtered_tags))
104
+ ]
105
+ if filtered_models:
 
106
  _df = _df[
107
  _df["models"].apply(
108
  lambda x: (
109
+ any(model in x for model in filtered_models)
110
+ if x is not None
111
+ else False
112
  )
113
  )
114
  ]
115
+ if filtered_datasets:
116
  _df = _df[
117
  _df["datasets"].apply(
118
  lambda x: (
119
+ any(dataset in x for dataset in filtered_datasets)
120
  if x is not None
121
  else False
122
  )
 
136
  # rename the columns names to make them more readable
137
  _df = _df.rename(
138
  columns={
139
+ "url": "URL",
140
+ "likes": "Likes",
141
  "r_models": "Models",
142
  "r_datasets": "Datasets",
143
  "r_licenses": "Licenses",
144
  }
145
  )
146
 
147
+ return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
148
+
149
+
150
def count_items(items):
    """
    Count the occurrences of items and of item authors in a column of arrays.

    Each non-None entry of the column is an array of item names. The author
    of an item is the part of its name before the first "/"; a name without
    "/" counts as its own author.

    Args:
        items (pandas.Series): A dataframe column whose entries are arrays of
            item names, or None.

    Returns:
        tuple: A tuple containing two dictionaries. The first maps each item
            to its number of occurrences, the second maps each author to the
            number of item occurrences attributed to them. Both are empty
            when the column has no non-None entries.
    """
    from collections import Counter

    arrays = [arr for arr in items.values if arr is not None]
    # np.concatenate raises ValueError on an empty sequence, so bail out early.
    if not arrays:
        return {}, {}

    flat = np.concatenate(arrays)
    item_count = Counter(flat)
    item_author_count = Counter(item.split("/")[0] for item in flat)

    # Return plain dicts, as callers feed `.items()` into pd.DataFrame.
    return dict(item_count), dict(item_author_count)
174
+
175
def flatten_column(_df, column):
    """
    Flatten a column of arrays into a sorted list of its unique values.

    Scalar entries (for example None) are replaced by the placeholder
    string "None" so that rows without a value still contribute an option.

    Args:
        _df (pandas.DataFrame): The DataFrame containing the column.
        column (str): The name of the column to flatten.

    Returns:
        list: A sorted list of unique values from the flattened column, or an
            empty list when the DataFrame has no rows.
    """
    arrays = [
        np.array(["None"]) if np.ndim(entry) == 0 else entry
        for entry in _df[column]
    ]
    # np.concatenate raises ValueError on an empty sequence (no rows).
    if not arrays:
        return []
    return np.unique(np.concatenate(arrays)).tolist()
192
 
193
 
194
  with gr.Blocks(fill_width=True) as demo:
195
+ df = load_transform_data()
196
  with gr.Tab(label="Spaces Overview"):
197
 
198
+ # The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
199
  # The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date .
 
200
  df = df.sort_values("created_at")
201
  df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
202
+ fig1 = px.line(
203
+ df,
204
+ x="created_at",
205
+ y="cumulative_spaces",
206
+ title="Growth of Spaces Over Time",
207
+ labels={"created_at": "Date", "cumulative_spaces": "Number of Spaces"},
208
+ template="plotly_dark",
209
+ )
210
  gr.Plot(fig1)
211
 
212
+ with gr.Row():
213
+ # Create a pie chart showing the distribution of spaces by SDK
214
+ fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
215
+ gr.Plot(fig2)
 
 
 
 
216
 
217
+ # create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
218
+ emoji_counts = df['emoji'].value_counts().head(10).reset_index()
219
+ fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
220
+ gr.Plot(fig3)
221
 
222
  # Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
223
  author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
224
+ fig4 = px.scatter(
225
+ author_likes,
226
+ x="id",
227
+ y="likes",
228
+ title="Relationship between Number of Spaces Created and Number of Likes",
229
+ labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
230
+ hover_data={"author": True},
231
+ template="plotly_dark",
232
+ )
233
  gr.Plot(fig4)
234
 
235
  # Create a scatter plot showing, per emoji, the relationship between total likes and the number of spaces using that emoji (top 20 emojis by likes)
236
  emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
237
+ fig10 = px.scatter(
238
+ emoji_likes,
239
+ x="id",
240
+ y="likes",
241
+ title="Relationship between Emoji and Number of Likes",
242
+ labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
243
+ hover_data={"emoji": True},
244
+ template="plotly_dark",
245
+ )
246
  gr.Plot(fig10)
247
 
248
  # Create a bar chart of hardware in use
249
  hardware = df['hardware'].value_counts().reset_index()
250
  hardware.columns = ['Hardware', 'Number of Spaces']
251
+ fig5 = px.bar(
252
+ hardware,
253
+ x="Hardware",
254
+ y="Number of Spaces",
255
+ title="Hardware in Use",
256
+ labels={
257
+ "Hardware": "Hardware",
258
+ "Number of Spaces": "Number of Spaces (log scale)",
259
+ },
260
+ color="Hardware",
261
+ template="plotly_dark",
262
+ )
263
+ fig5.update_layout(yaxis_type="log")
264
  gr.Plot(fig5)
265
 
266
+ model_count, model_author_count = count_items(df['models'])
 
 
 
 
 
 
 
 
 
 
 
 
267
  model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
268
+ fig8 = px.bar(
269
+ model_author_count.sort_values("Number of Spaces", ascending=False).head(
270
+ 20
271
+ ),
272
+ x="Model Author",
273
+ y="Number of Spaces",
274
+ title="Most Popular Model Authors",
275
+ labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
276
+ template="plotly_dark",
277
+ )
278
  gr.Plot(fig8)
279
  model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
280
  # then make a bar chart
281
+ fig6 = px.bar(
282
+ model_count.sort_values("Number of Spaces", ascending=False).head(20),
283
+ x="Model",
284
+ y="Number of Spaces",
285
+ title="Most Used Models",
286
+ labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
287
+ template="plotly_dark",
288
+ )
289
  gr.Plot(fig6)
290
 
291
+ dataset_count, dataset_author_count = count_items(df['datasets'])
 
 
 
 
 
 
 
 
 
 
 
 
292
  dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
293
  dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
294
+ fig9 = px.bar(
295
+ dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
296
+ 20
297
+ ),
298
+ x="Dataset Author",
299
+ y="Number of Spaces",
300
+ title="Most Popular Dataset Authors",
301
+ labels={
302
+ "Dataset Author": "Dataset Author",
303
+ "Number of Spaces": "Number of Spaces",
304
+ },
305
+ template="plotly_dark",
306
+ )
307
  gr.Plot(fig9)
308
  # then make a bar chart
309
+ fig7 = px.bar(
310
+ dataset_count.sort_values("Number of Spaces", ascending=False).head(20),
311
+ x="Datasets",
312
+ y="Number of Spaces",
313
+ title="Most Used Datasets",
314
+ labels={"Datasets": "Datasets", "Number of Spaces": "Number of Spaces"},
315
+ template="plotly_dark",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  )
317
+ gr.Plot(fig7)
318
 
319
+ with gr.Row():
320
+ # Get the most duplicated spaces
321
+ duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
322
+ duplicated_spaces["duplicated_from"] = duplicated_spaces[
323
+ "duplicated_from"
324
+ ].apply(
325
+ lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
326
+ )
327
+ duplicated_spaces.columns = ["Space", "Number of Duplicates"]
328
+ gr.DataFrame(duplicated_spaces, datatype="html" )
 
 
 
 
 
 
329
 
330
+ # Get the most liked spaces
331
+ liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
332
+ liked_spaces["id"] = liked_spaces["id"].apply(
333
+ lambda x: f"<a target='_blank' href=https://huggingface.co/spaces/{x}>{x}</a>"
334
+ )
335
+ liked_spaces.columns = ['Space', 'Number of Likes']
336
+ gr.DataFrame(liked_spaces, datatype="html")
337
+
338
+ with gr.Row():
339
+ # Create a dataframe with the top 20 authors and the number of spaces they have created
340
+ author_counts = df['author'].value_counts().head(20).reset_index()
341
+ author_counts["author"] = author_counts["author"].apply(
342
+ lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
343
+ )
344
+ author_counts.columns = ["Author", "Number of Spaces"]
345
+ gr.DataFrame(author_counts, datatype="html")
346
+
347
+ # create a dataframe where we groupby author and sum their likes
348
+ author_likes = df.groupby('author').agg({'likes': 'sum'}).reset_index()
349
+ author_likes = author_likes.sort_values(by='likes', ascending=False).head(20)
350
+ author_likes["author"] = author_likes["author"].apply(
351
+ lambda x: f"<a target='_blank' href=https://huggingface.co/{x}>{x}</a>"
352
+ )
353
+ author_likes.columns = ["Author", "Number of Likes"]
354
+ gr.DataFrame(author_likes, datatype="html")
355
 
356
 
357
+ with gr.Tab(label="Spaces Search"):
358
+ df = df[df['stage'] == 'RUNNING']
359
+
360
+ # Layout
361
+ with gr.Row():
362
+ emoji = gr.Dropdown(
363
+ df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
364
+ ) # Dropdown to select the emoji
365
+ likes = gr.Slider(
366
+ minimum=df["likes"].min(),
367
+ maximum=df["likes"].max(),
368
+ step=1,
369
+ label="Filter by Likes",
370
+ ) # Slider to filter by likes
371
+ with gr.Row():
372
+ author = gr.Dropdown(
373
+ df["author"].unique().tolist(), label="Search by Author", multiselect=True
374
+ )
375
+ # get the list of unique strings in the sdk_tags column
376
+ sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
377
+ # create a dropdown for the sdk_tags
378
+ sdk_tags = gr.Dropdown(
379
+ sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
380
+ )
381
+ with gr.Row():
382
+ # create a gradio checkbox group for hardware
383
+ hardware = gr.CheckboxGroup(
384
+ df["hardware"].unique().tolist(), label="Filter by Hardware"
385
+ )
386
 
387
+ licenses = np.unique(np.concatenate(df["licenses"].values))
388
+ space_license = gr.Dropdown(licenses.tolist(), label="Filter by license")
389
 
390
+ with gr.Row():
391
+ models = gr.Dropdown(
392
+ flatten_column(df, "models"),
393
+ label="Search by Model",
394
+ multiselect=True,
395
+ )
396
+ datasets = gr.Dropdown(
397
+ flatten_column(df, "datasets"),
398
+ label="Search by Dataset",
399
+ multiselect=True,
400
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
 
402
  clear = gr.ClearButton(components=[
403
  emoji,
404
  author,