import gradio as gr
import pandas as pd
import numpy as np
import plotly.express as px
from datasets import load_dataset
def load_transform_data():
"""
Load and transform data from a parquet file.
Returns:
pandas.DataFrame: Transformed dataframe.
"""
spaces_dataset = 'jsulz/space-stats'
dataset = load_dataset(spaces_dataset)
df = dataset['train'].to_pandas()
# combine the sdk and tags columns, one of which is a string and the other is an array of strings
df["sdk"] = df["sdk"].apply(lambda x: np.array([str(x)]))
df["licenses"] = df["license"].apply(
lambda x: np.array([str(x)]) if x is None else x
)
# then combine the sdk and tags columns so that their elements are together
df["sdk_tags"] = df[["sdk", "tags"]].apply(
lambda x: np.concatenate((x.iloc[0], x.iloc[1])), axis=1
)
# Fill the NaN values with an empty string
df['emoji'] = np.where(df['emoji'].isnull(), '', df['emoji'])
# where the custom_domains column is not null, use that as the url, otherwise, use the host column
df["url"] = np.where(
df["custom_domains"].isnull(),
df["id"],
df["custom_domains"],
)
# Build up a pretty url that's clickable with the emoji
df["url"] = df[["url", "emoji"]].apply(
lambda x: (
f"{str(x.iloc[1]) + " " + x.iloc[0]}"
if x.iloc[0] is not None and "/" in x.iloc[0]
else f"{str(x.iloc[1]) + " " + x.iloc[0][0]}"
),
axis=1,
)
# Prep the models, datasets, and licenses columns for display
df["r_models"] = [
", ".join(models) if models is not None else "" for models in df["models"]
]
df["r_sdk_tags"] = [
", ".join(sdk_tags) if sdk_tags is not None else ""
for sdk_tags in df["sdk_tags"]
]
df["r_datasets"] = [
", ".join(datasets) if datasets is not None else ""
for datasets in df["datasets"]
]
df["r_licenses"] = [
", ".join(licenses) if licenses is not None else ""
for licenses in df["licenses"]
]
return df
def filtered_df(
filtered_emojis,
filtered_likes,
filtered_author,
filtered_hardware,
filtered_tags,
filtered_models,
filtered_datasets,
space_licenses,
):
"""
Filter the dataframe based on the given criteria.
Args:
filtered_emojis (list): List of emojis to filter the dataframe by.
filtered_likes (int): Minimum number of likes to filter the dataframe by.
filtered_author (list): List of authors to filter the dataframe by.
filtered_hardware (list): List of hardware to filter the dataframe by.
filtered_tags (list): List of tags to filter the dataframe by.
filtered_models (list): List of models to filter the dataframe by.
filtered_datasets (list): List of datasets to filter the dataframe by.
space_licenses (list): List of licenses to filter the dataframe by.
Returns:
pandas.DataFrame: Filtered dataframe with the following columns: "URL", "Likes", "Models", "Datasets", "Licenses".
"""
_df = df
if filtered_emojis:
_df = _df[_df["emoji"].isin(filtered_emojis)]
if filtered_likes:
_df = _df[_df["likes"] >= filtered_likes]
if filtered_author:
_df = _df[_df["author"].isin(filtered_author)]
if filtered_hardware:
_df = _df[_df["hardware"].isin(filtered_hardware)]
if filtered_tags:
_df = _df[
_df["sdk_tags"].apply(lambda x: any(tag in x for tag in filtered_tags))
]
if filtered_models:
_df = _df[
_df["models"].apply(
lambda x: (
any(model in x for model in filtered_models)
if x is not None
else False
)
)
]
if filtered_datasets:
_df = _df[
_df["datasets"].apply(
lambda x: (
any(dataset in x for dataset in filtered_datasets)
if x is not None
else False
)
)
]
if space_licenses:
_df = _df[
_df["licenses"].apply(
lambda x: (
any(space_license in x for space_license in space_licenses)
if x is not None
else False
)
)
]
# rename the columns names to make them more readable
_df = _df.rename(
columns={
"url": "URL",
"likes": "Likes",
"r_models": "Models",
"r_datasets": "Datasets",
"r_licenses": "Licenses",
}
)
return _df[["URL", "Likes", "Models", "Datasets", "Licenses"]]
def count_items(items):
"""
Count the occurrences of items and authors in a given list of items.
Parameters:
items (dataframe column): A dataframe column containing a list of items.
Returns:
tuple: A tuple containing two dictionaries. The first dictionary contains the count of each item,
and the second dictionary contains the count of each author.
"""
items = np.concatenate([arr for arr in items.values if arr is not None])
item_count = {}
item_author_count = {}
for item in items:
if item in item_count:
item_count[item] += 1
else:
item_count[item] = 1
author = item.split('/')[0]
if author in item_author_count:
item_author_count[author] += 1
else:
item_author_count[author] = 1
return item_count, item_author_count
def flatten_column(_df, column):
"""
Flattens a column in a DataFrame.
Args:
_df (pandas.DataFrame): The DataFrame containing the column.
column (str): The name of the column to flatten.
Returns:
list: A list of unique values from the flattened column.
"""
column_to_list = _df[column].apply(
lambda x: np.array(["None"]) if np.ndim(x) == 0 else x
)
flattened = np.concatenate(column_to_list.values)
uniques = np.unique(flattened)
return uniques.tolist()
with gr.Blocks(fill_width=True) as demo:
df = load_transform_data()
with gr.Tab(label="Spaces Overview"):
# The Pandas dataframe has a datetime column. Plot the growth of spaces (row entries) over time.
# The x-axis should be the date and the y-axis should be the cumulative number of spaces created up to that date .
df = df.sort_values("created_at")
df['cumulative_spaces'] = df['created_at'].rank(method='first').astype(int)
fig1 = px.line(
df,
x="created_at",
y="cumulative_spaces",
title="Growth of Spaces Over Time",
labels={"created_at": "Date", "cumulative_spaces": "Number of Spaces"},
template="plotly_dark",
)
gr.Plot(fig1)
with gr.Row():
# Create a pie charge showing the distribution of spaces by SDK
fig2 = px.pie(df, names='sdk', title='Distribution of Spaces by SDK', template='plotly_dark')
gr.Plot(fig2)
# create a pie chart showing the distribution of spaces by emoji for the top 10 used emojis
emoji_counts = df['emoji'].value_counts().head(10).reset_index()
fig3 = px.pie(emoji_counts, names='emoji', values='count', title='Distribution of Spaces by Emoji', template='plotly_dark')
gr.Plot(fig3)
# Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
author_likes = df.groupby('author').agg({'likes': 'sum', 'id': 'count'}).reset_index()
fig4 = px.scatter(
author_likes,
x="id",
y="likes",
title="Relationship between Number of Spaces Created and Number of Likes",
labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
hover_data={"author": True},
template="plotly_dark",
)
gr.Plot(fig4)
# Create a scatter plot showing the relationship between the number of likes and the number of spaces created by an author
emoji_likes = df.groupby('emoji').agg({'likes': 'sum', 'id': 'count'}).sort_values(by='likes', ascending=False).head(20).reset_index()
fig10 = px.scatter(
emoji_likes,
x="id",
y="likes",
title="Relationship between Emoji and Number of Likes",
labels={"id": "Number of Spaces Created", "likes": "Number of Likes"},
hover_data={"emoji": True},
template="plotly_dark",
)
gr.Plot(fig10)
# Create a bar chart of hardware in use
hardware = df['hardware'].value_counts().reset_index()
hardware.columns = ['Hardware', 'Number of Spaces']
fig5 = px.bar(
hardware,
x="Hardware",
y="Number of Spaces",
title="Hardware in Use",
labels={
"Hardware": "Hardware",
"Number of Spaces": "Number of Spaces (log scale)",
},
color="Hardware",
template="plotly_dark",
)
fig5.update_layout(yaxis_type="log")
gr.Plot(fig5)
model_count, model_author_count = count_items(df['models'])
model_author_count = pd.DataFrame(model_author_count.items(), columns=['Model Author', 'Number of Spaces'])
fig8 = px.bar(
model_author_count.sort_values("Number of Spaces", ascending=False).head(
20
),
x="Model Author",
y="Number of Spaces",
title="Most Popular Model Authors",
labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
template="plotly_dark",
)
gr.Plot(fig8)
model_count = pd.DataFrame(model_count.items(), columns=['Model', 'Number of Spaces'])
# then make a bar chart
fig6 = px.bar(
model_count.sort_values("Number of Spaces", ascending=False).head(20),
x="Model",
y="Number of Spaces",
title="Most Used Models",
labels={"Model": "Model", "Number of Spaces": "Number of Spaces"},
template="plotly_dark",
)
gr.Plot(fig6)
dataset_count, dataset_author_count = count_items(df['datasets'])
dataset_count = pd.DataFrame(dataset_count.items(), columns=['Datasets', 'Number of Spaces'])
dataset_author_count = pd.DataFrame(dataset_author_count.items(), columns=['Dataset Author', 'Number of Spaces'])
fig9 = px.bar(
dataset_author_count.sort_values("Number of Spaces", ascending=False).head(
20
),
x="Dataset Author",
y="Number of Spaces",
title="Most Popular Dataset Authors",
labels={
"Dataset Author": "Dataset Author",
"Number of Spaces": "Number of Spaces",
},
template="plotly_dark",
)
gr.Plot(fig9)
# then make a bar chart
fig7 = px.bar(
dataset_count.sort_values("Number of Spaces", ascending=False).head(20),
x="Datasets",
y="Number of Spaces",
title="Most Used Datasets",
labels={"Datasets": "Datasets", "Number of Spaces": "Number of Spaces"},
template="plotly_dark",
)
gr.Plot(fig7)
with gr.Row():
# Get the most duplicated spaces
duplicated_spaces = df['duplicated_from'].value_counts().head(20).reset_index()
duplicated_spaces["duplicated_from"] = duplicated_spaces[
"duplicated_from"
].apply(
lambda x: f"{x}"
)
duplicated_spaces.columns = ["Space", "Number of Duplicates"]
gr.DataFrame(duplicated_spaces, datatype="html" )
# Get the most liked spaces
liked_spaces = df[['id', 'likes']].sort_values(by='likes', ascending=False).head(20)
liked_spaces["id"] = liked_spaces["id"].apply(
lambda x: f"{x}"
)
liked_spaces.columns = ['Space', 'Number of Likes']
gr.DataFrame(liked_spaces, datatype="html")
with gr.Row():
# Create a dataframe with the top 10 authors and the number of spaces they have created
author_counts = df['author'].value_counts().head(20).reset_index()
author_counts["author"] = author_counts["author"].apply(
lambda x: f"{x}"
)
author_counts.columns = ["Author", "Number of Spaces"]
gr.DataFrame(author_counts, datatype="html")
# create a dataframe where we groupby author and sum their likes
author_likes = df.groupby('author').agg({'likes': 'sum'}).reset_index()
author_likes = author_likes.sort_values(by='likes', ascending=False).head(20)
author_likes["author"] = author_likes["author"].apply(
lambda x: f"{x}"
)
author_likes.columns = ["Author", "Number of Likes"]
gr.DataFrame(author_likes, datatype="html")
with gr.Tab(label="Spaces Search"):
df = df[df['stage'] == 'RUNNING']
# Layout
with gr.Row():
emoji = gr.Dropdown(
df["emoji"].unique().tolist(), label="Search by Emoji 🤗", multiselect=True
) # Dropdown to select the emoji
likes = gr.Slider(
minimum=df["likes"].min(),
maximum=df["likes"].max(),
step=1,
label="Filter by Likes",
) # Slider to filter by likes
with gr.Row():
author = gr.Dropdown(
df["author"].unique().tolist(), label="Search by Author", multiselect=True
)
# get the list of unique strings in the sdk_tags column
sdk_tags = np.unique(np.concatenate(df["sdk_tags"].values))
# create a dropdown for the sdk_tags
sdk_tags = gr.Dropdown(
sdk_tags.tolist(), label="Filter by SDK/Tags", multiselect=True
)
with gr.Row():
# create a gradio checkbox group for hardware
hardware = gr.CheckboxGroup(
df["hardware"].unique().tolist(), label="Filter by Hardware"
)
licenses = np.unique(np.concatenate(df["licenses"].values))
space_license = gr.Dropdown(licenses.tolist(), label="Filter by license")
with gr.Row():
models = gr.Dropdown(
flatten_column(df, "models"),
label="Search by Model",
multiselect=True,
)
datasets = gr.Dropdown(
flatten_column(df, "datasets"),
label="Search by Dataset",
multiselect=True,
)
clear = gr.ClearButton(components=[
emoji,
author,
hardware,
sdk_tags,
models,
datasets,
space_license
])
df = pd.DataFrame(
df[
[
"id",
"emoji",
"author",
"url",
"likes",
"hardware",
"sdk_tags",
"models",
"datasets",
"licenses",
"r_sdk_tags",
"r_models",
"r_datasets",
"r_licenses",
]
]
)
gr.DataFrame(
filtered_df,
inputs=[
emoji,
likes,
author,
hardware,
sdk_tags,
models,
datasets,
space_license,
],
datatype="html",
wrap=True,
column_widths=["25%", "5%", "25%", "25%", "20%"]
)
demo.launch()