import gradio as gr
import pandas as pd
from plotly import graph_objects as go
import plotly.io as pio
import plotly.express as px
# TODO: Add a custom template to the Plotly figures (the draft below is disabled by wrapping it in a string literal).
"""
pio.templates["custom"] = go.layout.Template()
pio.templates["custom"].layout = dict(
plot_bgcolor="#bde5ec", paper_bgcolor="#bbd5da"
)
# Set the default theme to "plotly_dark"
pio.templates.default = "custom"
"""
def process_dataset():
    """
    Load the LFS analysis tables from the Hugging Face Hub and tidy them.

    Steps performed:
    1. Read five parquet files: file counts and sizes, repositories by size,
       repositories by size after file dedupe, file extensions, and file
       extensions by month.
    2. Drop every row of the by-month extension table that contains a missing
       value.
    3. Capitalize the 'type' column of the file counts and sizes table, give
       its columns display names, drop the 'Number of Files' column, and sort
       by 'Total Size (PBs)' in descending order. No unit conversion is done
       here; only the column label changes.
    4. Drop rows of the file extensions table whose 'extension' is missing.

    Returns:
        tuple: (repo_by_size_df, unique_files_df, file_counts_and_sizes,
        file_extensions, file_extensions_by_month), in that order.
    """
    sizes_by_type = pd.read_parquet(
        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_counts_and_sizes.parquet"
    )
    repo_by_size_df = pd.read_parquet(
        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size.parquet"
    )
    unique_files_df = pd.read_parquet(
        "hf://datasets/xet-team/lfs-analysis-data/transformed/repo_by_size_file_dedupe.parquet"
    )
    extensions = pd.read_parquet(
        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions.parquet"
    )
    # The by-month table is only useful when every field is present.
    extensions_by_month = pd.read_parquet(
        "hf://datasets/xet-team/lfs-analysis-data/transformed/file_extensions_by_month.parquet"
    ).dropna()

    # Display-friendly repository type names ("model" -> "Model").
    sizes_by_type["type"] = sizes_by_type["type"].str.capitalize()
    display_names = {
        "type": "Repository Type",
        "num_files": "Number of Files",
        "total_size": "Total Size (PBs)",
    }
    sizes_by_type = (
        sizes_by_type.rename(columns=display_names)
        .drop(columns=["Number of Files"])
        # Largest repository type first.
        .sort_values(by="Total Size (PBs)", ascending=False)
    )

    # Rows without an extension cannot be grouped or plotted.
    extensions = extensions.dropna(subset=["extension"])

    return (
        repo_by_size_df,
        unique_files_df,
        sizes_by_type,
        extensions,
        extensions_by_month,
    )
def cumulative_growth_df(_df):
    """
    Build a running total of 'totalsize' per 'type', indexed by 'date'.

    Args:
        _df (DataFrame): Rows with 'date', 'type' and 'totalsize' columns.

    Returns:
        DataFrame: One row per date and one column per type. Each cell holds
        the cumulative sum of 'totalsize' up to and including that date;
        date/type combinations with no rows contribute 0.
    """
    ordered = _df.sort_values(by="date")
    # Sum sizes per (date, type); combinations with no rows become 0 so the
    # running total simply carries forward.
    per_date = ordered.pivot_table(
        index="date", columns="type", values="totalsize", aggfunc="sum"
    )
    return per_date.fillna(0).cumsum()
def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
    """
    Tabulate row-over-row growth for the last ten rows of the cumulative tables.

    Args:
        _cumulative_df (DataFrame): Cumulative sizes, one column per type,
            indexed by date (the index must support ``.dt.strftime``).
        _cumulative_df_compressed (DataFrame): The same layout after
            file-level deduplication.

    Returns:
        DataFrame: Columns 'Date' (formatted as YYYY-MM),
        'Month-to-Month Growth (PBs)',
        'Growth with File-Level Deduplication (PBs)' and
        'Dedupe Savings (PBs)'. The first of the ten rows is omitted because
        it has no previous row to diff against.
    """
    last_10_months = _cumulative_df.tail(10).copy()
    # Total across all type columns, then the change from the previous row.
    last_10_months["total"] = last_10_months.sum(axis=1)
    last_10_months["total_change"] = last_10_months["total"].diff()
    # Assigned as a Series, so values line up on the shared date index.
    last_10_months["compressed_change"] = (
        _cumulative_df_compressed.tail(10).sum(axis=1).diff()
    )
    last_10_months["savings"] = (
        last_10_months["total_change"] - last_10_months["compressed_change"]
    )
    # format_dataframe_size_column is defined elsewhere in this file;
    # presumably it converts the listed columns to PBs -- TODO confirm.
    last_10_months = format_dataframe_size_column(
        last_10_months, ["total_change", "compressed_change", "savings"]
    )
    # Prettify the date so it does not show a 00:00:00 time component.
    last_10_months["date"] = _cumulative_df.tail(10).index
    last_10_months["date"] = last_10_months["date"].dt.strftime("%Y-%m")
    # Skip the first row (its diffs are NaN). Positional slicing also copes
    # with an empty table, where looking up index[0] would raise.
    last_10_months = last_10_months.iloc[1:]
    # Selecting the report columns discards every per-type column, whatever
    # types are present, so no hard-coded drop of model/space/dataset is
    # needed (that drop raised KeyError when a type was missing).
    last_10_months = last_10_months[
        ["date", "total_change", "compressed_change", "savings"]
    ]
    return last_10_months.rename(
        columns={
            "date": "Date",
            "total_change": "Month-to-Month Growth (PBs)",
            "compressed_change": "Growth with File-Level Deduplication (PBs)",
            "savings": "Dedupe Savings (PBs)",
        }
    )
def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
    """
    Add deduplicated size, dedupe savings and a 'Total' row to the size table.

    Args:
        repo_sizes (DataFrame): Table with a 'Repository Type' column holding
            capitalized type names, plus numeric size columns. It is not
            modified; a copy is returned.
        cumulative_df (DataFrame): Cumulative sizes, one column per type. The
            last row is taken as the current size.
        cumulative_df_compressed (DataFrame): The same layout after file-level
            deduplication.

    Returns:
        DataFrame: A copy of ``repo_sizes`` with 'Compressed Size (PBs)' and
        'Dedupe Savings (PBs)' columns and an extra row labelled 'Total' that
        sums the numeric columns. Repository types with no matching column in
        ``cumulative_df`` keep NaN in the two new columns.
    """
    # Work on a copy so repeated calls never stack extra "Total" rows onto
    # the caller's frame.
    repo_sizes = repo_sizes.copy()
    # Seed with NaN (not "") so the columns stay numeric and unmatched rows
    # are skipped by the sum instead of breaking it.
    repo_sizes["Compressed Size (PBs)"] = float("nan")
    repo_sizes["Dedupe Savings (PBs)"] = float("nan")

    for column in cumulative_df.columns:
        cum_repo_size = cumulative_df[column].iloc[-1]
        comp_repo_size = cumulative_df_compressed[column].iloc[-1]
        # Column names are lowercase type names; the table uses capitalized ones.
        matches = repo_sizes["Repository Type"] == column.capitalize()
        repo_sizes.loc[matches, "Compressed Size (PBs)"] = comp_repo_size
        repo_sizes.loc[matches, "Dedupe Savings (PBs)"] = (
            cum_repo_size - comp_repo_size
        )

    # Append a row that sums the numeric columns only; the label column is
    # filled in explicitly rather than by concatenating the type names.
    totals = repo_sizes.sum(numeric_only=True)
    repo_sizes.loc["Total"] = totals
    repo_sizes.loc["Total", "Repository Type"] = "Total"
    return repo_sizes
def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
    """
    Plot cumulative growth per type, with and without file-level deduplication.

    Args:
        cumulative_df (DataFrame): Cumulative sizes indexed by date, one
            column per type. Values are divided by 1e15 for display as PBs.
        cumulative_df_compressed (DataFrame): The same layout after
            file-level deduplication.

    Returns:
        Figure: A Plotly figure with one solid line per column of
        ``cumulative_df`` followed by one dashed line per column of
        ``cumulative_df_compressed``.
    """
    fig = go.Figure()

    # Fixed colour per type so a type's two lines share a colour; unknown
    # types fall back to black.
    color_map = {
        "model": px.colors.qualitative.Alphabet[3],
        "space": px.colors.qualitative.Alphabet[2],
        "dataset": px.colors.qualitative.Alphabet[9],
    }

    # (frame, legend suffix, dash style); all raw traces are added first,
    # then the deduplicated ones.
    series_groups = (
        (cumulative_df, "", None),
        (cumulative_df_compressed, " (File-Level Deduplication)", "dash"),
    )
    for frame, suffix, dash in series_groups:
        for column in frame.columns:
            line = dict(color=color_map.get(column, "black"))
            if dash is not None:
                line["dash"] = dash
            fig.add_trace(
                go.Scatter(
                    x=frame.index,
                    y=frame[column] / 1e15,  # Convert to petabytes
                    mode="lines",
                    name=column.capitalize() + suffix,
                    line=line,
                )
            )

    fig.update_layout(
        # Plotly renders <br> as a line break; a raw newline inside a
        # single-quoted string literal is a syntax error.
        title=(
            "Cumulative Growth of Models, Spaces, and Datasets Over Time<br>"
            "Dotted lines represent growth with file-level deduplication"
        ),
        xaxis_title="Date",
        yaxis_title="Cumulative Size (PBs)",
        legend_title="Type",
        yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
    )
    return fig
def plot_total_sum(by_type_arr):
    """
    Draw a bar chart of total size per file extension, largest first.

    Args:
        by_type_arr (iterable): (label, size) pairs. Sizes are divided by
            1e15 for display as PBs.

    Returns:
        Figure: A Plotly figure with one bar trace per pair.
    """
    # Sort by size in decreasing order so the largest bar comes first.
    ordered = sorted(by_type_arr, key=lambda pair: pair[1], reverse=True)

    fig = go.Figure()
    # One trace per entry so each bar gets its own colour and legend item.
    for label, size in ordered:
        fig.add_trace(
            go.Bar(
                x=[label],
                y=[size / 1e15],  # Convert to petabytes
                name=label.capitalize(),
            )
        )

    fig.update_layout(
        title="Top 20 File Extensions by Total Size (in PBs)",
        xaxis_title="File Extension",
        yaxis_title="Total Size (PBs)",
        yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
        colorway=px.colors.qualitative.Alphabet,  # Use Plotly color palette
    )
    return fig
def filter_by_extension_month(_df, _extension):
    """
    Plot monthly size per extension as lines, optionally filtered by extension.

    Args:
        _df (DataFrame): Rows with 'year', 'month', 'extension' and
            'total_size' columns. It is not modified.
        _extension (str | list | None): Extension or extensions to keep.
            None or an empty collection keeps every extension.

    Returns:
        Figure: A Plotly figure with one line per extension. Values are
        divided by 1e12 for display as TBs.
    """
    # Accept a single extension as well as a collection of them.
    if isinstance(_extension, str):
        _extension = [_extension]
    if _extension is not None and len(_extension) > 0:
        _df = _df[_df["extension"].isin(_extension)]

    # Build the date column on a new frame so the caller's data is untouched
    # even when no filter was applied.
    _df = _df.assign(
        date=pd.to_datetime(_df[["year", "month"]].assign(day=1))
    ).sort_values(by="date")

    # One column per extension, indexed by date, so it plots as a time series.
    pivot_df = _df.pivot_table(
        index="date", columns="extension", values="total_size"
    ).fillna(0)

    palette = px.colors.qualitative.Alphabet
    fig = go.Figure()
    for i, column in enumerate(pivot_df.columns):
        # Skip the empty-string extension.
        if column == "":
            continue
        fig.add_trace(
            go.Scatter(
                x=pivot_df.index,
                y=pivot_df[column] / 1e12,  # Convert to TBs
                mode="lines",
                name=column,
                # Wrap around so more extensions than colours cannot raise.
                line=dict(color=palette[i % len(palette)]),
            )
        )

    fig.update_layout(
        title="Monthly Additions of LFS Files by Extension (in TBs)",
        xaxis_title="Date",
        yaxis_title="Size (TBs)",
        legend_title="Type",
        yaxis=dict(tickformat=".2f"),  # Format y-axis labels to 2 decimal places
    )
    return fig
def area_plot_by_extension_month(_df):
    """
    Draw a stacked area chart of monthly size per extension.

    Args:
        _df (DataFrame): Rows with 'year', 'month', 'extension' and
            'total_size' columns. It is not modified.

    Returns:
        Figure: A Plotly Express area chart of 'total_size' divided by 1e15
        (labelled as PBs) over time, coloured by extension.
    """
    # Derive the scaled size and the date on a new frame. Rescaling the
    # caller's column in place would shrink it again on every call and
    # corrupt the sizes for any other plot built from the same frame.
    plot_df = _df.assign(
        total_size=_df["total_size"] / 1e15,
        date=pd.to_datetime(_df[["year", "month"]].assign(day=1)),
    )

    fig = px.area(plot_df, x="date", y="total_size", color="extension")
    fig.update_layout(
        title="File Extension Monthly Additions (in PBs) Over Time",
        xaxis_title="Date",
        yaxis_title="Size (PBs)",
        legend_title="Type",
        # Show the y-axis values with two decimal places.
        yaxis=dict(tickformat=".2f"),
    )
    return fig
## Utility functions
def div_px(height):
"""
Returns a string representing a div element with the specified height in pixels.
"""
return f"