jsulz (HF staff) committed on
Commit 6c1172f
1 Parent(s): 6118d5d

cleaning up a bit

Files changed (1)
  1. app.py +146 -84
app.py CHANGED
@@ -68,67 +68,34 @@ def process_dataset():
     )
 
 
-def format_dataframe_size_column(_df, column_name):
-    """
-    Format the size to petabytes and return the formatted size.
-    """
-    _df[column_name] = _df[column_name] / 1e15
-    _df[column_name] = _df[column_name].map("{:.2f}".format)
-    return _df
-
-
-def cumulative_growth_plot_analysis(df, df_compressed, repo_sizes):
-    """
-    Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
-
-    Args:
-        df (DataFrame): The input dataframe containing the data.
-        df_compressed (DataFrame): The input dataframe containing the compressed data.
-
-    Returns:
-        tuple: A tuple containing two elements:
-            - fig (Figure): The Plotly figure showing the cumulative growth of models, spaces, and datasets over time.
-            - last_10_months (DataFrame): The last 10 months of data showing the month-to-month growth in petabytes.
-
-    Raises:
-        None
-    """
-    # Convert year and month into a datetime column
-    df["date"] = pd.to_datetime(df[["year", "month"]].assign(day=1))
-    df_compressed["date"] = pd.to_datetime(
-        df_compressed[["year", "month"]].assign(day=1)
-    )
-
+def cumulative_growth_df(_df):
     # Sort by date to ensure correct cumulative sum
-    df = df.sort_values(by="date")
-    df_compressed = df_compressed.sort_values(by="date")
-
-    # Pivot the dataframe to get the totalsize for each type
-    pivot_df = df.pivot_table(
-        index="date", columns="type", values="totalsize", aggfunc="sum"
-    ).fillna(0)
-    pivot_df_compressed = df_compressed.pivot_table(
+    _df = _df.sort_values(by="date")
+    # Pivot the dataframe to get the totalsize
+    pivot_df = _df.pivot_table(
         index="date", columns="type", values="totalsize", aggfunc="sum"
     ).fillna(0)
-
-    # Calculate cumulative sum for each type
+    # Calculate cumulative sum
     cumulative_df = pivot_df.cumsum()
-    cumulative_df_compressed = pivot_df_compressed.cumsum()
+    return cumulative_df
+
 
-    last_10_months = cumulative_df.tail(10).copy()
+def compare_last_10_months(_cumulative_df, _cumulative_df_compressed):
+    last_10_months = _cumulative_df.tail(10).copy()
     last_10_months["total"] = last_10_months.sum(axis=1)
     last_10_months["total_change"] = last_10_months["total"].diff()
     last_10_months["compressed_change"] = (
-        cumulative_df_compressed.tail(10).sum(axis=1).diff()
+        _cumulative_df_compressed.tail(10).sum(axis=1).diff()
     )
     last_10_months["savings"] = (
         last_10_months["total_change"] - last_10_months["compressed_change"]
     )
-    last_10_months = format_dataframe_size_column(last_10_months, "savings")
-    last_10_months = format_dataframe_size_column(last_10_months, "compressed_change")
-    last_10_months = format_dataframe_size_column(last_10_months, "total_change")
 
-    last_10_months["date"] = cumulative_df.tail(10).index
+    last_10_months = format_dataframe_size_column(
+        last_10_months, ["total_change", "compressed_change", "savings"]
+    )
+
+    last_10_months["date"] = _cumulative_df.tail(10).index
     # drop the dataset, model, and space
     last_10_months = last_10_months.drop(columns=["model", "space", "dataset"])
     # pretiffy the date column to not have 00:00:00
@@ -148,17 +115,10 @@ def cumulative_growth_plot_analysis(df, df_compressed, repo_sizes):
             "savings": "Dedupe Savings (PBs)",
         }
     )
+    return last_10_months
 
-    # Create a Plotly figure
-    fig = go.Figure()
-
-    # Define a color map for each type
-    color_map = {
-        "model": px.colors.qualitative.Alphabet[3],
-        "space": px.colors.qualitative.Alphabet[2],
-        "dataset": px.colors.qualitative.Alphabet[9],
-    }
 
+def tabular_analysis(repo_sizes, cumulative_df, cumulative_df_compressed):
     # create a new column in the repository sizes dataframe for "compressed size" and set it to empty atif rist
     repo_sizes["Compressed Size (PBs)"] = ""
     repo_sizes["Dedupe Savings (PBs)"] = ""
@@ -175,6 +135,39 @@ def cumulative_growth_plot_analysis(df, df_compressed, repo_sizes):
             repo_sizes["Repository Type"] == column.capitalize(), "Dedupe Savings (PBs)"
         ] = repo_size_diff
 
+    # add a row that sums the total size and compressed size
+    repo_sizes.loc["Total"] = repo_sizes.sum()
+    repo_sizes.loc["Total", "Repository Type"] = "Total"
+    return repo_sizes
+
+
+def cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed):
+    """
+    Calculates the cumulative growth of models, spaces, and datasets over time and generates a plot and dataframe from the analysis.
+
+    Args:
+        df (DataFrame): The input dataframe containing the data.
+        df_compressed (DataFrame): The input dataframe containing the compressed data.
+
+    Returns:
+        tuple: A tuple containing two elements:
+            - fig (Figure): The Plotly figure showing the cumulative growth of models, spaces, and datasets over time.
+            - last_10_months (DataFrame): The last 10 months of data showing the month-to-month growth in petabytes.
+
+    Raises:
+        None
+    """
+
+    # Create a Plotly figure
+    fig = go.Figure()
+
+    # Define a color map for each type
+    color_map = {
+        "model": px.colors.qualitative.Alphabet[3],
+        "space": px.colors.qualitative.Alphabet[2],
+        "dataset": px.colors.qualitative.Alphabet[9],
+    }
+
     # Add a scatter trace for each type
     for column in cumulative_df.columns:
         fig.add_trace(
@@ -207,7 +200,7 @@ def cumulative_growth_plot_analysis(df, df_compressed, repo_sizes):
         legend_title="Type",
         yaxis=dict(tickformat=".2f"), # Format y-axis labels to 2 decimal places
     )
-    return fig, last_10_months
+    return fig
 
 
 def plot_total_sum(by_type_arr):
@@ -244,7 +237,7 @@ def filter_by_extension_month(_df, _extension):
 
     Parameters:
         _df (DataFrame): The input DataFrame containing the data.
-        extension (str): The extension to filter the DataFrame by. If set to "All", no filtering is applied.
+        extension (str): The extension to filter the DataFrame by. If None, no filtering is applied.
 
     Returns:
         fig (Figure): The Plotly figure object representing the line plot.
@@ -273,7 +266,7 @@ def filter_by_extension_month(_df, _extension):
         fig.add_trace(
             go.Scatter(
                 x=pivot_df.index,
-                y=pivot_df[column] / 1e12, # Convert to petabytes
+                y=pivot_df[column] / 1e12, # Convert to TBs
                 mode="lines",
                 name=column,
                 line=dict(color=px.colors.qualitative.Alphabet[i]),
@@ -292,12 +285,70 @@ def filter_by_extension_month(_df, _extension):
     return fig
 
 
+def area_plot_by_extension_month(_df):
+    _df["total_size"] = _df["total_size"] / 1e15
+    _df["date"] = pd.to_datetime(_df[["year", "month"]].assign(day=1))
+    # make a plotly area chart with data and extension
+    fig = px.area(_df, x="date", y="total_size", color="extension")
+    # Update layout
+    fig.update_layout(
+        title="File Extension Cumulative Growth (in PBs) Over Time",
+        xaxis_title="Date",
+        yaxis_title="Size (PBs)",
+        legend_title="Type",
+        # format y-axis to be PBs (currently bytes) with two decimal places
+        yaxis=dict(tickformat=".2f"),
+    )
+
+    return fig
+
+
+## Utility functions
+def div_px(height):
+    """
+    Returns a string representing a div element with the specified height in pixels.
+    """
+    return f"<div style='height: {height}px;'></div>"
+
+
+def format_dataframe_size_column(_df, column_names):
+    """
+    Format the size to petabytes and return the formatted size.
+    """
+    for column_name in column_names:
+        _df[column_name] = _df[column_name] / 1e15
+        _df[column_name] = _df[column_name].map("{:.2f}".format)
+    return _df
+
+
+def month_year_to_date(_df):
+    """
+    Converts the 'year' and 'month' columns in the given DataFrame to a single 'date' column.
+    """
+    _df["date"] = pd.to_datetime(_df[["year", "month"]].assign(day=1))
+    return _df
+
+
 # Create a gradio blocks interface and launch a demo
 with gr.Blocks() as demo:
     df, file_df, by_repo_type, by_extension, by_extension_month = process_dataset()
 
+    # Convert year and month into a datetime column
+    df = month_year_to_date(df)
+    df_compressed = month_year_to_date(file_df)
+
+    # Calculate the cumulative growth of models, spaces, and datasets over time
+    cumulative_df = cumulative_growth_df(df)
+    cumulative_df_compressed = cumulative_growth_df(df_compressed)
+
+    last_10_months = compare_last_10_months(cumulative_df, cumulative_df_compressed)
+
+    by_repo_type = tabular_analysis(
+        by_repo_type, cumulative_df, cumulative_df_compressed
+    )
+
     # get the figure for the cumulative growth plot and the last 10 months dataframe
-    fig, last_10_months = cumulative_growth_plot_analysis(df, file_df, by_repo_type)
+    fig = cumulative_growth_plot_analysis(cumulative_df, cumulative_df_compressed)
 
     # Add top level heading and introduction text
     gr.Markdown("# Git LFS Usage Across the Hub")
@@ -308,48 +359,53 @@ with gr.Blocks() as demo:
     gr.Markdown(
         "Now, you might ask yourself, 'Why are you doing this?' Well, the [Xet Team](https://huggingface.co/xet-team) is a [new addition to Hugging Face](https://huggingface.co/blog/xethub-joins-hf), bringing a new way to store massive datasets and models to enable ML teams to operate like software teams: Quickly and without friction. Because this story all starts with storage, that's where we've begun with our own deep dives into what the Hub holds. As part of this, we've included a look at what happens with just one simple deduplication strategy - deduplicating at the file level. Read on to see more!"
     )
-    gr.HTML("<div style='height: 20px;'></div>")
+    gr.HTML(div_px(25))
     # Cumulative growth analysis
    gr.Markdown("## Repository Growth")
    with gr.Row():
        gr.Plot(fig)
+
+    gr.HTML(div_px(5))
+    # @TODO Talk to Allison about variant="panel"
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
-                "This table shows the total number of files and cumulative size of those files across all repositories on the Hub. These numbers might be hard to grok, so let's try to put them in context. The last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31). The Spaces repositories alone outpaces that. Meanwhile, between Datasets and Model repos, the Hub stores **64 Common Crawls** 🤯."
+                "This table shows the total number of files, cumulative size of those files across all repositories on the Hub, and the potential file-level dedupe savings. To put this in context, the last [Common Crawl](https://commoncrawl.org/) download was [451 TBs](https://github.com/commoncrawl/cc-crawl-statistics/blob/master/stats/crawler/CC-MAIN-2024-38.json#L31). The Spaces repositories alone outpaces that! Meanwhile, between Datasets and Model repos, the Hub stores **64 Common Crawls** 🤯. Current estimates put total deduplication savings at approximately 3.24 PBs (7.2 Common Crawls)!"
            )
        with gr.Column(scale=3):
            # Convert the total size to petabytes and format to two decimal places
            by_repo_type = format_dataframe_size_column(
-                by_repo_type, "Total Size (PBs)"
-            )
-            by_repo_type = format_dataframe_size_column(
-                by_repo_type, "Compressed Size (PBs)"
-            )
-            by_repo_type = format_dataframe_size_column(
-                by_repo_type, "Dedupe Savings (PBs)"
+                by_repo_type,
+                ["Total Size (PBs)", "Compressed Size (PBs)", "Dedupe Savings (PBs)"],
            )
            gr.Dataframe(by_repo_type)
+
+    gr.HTML(div_px(5))
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown(
                "The cumulative growth of models, spaces, and datasets over time can be seen in the adjacent chart. Beside that is a view of the total change, from the previous month to the current one, of LFS files stored on the hub over 2024. We're averaging nearly **2.3 PBs uploaded to LFS per month!**"
            )
+
+            gr.Markdown(
+                "By the same token, the monthly file deduplication savings are nearly 225TBs. Borrowing from the [Common Crawl](https://commoncrawl.org/) analogy, that's about half a crawl saved each month!"
+            )
        with gr.Column(scale=3):
            gr.Dataframe(last_10_months, height=250)
 
-    gr.HTML("<div style='height: 20px;'></div>")
+    gr.HTML(div_px(25))
    # File Extension analysis
    gr.Markdown("## File Extensions on the Hub")
    gr.Markdown(
-        "Breaking this down by file extension, some interesting trends emerge. [Safetensors](https://huggingface.co/docs/safetensors/en/index) are quickly becoming the defacto standard on the hub, accounting for over 7PBs (25%) of LFS storage. The top 20 file extensions seen here and in the table below account for 82% of all LFS storage on the hub."
+        "Breaking this down by file extension, some interesting trends emerge. The following sections filter the analysis to the top 20 file extensions stored (in bytes) using LFS (which accounts for 82% of storage consumption)."
+    )
+    gr.Markdown(
+        "As is evident in the chart below, [Safetensors](https://huggingface.co/docs/safetensors/en/index) is quickly becoming the defacto standard on the Hub for storing tensor files, accounting for over 7PBs (25%) of LFS storage. If you want to know why you'd want to check out YAF (yet another format), this explanation from the [Safetensors docs](https://github.com/huggingface/safetensors?tab=readme-ov-file#yet-another-format-) is a good place to start. Speaking of YAF, [GGUF (GPT-Generated Unified Format)](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is also on the rise, accounting for 3.2 PBs (11%) of LFS storage. GGUF, like Safetensors, is a format for storing tensor files, with a different set of optimizations. The Hub has a few [built-in tools](https://huggingface.co/docs/hub/en/gguf) for working with GGUF."
    )
    # Get the top 10 file extnesions by size
    by_extension_size = by_extension.sort_values(by="size", ascending=False).head(22)
-    # get the top 10 file extensions by count
-    # by_extension_count = by_extension.sort_values(by="count", ascending=False).head(20)
 
-    # make a pie chart of the by_extension_size dataframe
+    # make a bar chart of the by_extension_size dataframe
    gr.Plot(plot_total_sum(by_extension_size[["extension", "size"]].values))
    # drop the unnamed: 0 column
    by_extension_size = by_extension_size.drop(columns=["Unnamed: 0"])
@@ -364,7 +420,7 @@ with gr.Blocks() as demo:
        "Average File Size (MBs)"
    ].map("{:.2f}".format)
    # format the size column
-    by_extension_size = format_dataframe_size_column(by_extension_size, "size")
+    by_extension_size = format_dataframe_size_column(by_extension_size, ["size"])
    # Rename the other columns
    by_extension_size = by_extension_size.rename(
        columns={
@@ -373,24 +429,30 @@ with gr.Blocks() as demo:
            "size": "Total Size (PBs)",
        }
    )
+
+    gr.HTML(div_px(5))
+    gr.Markdown(
+        "Below, we have a more detailed tabular view of the same top 20 file extensions by total size, number of files, and average file size."
+    )
    gr.Dataframe(by_extension_size)
 
-    gr.HTML("<div style='height: 20px;'></div>")
-    gr.Markdown("## File Extension Growth Over Time")
-    by_extension_month["date"] = pd.to_datetime(
-        by_extension_month[["year", "month"]].assign(day=1)
+    gr.HTML(div_px(5))
+    gr.Markdown("### File Extension Monthly Additions (in PBs)")
+    gr.Markdown(
+        "What if we want to see trends over time? The following area chart shows the number of bytes added to LFS storage each month, faceted by the most popular file extensions."
    )
-    # make a plotly area chart with data and extension
-    figure = px.area(by_extension_month, x="date", y="total_size", color="extension")
-    gr.Plot(figure)
+    gr.Plot(area_plot_by_extension_month(by_extension_month))
+
+    gr.HTML(div_px(5))
    gr.Markdown(
-        "Want to dig a little deeper? Select a file extension to see how many bytes of that type were uploaded to the Hub each month."
+        "To dig a little deeper, the following dropdown allows you to filter the area chart by file extension."
    )
 
    # build a dropdown using the unique values in the extension column
    extension = gr.Dropdown(
        choices=by_extension["extension"].unique().tolist(),
        multiselect=True,
+        label="File Extension",
    )
    _by_extension_month = gr.State(by_extension_month)
    gr.Plot(filter_by_extension_month, inputs=[_by_extension_month, extension])