|
import os |
|
import pickle |
|
from collections import Counter |
|
from datetime import datetime |
|
|
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
import pandas as pd |
|
from huggingface_hub import HfApi |
|
|
|
|
|
# One fixed matplotlib colour per language key, shared by all charts so a
# language looks the same in every figure.
LANGUAGE_COLORS = dict(
    english="orange",
    spanish="blue",
    catalan="red",
    galician="green",
    basque="purple",
)

# Value handed to ``plt.grid`` by the bar, time-series and stacked-area plots.
GRID = False
|
|
|
|
|
# Hub language tag -> key used in the mapping returned by ``fetch_datasets``.
# Insertion order here is the key order of the result.
_LANGUAGE_TAGS = {
    "language:en": "english",
    "language:es": "spanish",
    "language:ca": "catalan",
    "language:gl": "galician",
    "language:eu": "basque",
}

# A cache file whose mtime is at least this old (in seconds) is refetched.
_CACHE_MAX_AGE_SECONDS = 24 * 3600


def _load_fresh_cache(cache_file):
    """Return the cached mapping, or None if it is missing, stale or unreadable."""
    if not os.path.exists(cache_file):
        print("No cache found, fetching datasets from Hugging Face Hub...")
        return None

    cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
    if cache_age >= _CACHE_MAX_AGE_SECONDS:
        print("Cache is older than 24 hours, fetching fresh data...")
        return None

    print("Loading datasets from cache...")
    try:
        with open(cache_file, "rb") as f:
            # SECURITY: unpickling runs arbitrary code. This is only acceptable
            # because the file is a cache written by this script; never point
            # ``cache_file`` at a file from an untrusted source.
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as exc:
        # A truncated file or one pickled against different class definitions
        # should trigger a refetch rather than abort the run.
        print(f"Cache is unreadable ({exc}), fetching fresh data...")
        return None


def fetch_datasets(cache_file="datasets_cache.pkl"):
    """Fetch single-language datasets from the Hugging Face Hub, with caching.

    A dataset is kept only when its ``language:`` tags name exactly one
    language and that language is English, Spanish, Catalan, Galician or
    Basque. Datasets tagged with several languages are excluded.

    Args:
        cache_file: Path of the pickle file used as a cache. A cache younger
            than 24 hours is returned as-is; otherwise the Hub is queried and
            the file is rewritten.

    Returns:
        A dict with the keys ``"english"``, ``"spanish"``, ``"catalan"``,
        ``"galician"`` and ``"basque"``, each mapping to a list of the dataset
        objects yielded by ``HfApi.list_datasets(full=True)``.
    """
    cached = _load_fresh_cache(cache_file)
    if cached is not None:
        return cached

    filtered_datasets = {language: [] for language in _LANGUAGE_TAGS.values()}

    # Stream the listing once instead of materialising every dataset and
    # scanning the full list separately for each language.
    for dataset in HfApi().list_datasets(full=True):
        # ``tags`` is treated as optional here; a dataset without tags simply
        # has no language and is skipped.
        language_tags = {
            tag for tag in (dataset.tags or ()) if tag.startswith("language:")
        }
        if len(language_tags) != 1:
            continue  # untagged or multilingual
        language = _LANGUAGE_TAGS.get(next(iter(language_tags)))
        if language is not None:
            filtered_datasets[language].append(dataset)

    print("Saving datasets to cache...")
    # Write to a sibling file and rename, so an interrupted run cannot leave a
    # half-written cache with a fresh mtime behind.
    temp_file = f"{cache_file}.tmp"
    with open(temp_file, "wb") as f:
        pickle.dump(filtered_datasets, f)
    os.replace(temp_file, cache_file)

    return filtered_datasets
|
|
|
|
|
def create_bar_plots(datasets, output_dir):
    """Save two bar charts of English and Spanish datasets created per year.

    ``bar_plot_horizontal.png`` draws the two languages side by side for each
    year; ``bar_plot_vertical.png`` stacks the Spanish bar on top of the
    English one. Both files are written into ``output_dir``.
    """
    # Datasets per creation year, one counter per language.
    per_year = {
        lang: Counter(d.created_at.date().year for d in datasets[lang])
        for lang in ("english", "spanish")
    }
    # Every year that occurs in either language, in ascending order.
    years = sorted(set(per_year["english"]) | set(per_year["spanish"]))
    english_heights = [per_year["english"][year] for year in years]
    spanish_heights = [per_year["spanish"][year] for year in years]

    # --- Grouped chart: one pair of bars per year -------------------------
    plt.figure(figsize=(8, 5))
    bar_width = 0.4
    positions = np.arange(len(years))
    half_width = bar_width / 2
    grouped_series = (
        (-half_width, english_heights, "english"),
        (half_width, spanish_heights, "spanish"),
    )
    for shift, heights, lang in grouped_series:
        plt.bar(
            positions + shift,
            heights,
            width=bar_width,
            label=lang.capitalize(),
            color=LANGUAGE_COLORS[lang],
        )

    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(positions, years, fontsize=10)
    plt.legend()
    plt.grid(GRID)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
    plt.close()

    # --- Stacked chart: Spanish on top of English -------------------------
    plt.figure(figsize=(8, 5))
    plt.bar(
        years,
        english_heights,
        width=0.4,
        label="English",
        color=LANGUAGE_COLORS["english"],
    )
    plt.bar(
        years,
        spanish_heights,
        width=0.4,
        label="Spanish",
        color=LANGUAGE_COLORS["spanish"],
        bottom=english_heights,
    )

    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(years, fontsize=10)
    plt.legend()
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(f"{output_dir}/bar_plot_vertical.png")
    plt.close()
|
|
|
|
|
def create_pie_chart(datasets, output_dir):
    """Save ``pie_chart.png``: each language's share of the dataset total.

    One wedge per language, sized by ``len(datasets[language])`` and labelled
    with the capitalised language name and its percentage.
    """
    languages = ("english", "spanish", "catalan", "galician", "basque")
    wedge_sizes = [len(datasets[lang]) for lang in languages]
    wedge_labels = [lang.capitalize() for lang in languages]
    wedge_colors = [LANGUAGE_COLORS[lang] for lang in languages]

    plt.figure(figsize=(8, 8))
    plt.pie(
        wedge_sizes,
        labels=wedge_labels,
        autopct="%1.1f%%",
        startangle=180,
        colors=wedge_colors,
    )
    # Equal axis scaling so the pie is drawn as a circle.
    plt.axis("equal")
    plt.savefig(f"{output_dir}/pie_chart.png")
    plt.close()
|
|
|
|
|
def create_time_series(datasets, output_dir):
    """Save ``time_series.png``: cumulative English and Spanish dataset counts.

    Creation dates are bucketed by calendar month (labelled by month start)
    and the running total of each language is drawn as its own line.
    """
    plotted_languages = ("english", "spanish")

    # Running monthly totals, computed for both languages before plotting.
    cumulative = {}
    for lang in plotted_languages:
        created_on = [d.created_at.date() for d in datasets[lang]]
        frame = pd.DataFrame(created_on, columns=["Date"])
        frame["Count"] = 1
        frame["Date"] = pd.to_datetime(frame["Date"])
        by_month = pd.Grouper(key="Date", freq="MS")
        cumulative[lang] = frame.groupby(by_month).sum().cumsum()

    plt.figure(figsize=(10, 6))
    for lang in plotted_languages:
        running_total = cumulative[lang]
        plt.plot(
            running_total.index,
            running_total["Count"],
            label=lang.capitalize(),
            color=LANGUAGE_COLORS[lang],
        )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(f"{output_dir}/time_series.png")
    plt.close()
|
|
|
|
|
def _cumulative_monthly_counts(dates, month_index):
    """Return the running per-month count of ``dates`` aligned to ``month_index``."""
    if not dates:
        # Nothing to group: a flat zero line over the whole range.
        return np.zeros(len(month_index), dtype=int)
    frame = pd.DataFrame({"Date": pd.to_datetime(dates), "Count": 1})
    monthly = frame.groupby(pd.Grouper(key="Date", freq="MS"))["Count"].sum()
    # Align to the shared range first so months without datasets count as 0,
    # then accumulate.
    return monthly.reindex(month_index, fill_value=0).cumsum().to_numpy()


def _save_stackplot(month_index, cumulative, languages, path):
    """Draw one stacked area chart for ``languages`` and save it to ``path``."""
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        month_index,
        [cumulative[lang] for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(path)
    plt.close()


def create_stack_area_plots(datasets, output_dir):
    """Save four stacked area charts of cumulative dataset counts per month.

    Files written into ``output_dir``:
        ``stack_area.png``              all five languages
        ``stack_area_es_ca_gl_eu.png``  Spanish, Catalan, Galician, Basque
        ``stack_area_en_es.png``        English and Spanish
        ``stack_area_es.png``           Spanish only

    All charts share one monthly x-axis that spans from the month of the
    earliest dataset to the month of the latest one across every language.

    Args:
        datasets: Mapping from language key to a list of objects exposing a
            ``created_at`` datetime.
        output_dir: Directory the PNG files are written to.

    Raises:
        ValueError: If every language list is empty, so there is no date
            range to plot.
    """
    languages = ["english", "spanish", "catalan", "galician", "basque"]
    dates_by_language = {
        lang: [d.created_at.date() for d in datasets[lang]] for lang in languages
    }

    all_dates = [day for dates in dates_by_language.values() for day in dates]
    if not all_dates:
        raise ValueError("no datasets to plot: every language list is empty")

    # Start at the first day of the earliest month. With freq="MS",
    # ``date_range`` only emits month starts on or after ``start``, so passing
    # the raw earliest date would drop that month's bucket during reindexing
    # and every cumulative total would undercount.
    first_month = min(all_dates).replace(day=1)
    month_index = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

    cumulative = {
        lang: _cumulative_monthly_counts(dates, month_index)
        for lang, dates in dates_by_language.items()
    }

    charts = (
        ("stack_area.png", languages),
        ("stack_area_es_ca_gl_eu.png", ["spanish", "catalan", "galician", "basque"]),
        ("stack_area_en_es.png", ["english", "spanish"]),
        ("stack_area_es.png", ["spanish"]),
    )
    for filename, subset in charts:
        _save_stackplot(month_index, cumulative, subset, f"{output_dir}/{filename}")
|
|
|
|
|
def main():
    """Fetch the datasets and write every chart into the ``plots`` directory."""
    output_dir = "plots"
    os.makedirs(output_dir, exist_ok=True)

    print("Fetching datasets from Hugging Face Hub...")
    datasets = fetch_datasets()

    # Each step announces itself, then renders its charts into output_dir.
    steps = (
        ("Creating bar plots...", create_bar_plots),
        ("Creating pie chart...", create_pie_chart),
        ("Creating time series plots...", create_time_series),
        ("Creating stack area plots...", create_stack_area_plots),
    )
    for announcement, render in steps:
        print(announcement)
        render(datasets, output_dir)

    print(f"All visualizations have been saved to the '{output_dir}' directory")
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
|