"""Plot statistics about single-language datasets on the Hugging Face Hub.

The script lists the datasets on the Hub, keeps those tagged with exactly one
language among English, Spanish, Catalan, Galician and Basque, and writes a
set of PNG charts (bar plots, pie chart, time series, stacked areas) into the
``plots`` directory.
"""

import os
import pickle
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from huggingface_hub import HfApi

# Define colors for each language
LANGUAGE_COLORS = {
    "english": "orange",
    "spanish": "blue",
    "catalan": "red",
    "galician": "green",
    "basque": "purple",
}

# Suffix of the Hub ``language:<code>`` tag for each plotted language.
# The insertion order is also the plotting order used throughout the script.
LANGUAGE_CODES = {
    "english": "en",
    "spanish": "es",
    "catalan": "ca",
    "galician": "gl",
    "basque": "eu",
}
LANGUAGES = tuple(LANGUAGE_CODES)

GRID = False

# Cached results older than this many seconds are refreshed from the Hub.
CACHE_MAX_AGE_SECONDS = 24 * 3600


def _creation_dates(items):
    """Return the creation date of every item that has a ``created_at`` value.

    Items whose ``created_at`` is ``None`` cannot be placed on a time axis and
    are skipped instead of raising ``AttributeError``.
    """
    return [
        item.created_at.date() for item in items if item.created_at is not None
    ]


def _monthly_counts(dates):
    """Return a DataFrame with one ``Count`` per month-start bin of *dates*."""
    df = pd.DataFrame({"Date": dates})
    df["Count"] = 1
    df["Date"] = pd.to_datetime(df["Date"])
    # freq="MS" labels each bin with the first day of its month.
    return df.groupby(pd.Grouper(key="Date", freq="MS")).sum()


def _save_stack_area(date_range, cumulative, languages, path):
    """Draw one stacked-area chart for *languages* and save it to *path*.

    Args:
        date_range: Common month-start index shared by all series.
        cumulative: Mapping of language name to a DataFrame with a cumulative
            ``Count`` column indexed by *date_range*.
        languages: Language names to stack, bottom first.
        path: Destination image file.
    """
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [cumulative[lang]["Count"].values for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )
    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(path)
    plt.close()


def fetch_datasets(cache_file="datasets_cache.pkl"):
    """Fetch and filter datasets from HuggingFace Hub with caching.

    A dataset is kept for a language only when that language's tag is the
    sole ``language:*`` tag it carries.

    Args:
        cache_file: Path of the pickle file used as a cache. It is reused
            when it is less than 24 hours old and rewritten otherwise.

    Returns:
        A dict mapping each language name in ``LANGUAGE_CODES`` to the list
        of dataset objects returned by ``HfApi.list_datasets`` for it.
    """
    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < CACHE_MAX_AGE_SECONDS:
            print("Loading datasets from cache...")
            # NOTE(security): unpickling can execute arbitrary code. Only use
            # a cache file that this script wrote itself.
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching datasets from Hugging Face Hub...")

    hf_api = HfApi()
    tag_to_language = {
        f"language:{code}": name for name, code in LANGUAGE_CODES.items()
    }
    filtered_datasets = {name: [] for name in LANGUAGE_CODES}

    # Single streaming pass: classify each dataset by its set of language tags
    # instead of materialising the full listing and scanning it per language.
    for dataset in hf_api.list_datasets(full=True):
        language_tags = {
            tag for tag in (dataset.tags or []) if tag.startswith("language:")
        }
        if len(language_tags) != 1:
            continue  # untagged or multilingual
        name = tag_to_language.get(next(iter(language_tags)))
        if name is not None:
            filtered_datasets[name].append(dataset)

    # Cache the filtered datasets
    print("Saving datasets to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_datasets, f)

    return filtered_datasets


def create_bar_plots(datasets, output_dir):
    """Create grouped and stacked per-year bar plots for English and Spanish.

    Writes ``bar_plot_horizontal.png`` (bars side by side) and
    ``bar_plot_vertical.png`` (Spanish stacked on English) to *output_dir*.
    The file names are historical; both charts use vertical bars.
    """
    # Extract creation years and counts
    english_counts = Counter(
        date.year for date in _creation_dates(datasets["english"])
    )
    spanish_counts = Counter(
        date.year for date in _creation_dates(datasets["spanish"])
    )
    years = sorted(set(english_counts) | set(spanish_counts))
    english_per_year = [english_counts[year] for year in years]
    spanish_per_year = [spanish_counts[year] for year in years]

    # Grouped bar plot (languages side by side)
    plt.figure(figsize=(8, 5))
    bar_width = 0.4
    years_index = np.arange(len(years))
    plt.bar(
        years_index - bar_width / 2,
        english_per_year,
        width=bar_width,
        label="English",
        color=LANGUAGE_COLORS["english"],
    )
    plt.bar(
        years_index + bar_width / 2,
        spanish_per_year,
        width=bar_width,
        label="Spanish",
        color=LANGUAGE_COLORS["spanish"],
    )
    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(years_index, years, fontsize=10)
    plt.legend()
    plt.grid(GRID)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "bar_plot_horizontal.png"))
    plt.close()

    # Stacked bar plot (Spanish on top of English)
    plt.figure(figsize=(8, 5))
    plt.bar(
        years,
        english_per_year,
        width=0.4,
        label="English",
        color=LANGUAGE_COLORS["english"],
    )
    plt.bar(
        years,
        spanish_per_year,
        width=0.4,
        label="Spanish",
        color=LANGUAGE_COLORS["spanish"],
        bottom=english_per_year,
    )
    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(years, fontsize=10)
    plt.legend()
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(os.path.join(output_dir, "bar_plot_vertical.png"))
    plt.close()


def create_pie_chart(datasets, output_dir):
    """Create pie chart showing distribution of datasets by language.

    Writes ``pie_chart.png`` to *output_dir*.
    """
    # Calculate counts
    counts = {lang.capitalize(): len(datasets[lang]) for lang in LANGUAGES}

    plt.figure(figsize=(8, 8))
    plt.pie(
        counts.values(),
        labels=counts.keys(),
        autopct="%1.1f%%",
        startangle=180,
        colors=[LANGUAGE_COLORS[lang] for lang in LANGUAGES],
    )
    plt.axis("equal")
    plt.savefig(os.path.join(output_dir, "pie_chart.png"))
    plt.close()


def create_time_series(datasets, output_dir):
    """Create the cumulative monthly time series for English and Spanish.

    Writes ``time_series.png`` to *output_dir*.
    """
    plt.figure(figsize=(10, 6))
    for lang in ("english", "spanish"):
        cumulative = _monthly_counts(_creation_dates(datasets[lang])).cumsum()
        plt.plot(
            cumulative.index,
            cumulative["Count"],
            label=lang.capitalize(),
            color=LANGUAGE_COLORS[lang],
        )
    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(os.path.join(output_dir, "time_series.png"))
    plt.close()


def create_stack_area_plots(datasets, output_dir):
    """Create stacked area plots of cumulative dataset counts.

    Writes four charts to *output_dir*: all languages (``stack_area.png``),
    all but English (``stack_area_es_ca_gl_eu.png``), English and Spanish
    (``stack_area_en_es.png``) and Spanish only (``stack_area_es.png``).
    """
    # Prepare data for all languages
    dates_by_language = {
        lang: _creation_dates(datasets[lang]) for lang in LANGUAGES
    }
    all_dates = [
        date for dates in dates_by_language.values() for date in dates
    ]
    if not all_dates:
        # min()/max() would raise on an empty sequence; nothing to plot.
        print("No dated datasets found, skipping stack area plots")
        return

    # Create a common date range for all languages. The start is snapped to
    # the first day of its month: with freq="MS" the range would otherwise
    # begin at the *next* month start, and the earliest month's bin would be
    # dropped by the reindex below.
    first_month = pd.Timestamp(min(all_dates)).replace(day=1)
    date_range = pd.date_range(
        start=first_month, end=max(all_dates), freq="MS"
    )

    # Monthly counts per language, aligned to the common range (missing
    # months count as 0) and then accumulated.
    cumulative = {
        lang: _monthly_counts(dates_by_language[lang])
        .reindex(date_range, fill_value=0)
        .cumsum()
        for lang in LANGUAGES
    }

    charts = (
        ("stack_area.png", LANGUAGES),
        ("stack_area_es_ca_gl_eu.png", ("spanish", "catalan", "galician", "basque")),
        ("stack_area_en_es.png", ("english", "spanish")),
        ("stack_area_es.png", ("spanish",)),
    )
    for filename, languages in charts:
        _save_stack_area(
            date_range,
            cumulative,
            languages,
            os.path.join(output_dir, filename),
        )


def main():
    """Fetch the datasets and write every chart into the ``plots`` directory."""
    # Create output directory if it doesn't exist
    output_dir = "plots"
    os.makedirs(output_dir, exist_ok=True)

    # Fetch datasets
    print("Fetching datasets from Hugging Face Hub...")
    datasets = fetch_datasets()

    # Create visualizations
    print("Creating bar plots...")
    create_bar_plots(datasets, output_dir)

    print("Creating pie chart...")
    create_pie_chart(datasets, output_dir)

    print("Creating time series plots...")
    create_time_series(datasets, output_dir)

    print("Creating stack area plots...")
    create_stack_area_plots(datasets, output_dir)

    print(f"All visualizations have been saved to the '{output_dir}' directory")


if __name__ == "__main__":
    main()