Spaces:

mariagrandury
/

language-gap-in-hf-hub

Sleeping

File size: 11,061 Bytes

import os
import pickle
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from huggingface_hub import HfApi

# Colour used for each language's series in every chart.
LANGUAGE_COLORS = dict(
    english="orange",
    spanish="blue",
    catalan="red",
    galician="green",
    basque="purple",
)

# Value handed to plt.grid() by the plotting functions (False: no grid lines).
GRID = bool(0)


def fetch_datasets(cache_file="datasets_cache.pkl"):
    """Fetch monolingual datasets from the Hugging Face Hub, with a 24h cache.

    A dataset is kept for a language only when its tags contain that
    language's ``language:<code>`` tag and no other ``language:`` tag.

    Args:
        cache_file: Path of the pickle file used as an on-disk cache. It is
            reused when it was modified less than 24 hours ago; otherwise it
            is rewritten after a fresh fetch.

    Returns:
        A dict with the keys ``"english"``, ``"spanish"``, ``"catalan"``,
        ``"galician"`` and ``"basque"``, each mapping to the list of dataset
        objects returned by ``HfApi.list_datasets(full=True)`` that are
        tagged with that language only.
    """
    # Language name -> language code used in the Hub's "language:<code>" tags.
    language_codes = {
        "english": "en",
        "spanish": "es",
        "catalan": "ca",
        "galician": "gl",
        "basque": "eu",
    }

    # Check if cached data exists and is less than 24 hours old
    if os.path.exists(cache_file):
        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
        if cache_age < 24 * 3600:  # 24 hours in seconds
            print("Loading datasets from cache...")
            # NOTE(security): pickle can execute arbitrary code while loading.
            # Only point cache_file at a file this script wrote itself.
            try:
                with open(cache_file, "rb") as f:
                    return pickle.load(f)
            except (pickle.UnpicklingError, EOFError):
                # A truncated or corrupt cache should not abort the run;
                # fall through and rebuild it from the Hub.
                print("Cache is unreadable, fetching fresh data...")
        else:
            print("Cache is older than 24 hours, fetching fresh data...")
    else:
        print("No cache found, fetching datasets from Hugging Face Hub...")

    hf_api = HfApi()

    tag_to_language = {
        f"language:{code}": name for name, code in language_codes.items()
    }
    filtered_datasets = {name: [] for name in language_codes}

    # Single pass over the Hub listing: classify each dataset once instead of
    # re-scanning the whole list for every language.
    for dataset in hf_api.list_datasets(full=True):
        # tags is typed Optional in huggingface_hub; treat None as "no tags".
        language_tags = {
            tag for tag in (dataset.tags or ()) if tag.startswith("language:")
        }
        if len(language_tags) != 1:
            # Untagged or multilingual datasets are excluded.
            continue
        (only_tag,) = language_tags
        language = tag_to_language.get(only_tag)
        if language is not None:
            filtered_datasets[language].append(dataset)

    # Cache the filtered datasets
    print("Saving datasets to cache...")
    with open(cache_file, "wb") as f:
        pickle.dump(filtered_datasets, f)

    return filtered_datasets


def create_bar_plots(datasets, output_dir):
    """Save two per-year bar charts comparing English and Spanish datasets.

    The first chart places the two languages side by side for each year and
    is written to ``datasets_bar_plot_horizontal.png``; the second stacks
    Spanish on top of English and is written to
    ``datasets_bar_plot_vertical.png``.

    Args:
        datasets: Dict with ``"english"`` and ``"spanish"`` lists of objects
            exposing a ``created_at`` datetime.
        output_dir: Directory the PNG files are written into.
    """
    # Creation year of every dataset, per language.
    english_years = [d.created_at.date().year for d in datasets["english"]]
    spanish_years = [d.created_at.date().year for d in datasets["spanish"]]

    years = sorted(set(english_years + spanish_years))
    english_counts = Counter(english_years)
    spanish_counts = Counter(spanish_years)

    # Bar heights aligned with `years` (a Counter yields 0 for missing years).
    english_heights = [english_counts[year] for year in years]
    spanish_heights = [spanish_counts[year] for year in years]

    # --- Side-by-side bars ---
    plt.figure(figsize=(8, 5))
    bar_width = 0.4
    half_width = bar_width / 2
    positions = np.arange(len(years))

    plt.bar(
        positions - half_width,
        english_heights,
        width=bar_width,
        label="English",
        color=LANGUAGE_COLORS["english"],
    )
    plt.bar(
        positions + half_width,
        spanish_heights,
        width=bar_width,
        label="Spanish",
        color=LANGUAGE_COLORS["spanish"],
    )

    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(positions, years, fontsize=10)
    plt.legend()
    plt.grid(GRID)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/datasets_bar_plot_horizontal.png")
    plt.close()

    # --- Stacked bars (Spanish drawn on top of English) ---
    plt.figure(figsize=(8, 5))
    plt.bar(
        years,
        english_heights,
        width=0.4,
        label="English",
        color=LANGUAGE_COLORS["english"],
    )
    plt.bar(
        years,
        spanish_heights,
        width=0.4,
        label="Spanish",
        color=LANGUAGE_COLORS["spanish"],
        bottom=english_heights,
    )

    plt.xlabel("Year", fontsize=10)
    plt.ylabel("Number of Datasets", fontsize=10)
    plt.xticks(years, fontsize=10)
    plt.legend()
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(f"{output_dir}/datasets_bar_plot_vertical.png")
    plt.close()


def create_pie_chart(datasets, output_dir):
    """Save a pie chart of the number of datasets per language.

    Args:
        datasets: Dict mapping each of the five language names to a list of
            datasets; only the list lengths are used.
        output_dir: Directory in which ``datasets_pie_chart.png`` is written.
    """
    languages = ("english", "spanish", "catalan", "galician", "basque")

    # One slice per language, all three sequences in the same order.
    slice_labels = [lang.capitalize() for lang in languages]
    slice_sizes = [len(datasets[lang]) for lang in languages]
    slice_colors = [LANGUAGE_COLORS[lang] for lang in languages]

    plt.figure(figsize=(8, 8))
    plt.pie(
        slice_sizes,
        labels=slice_labels,
        autopct="%1.1f%%",
        startangle=180,
        colors=slice_colors,
    )
    plt.axis("equal")
    plt.savefig(f"{output_dir}/datasets_pie_chart.png")
    plt.close()


def create_time_series(datasets, output_dir):
    """Save a line chart of cumulative English and Spanish dataset counts.

    Creation dates are bucketed by month start, summed per month and then
    accumulated, with one line drawn per language.

    Args:
        datasets: Dict with ``"english"`` and ``"spanish"`` lists of objects
            exposing a ``created_at`` datetime.
        output_dir: Directory in which ``datasets_time_series.png`` is written.
    """

    def cumulative_by_month(items):
        # One row per dataset, each counting as 1, grouped into monthly bins.
        frame = pd.DataFrame(
            [item.created_at.date() for item in items], columns=["Date"]
        )
        frame["Count"] = 1
        frame["Date"] = pd.to_datetime(frame["Date"])
        monthly = frame.groupby(pd.Grouper(key="Date", freq="MS")).sum()
        return monthly.cumsum()

    english_cumulative = cumulative_by_month(datasets["english"])
    spanish_cumulative = cumulative_by_month(datasets["spanish"])

    plt.figure(figsize=(10, 6))
    series = (
        ("English", "english", english_cumulative),
        ("Spanish", "spanish", spanish_cumulative),
    )
    for label, color_key, cumulative in series:
        plt.plot(
            cumulative.index,
            cumulative["Count"],
            label=label,
            color=LANGUAGE_COLORS[color_key],
        )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(f"{output_dir}/datasets_time_series.png")
    plt.close()


def _save_stack_area(date_range, cumulative, languages, output_path):
    """Draw one stacked-area chart for *languages* and save it to *output_path*."""
    plt.figure(figsize=(10, 6))
    plt.stackplot(
        date_range,
        [cumulative[lang]["Count"].values for lang in languages],
        labels=[lang.capitalize() for lang in languages],
        colors=[LANGUAGE_COLORS[lang] for lang in languages],
    )

    plt.xlabel("Date", fontsize=10)
    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.grid(GRID)
    plt.savefig(output_path)
    plt.close()


def create_stack_area_plots(datasets, output_dir):
    """Save stacked-area charts of cumulative dataset counts per language.

    Four PNG files are written into *output_dir*: all five languages, all
    languages except English, English plus Spanish, and Spanish alone.

    Args:
        datasets: Dict mapping each of the five language names to a list of
            objects exposing a ``created_at`` datetime.
        output_dir: Directory the PNG files are written into.
    """
    languages = ["english", "spanish", "catalan", "galician", "basque"]
    dates_by_language = {
        lang: [d.created_at.date() for d in datasets[lang]] for lang in languages
    }
    all_dates = [
        date for dates in dates_by_language.values() for date in dates
    ]
    if not all_dates:
        # min()/max() would raise on an empty list; there is nothing to plot.
        print("No datasets found, skipping stack area plots...")
        return

    # Create a common monthly date range for all languages. The start is
    # snapped to the first day of its month: with freq="MS" a mid-month start
    # would make the range begin at the *next* month, and the reindex below
    # would then drop every dataset created in the earliest month.
    first_month = pd.Timestamp(min(all_dates)).replace(day=1)
    last_date = pd.Timestamp(max(all_dates))
    date_range = pd.date_range(start=first_month, end=last_date, freq="MS")

    # Cumulative monthly counts per language, aligned on the common range
    dfs = {}
    for lang, dates in dates_by_language.items():
        df = pd.DataFrame({"Date": dates})
        df["Count"] = 1
        df["Date"] = pd.to_datetime(df["Date"])
        monthly = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
        # Months without any dataset for this language count as 0
        monthly = monthly.reindex(date_range, fill_value=0)
        dfs[lang] = monthly.cumsum()

    # (output file name, languages stacked in that chart)
    charts = (
        ("datasets_stack_area.png", languages),
        (
            "datasets_stack_area_es_ca_gl_eu.png",
            ["spanish", "catalan", "galician", "basque"],
        ),
        ("datasets_stack_area_en_es.png", ["english", "spanish"]),
        ("datasets_stack_area_es.png", ["spanish"]),
    )
    for filename, subset in charts:
        _save_stack_area(date_range, dfs, subset, f"{output_dir}/{filename}")


def main():
    """Fetch the datasets and write every chart into the ``plots`` directory."""
    # Make sure the destination folder is there before anything is saved
    output_dir = "plots"
    os.makedirs(output_dir, exist_ok=True)

    print("Fetching datasets from Hugging Face Hub...")
    datasets = fetch_datasets()

    # Each step: progress message, then the plotting function to run
    steps = (
        ("Creating bar plots...", create_bar_plots),
        ("Creating pie chart...", create_pie_chart),
        ("Creating time series plots...", create_time_series),
        ("Creating stack area plots...", create_stack_area_plots),
    )
    for message, render in steps:
        print(message)
        render(datasets, output_dir)

    print(f"All visualizations have been saved to the '{output_dir}' directory")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()