Spaces:

mariagrandury
/

language-gap-in-hf-hub

Sleeping

App Files Files Community

language-gap-in-hf-hub / hub_datasets_by_language.py

mariagrandury

implement script and add languages from Spain

30918aa 12 days ago

raw

history blame contribute delete

11 kB

	import os
	import pickle
	from collections import Counter
	from datetime import datetime

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	from huggingface_hub import HfApi

	# Plot color for each language. The keys are the lowercase language names
	# that the rest of this script uses to index the fetched datasets.
	LANGUAGE_COLORS = dict(
	    english="orange",
	    spanish="blue",
	    catalan="red",
	    galician="green",
	    basque="purple",
	)

	# Value handed to plt.grid() by the plotting functions (False = no grid lines).
	GRID = False


	# Hub language tag -> key used for that language in the returned dictionary.
	_LANGUAGE_BY_TAG = {
	    "language:en": "english",
	    "language:es": "spanish",
	    "language:ca": "catalan",
	    "language:gl": "galician",
	    "language:eu": "basque",
	}

	# A cache file older than this many seconds is ignored and rebuilt.
	_CACHE_MAX_AGE_SECONDS = 24 * 3600


	def _load_fresh_cache(cache_file):
	    """Return the cached result, or None if it is missing, stale or unreadable."""
	    if not os.path.exists(cache_file):
	        print("No cache found, fetching datasets from Hugging Face Hub...")
	        return None

	    cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
	    if cache_age >= _CACHE_MAX_AGE_SECONDS:
	        print("Cache is older than 24 hours, fetching fresh data...")
	        return None

	    print("Loading datasets from cache...")
	    try:
	        # NOTE(security): unpickling can execute arbitrary code. Only point
	        # cache_file at a file that this script wrote itself.
	        with open(cache_file, "rb") as f:
	            return pickle.load(f)
	    except (
	        pickle.UnpicklingError,
	        EOFError,
	        AttributeError,
	        ImportError,
	        IndexError,
	    ) as err:
	        # A truncated or incompatible cache should not abort the run; the
	        # caller rebuilds it from the Hub instead.
	        print(f"Cache could not be read ({err!r}), fetching fresh data...")
	        return None


	def fetch_datasets(cache_file="datasets_cache.pkl"):
	    """Fetch the monolingual datasets of each tracked language, with caching.

	    A dataset is kept for a language only when that language's tag is the
	    sole ``language:`` tag it carries; multilingual and untagged datasets
	    are discarded.

	    Args:
	        cache_file: Path of the pickle file used as a cache. It is reused
	            when it exists, is less than 24 hours old and can be read;
	            otherwise it is (re)written after fetching from the Hub.

	    Returns:
	        A dict with the keys "english", "spanish", "catalan", "galician"
	        and "basque" (in that order), each mapping to the list of matching
	        dataset objects in the order the Hub returned them.
	    """
	    cached = _load_fresh_cache(cache_file)
	    if cached is not None:
	        return cached

	    hf_api = HfApi()
	    filtered_datasets = {language: [] for language in _LANGUAGE_BY_TAG.values()}

	    # Single pass over the listing: every dataset is classified once and the
	    # full listing is never materialised in memory.
	    for dataset in hf_api.list_datasets(full=True):
	        # `tags` may be None for a dataset without tags; treat it as empty.
	        language_tags = {
	            tag for tag in (dataset.tags or []) if tag.startswith("language:")
	        }
	        if len(language_tags) != 1:
	            continue  # untagged or multilingual
	        language = _LANGUAGE_BY_TAG.get(next(iter(language_tags)))
	        if language is not None:
	            filtered_datasets[language].append(dataset)

	    # Cache the filtered datasets
	    print("Saving datasets to cache...")
	    with open(cache_file, "wb") as f:
	        pickle.dump(filtered_datasets, f)

	    return filtered_datasets


	def create_bar_plots(datasets, output_dir):
	    """Save two bar charts of English vs. Spanish datasets created per year.

	    ``bar_plot_horizontal.png`` draws the two languages side by side for
	    each year; ``bar_plot_vertical.png`` stacks the Spanish bar on top of
	    the English one. Both files are written into ``output_dir``.
	    """
	    # Datasets created per calendar year, for each of the two languages.
	    english_counts = Counter(d.created_at.year for d in datasets["english"])
	    spanish_counts = Counter(d.created_at.year for d in datasets["spanish"])

	    # Every year in which either language has at least one dataset.
	    years = sorted(set(english_counts) | set(spanish_counts))
	    english_heights = [english_counts[year] for year in years]
	    spanish_heights = [spanish_counts[year] for year in years]

	    # Chart 1: grouped bars, English left of each tick and Spanish right.
	    plt.figure(figsize=(8, 5))
	    bar_width = 0.4
	    positions = np.arange(len(years))
	    half_width = bar_width / 2

	    plt.bar(
	        positions - half_width,
	        english_heights,
	        width=bar_width,
	        label="English",
	        color=LANGUAGE_COLORS["english"],
	    )
	    plt.bar(
	        positions + half_width,
	        spanish_heights,
	        width=bar_width,
	        label="Spanish",
	        color=LANGUAGE_COLORS["spanish"],
	    )

	    plt.xlabel("Year", fontsize=10)
	    plt.ylabel("Number of Datasets", fontsize=10)
	    plt.xticks(positions, years, fontsize=10)
	    plt.legend()
	    plt.grid(GRID)
	    plt.tight_layout()
	    plt.savefig(f"{output_dir}/bar_plot_horizontal.png")
	    plt.close()

	    # Chart 2: stacked bars, Spanish drawn on top of the English total.
	    plt.figure(figsize=(8, 5))
	    plt.bar(
	        years,
	        english_heights,
	        width=0.4,
	        label="English",
	        color=LANGUAGE_COLORS["english"],
	    )
	    plt.bar(
	        years,
	        spanish_heights,
	        width=0.4,
	        label="Spanish",
	        color=LANGUAGE_COLORS["spanish"],
	        bottom=english_heights,
	    )

	    plt.xlabel("Year", fontsize=10)
	    plt.ylabel("Number of Datasets", fontsize=10)
	    plt.xticks(years, fontsize=10)
	    plt.legend()
	    plt.tight_layout()
	    plt.grid(GRID)
	    plt.savefig(f"{output_dir}/bar_plot_vertical.png")
	    plt.close()


	def create_pie_chart(datasets, output_dir):
	    """Save ``pie_chart.png``: each language's share of the fetched datasets.

	    One wedge per language, sized by the number of datasets stored under
	    that language's key in ``datasets``.
	    """
	    languages = ("english", "spanish", "catalan", "galician", "basque")

	    # Parallel lists, one entry per wedge, all in the same language order.
	    wedge_sizes = [len(datasets[lang]) for lang in languages]
	    wedge_labels = [lang.capitalize() for lang in languages]
	    wedge_colors = [LANGUAGE_COLORS[lang] for lang in languages]

	    plt.figure(figsize=(8, 8))
	    plt.pie(
	        wedge_sizes,
	        labels=wedge_labels,
	        autopct="%1.1f%%",
	        startangle=180,
	        colors=wedge_colors,
	    )
	    plt.axis("equal")
	    plt.savefig(f"{output_dir}/pie_chart.png")
	    plt.close()


	def create_time_series(datasets, output_dir):
	    """Save ``time_series.png``: cumulative English and Spanish dataset counts.

	    Creation dates are bucketed by month and accumulated, then drawn as one
	    line per language.
	    """

	    def cumulative_by_month(records):
	        # One row per dataset; summing "Count" per month-start bucket and
	        # accumulating gives the running total of datasets.
	        frame = pd.DataFrame(
	            [record.created_at.date() for record in records], columns=["Date"]
	        )
	        frame["Count"] = 1
	        frame["Date"] = pd.to_datetime(frame["Date"])
	        monthly = frame.groupby(pd.Grouper(key="Date", freq="MS")).sum()
	        return monthly.cumsum()

	    plotted_languages = ("english", "spanish")
	    running_totals = {
	        lang: cumulative_by_month(datasets[lang]) for lang in plotted_languages
	    }

	    plt.figure(figsize=(10, 6))
	    for lang in plotted_languages:
	        totals = running_totals[lang]
	        plt.plot(
	            totals.index,
	            totals["Count"],
	            label=lang.capitalize(),
	            color=LANGUAGE_COLORS[lang],
	        )

	    plt.xlabel("Date", fontsize=10)
	    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
	    plt.xticks(rotation=45, fontsize=10)
	    plt.legend(loc="upper left")
	    plt.tight_layout()
	    plt.grid(GRID)
	    plt.savefig(f"{output_dir}/time_series.png")
	    plt.close()


	def _save_stack_area(date_range, cumulative, languages, path):
	    """Draw one stacked-area chart for ``languages`` and save it to ``path``."""
	    plt.figure(figsize=(10, 6))
	    plt.stackplot(
	        date_range,
	        [cumulative[lang]["Count"].values for lang in languages],
	        labels=[lang.capitalize() for lang in languages],
	        colors=[LANGUAGE_COLORS[lang] for lang in languages],
	    )

	    plt.xlabel("Date", fontsize=10)
	    plt.ylabel("Cumulative Number of Datasets", fontsize=10)
	    plt.xticks(rotation=45, fontsize=10)
	    plt.legend(loc="upper left")
	    plt.tight_layout()
	    plt.grid(GRID)
	    plt.savefig(path)
	    plt.close()


	def create_stack_area_plots(datasets, output_dir):
	    """Save stacked-area charts of cumulative dataset counts per month.

	    Four files are written into ``output_dir``:

	    * ``stack_area.png`` - all five languages
	    * ``stack_area_es_ca_gl_eu.png`` - every language except English
	    * ``stack_area_en_es.png`` - English and Spanish
	    * ``stack_area_es.png`` - Spanish only

	    All charts share one monthly date axis that spans the earliest to the
	    latest creation date across all languages. Nothing is written when
	    ``datasets`` contains no dataset at all.
	    """
	    languages = ["english", "spanish", "catalan", "galician", "basque"]
	    dates_by_language = {
	        lang: [d.created_at.date() for d in datasets[lang]] for lang in languages
	    }
	    all_dates = [
	        date for lang in languages for date in dates_by_language[lang]
	    ]
	    if not all_dates:
	        # min()/max() would raise on an empty list; there is nothing to plot.
	        print("No datasets found, skipping stack area plots...")
	        return

	    # Anchor the axis to the first day of the earliest month. With
	    # freq="MS", date_range begins at the first month start on or after
	    # `start`, so a mid-month start would leave out the earliest monthly
	    # bucket and the reindex below would drop those datasets from every
	    # cumulative total.
	    first_month = min(all_dates).replace(day=1)
	    date_range = pd.date_range(start=first_month, end=max(all_dates), freq="MS")

	    # Cumulative monthly count per language, aligned on the shared axis.
	    cumulative = {}
	    for lang in languages:
	        df = pd.DataFrame({"Date": dates_by_language[lang]})
	        df["Count"] = 1
	        df["Date"] = pd.to_datetime(df["Date"])
	        monthly = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
	        # Months without any new dataset count as 0 before accumulating.
	        monthly = monthly.reindex(date_range, fill_value=0)
	        cumulative[lang] = monthly.cumsum()

	    charts = [
	        (languages, "stack_area.png"),
	        (["spanish", "catalan", "galician", "basque"], "stack_area_es_ca_gl_eu.png"),
	        (["english", "spanish"], "stack_area_en_es.png"),
	        (["spanish"], "stack_area_es.png"),
	    ]
	    for chart_languages, filename in charts:
	        _save_stack_area(
	            date_range, cumulative, chart_languages, f"{output_dir}/{filename}"
	        )


	def main():
	    """Fetch the datasets and write every visualization into ``plots/``."""
	    output_dir = "plots"
	    # exist_ok=True makes re-runs reuse an already existing directory.
	    os.makedirs(output_dir, exist_ok=True)

	    print("Fetching datasets from Hugging Face Hub...")
	    datasets = fetch_datasets()

	    # (progress message, plotting function) pairs, run in this order.
	    steps = (
	        ("Creating bar plots...", create_bar_plots),
	        ("Creating pie chart...", create_pie_chart),
	        ("Creating time series plots...", create_time_series),
	        ("Creating stack area plots...", create_stack_area_plots),
	    )
	    for message, make_plots in steps:
	        print(message)
	        make_plots(datasets, output_dir)

	    print(f"All visualizations have been saved to the '{output_dir}' directory")


	if __name__ == "__main__":
	    main()