Spaces:

mariagrandury
/

language-gap-in-hf-hub

Sleeping

File size: 29,481 Bytes

{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Language gap in the Hugging Face Hub\n",
        "\n",
        "<a target=\"_blank\" href=\"https://colab.research.google.com/drive/16KNpk25dQR9sdo7FSTONCIyS2Uvf0cOO?usp=sharing\">\n",
        "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
        "</a>"
      ],
      "metadata": {
        "id": "jgtFu9csb5kY"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bCPvBCk_VLoi",
        "outputId": "4e3e86c5-36bb-4f42-8777-9762373251ff"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.20.3)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (3.14.0)\n",
            "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2023.6.0)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2.31.0)\n",
            "Requirement already satisfied: tqdm>=4.42.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.66.4)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (6.0.1)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.11.0)\n",
            "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (24.0)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (3.7)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface_hub) (2024.2.2)\n"
          ]
        }
      ],
      "source": [
        "%pip install huggingface_hub"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NbQeXxudVJW9"
      },
      "outputs": [],
      "source": [
        "from datetime import datetime\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import pandas as pd\n",
        "from huggingface_hub import HfApi\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ogyTHBYJVZ8I",
        "outputId": "0590665f-c62d-4c2b-8195-1367995bc01a"
      },
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
            "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
            "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
            "You will be able to reuse this secret in all of your notebooks.\n",
            "Please note that authentication is recommended but still optional to access public models or datasets.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "146571\n"
          ]
        }
      ],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# `list_datasets` hands back a lazy listing, so materialise it once here:\n",
        "# counting it through `len(list(...))` would leave `all_datasets` used up.\n",
        "all_datasets = list(hf_api.list_datasets(full=True))\n",
        "\n",
        "# Total number of datasets currently listed on the Hub\n",
        "total_count = len(all_datasets)\n",
        "print(total_count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GXDMUU-4XmaI"
      },
      "outputs": [],
      "source": [
        "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
        "\n",
        "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
        "\n",
        "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
        "\n",
        "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
        "\n",
        "#for dataset in spanish_only_datasets:\n",
        "#    print(dataset)\n",
        "#    break"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "pjCvHVq_hChx",
        "outputId": "d37a955e-9ee0-4d0f-e738-11a376377770"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "331\n"
          ]
        }
      ],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# Re-fetch the listing for this pass, as every filtering cell in this notebook does.\n",
        "all_datasets = hf_api.list_datasets(full=True)\n",
        "\n",
        "# Monolingual Spanish: the dataset's `language:*` tags are exactly {`language:es`}.\n",
        "# The count follows the live Hub, so no expected value is hard-coded here.\n",
        "spanish_datasets = [\n",
        "    d for d in all_datasets\n",
        "    if {tag for tag in d.tags if tag.startswith(\"language:\")} == {\"language:es\"}\n",
        "]\n",
        "spanish_count = len(spanish_datasets)\n",
        "print(spanish_count)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "WANGkTpGRw8t",
        "outputId": "0143ae40-510b-4da2-9e22-47f2af90759a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "8442\n"
          ]
        }
      ],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# Re-fetch the listing for this pass, as every filtering cell in this notebook does.\n",
        "all_datasets = hf_api.list_datasets(full=True)\n",
        "\n",
        "# Monolingual English: the dataset's `language:*` tags are exactly {`language:en`}.\n",
        "english_datasets = [\n",
        "    d for d in all_datasets\n",
        "    if {tag for tag in d.tags if tag.startswith(\"language:\")} == {\"language:en\"}\n",
        "]\n",
        "english_count = len(english_datasets)\n",
        "print(english_count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "background_save": true
        },
        "id": "yPtF0G7SWS53",
        "outputId": "18a9515e-eeb7-4eb8-f734-c195b15c011a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "577\n"
          ]
        }
      ],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# Re-fetch the listing for this pass, as every filtering cell in this notebook does.\n",
        "all_datasets = hf_api.list_datasets(full=True)\n",
        "\n",
        "# Monolingual Chinese: the dataset's `language:*` tags are exactly {`language:zh`}.\n",
        "chinese_datasets = [\n",
        "    d for d in all_datasets\n",
        "    if {tag for tag in d.tags if tag.startswith(\"language:\")} == {\"language:zh\"}\n",
        "]\n",
        "chinese_count = len(chinese_datasets)\n",
        "print(chinese_count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "background_save": true
        },
        "id": "RlxAlOOsW7p9",
        "outputId": "71ff74e7-cd4e-4b39-aa8b-a22e21130f4e"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "438\n"
          ]
        }
      ],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# Re-fetch the listing for this pass, as every filtering cell in this notebook does.\n",
        "all_datasets = hf_api.list_datasets(full=True)\n",
        "\n",
        "# Monolingual French: the dataset's `language:*` tags are exactly {`language:fr`}.\n",
        "french_datasets = [\n",
        "    d for d in all_datasets\n",
        "    if {tag for tag in d.tags if tag.startswith(\"language:\")} == {\"language:fr\"}\n",
        "]\n",
        "french_count = len(french_datasets)\n",
        "print(french_count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OMQfBXjUYBPz"
      },
      "outputs": [],
      "source": [
        "hf_api = HfApi()\n",
        "\n",
        "# Re-fetch the listing for this pass, as every filtering cell in this notebook does.\n",
        "all_datasets = hf_api.list_datasets(full=True)\n",
        "\n",
        "# Monolingual (any language): exactly one tag carries the `language:` prefix.\n",
        "mono_datasets = [\n",
        "    dataset for dataset in all_datasets\n",
        "    if sum(tag.startswith('language:') for tag in dataset.tags) == 1\n",
        "]\n",
        "mono_count = len(mono_datasets)\n",
        "print(mono_count)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sTPechkdWmYS"
      },
      "outputs": [],
      "source": [
        "# Creation day of every monolingual Spanish / English dataset (`created_at` reduced via `.date()`).\n",
        "# The list lengths follow the live Hub counts printed above, so they are not pinned here.\n",
        "\n",
        "creation_dates_spanish = [info.created_at.date() for info in spanish_datasets]\n",
        "\n",
        "creation_dates_english = [info.created_at.date() for info in english_datasets]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hefZVynDSjjE"
      },
      "outputs": [],
      "source": [
        "# Spot-check: show the first extracted Spanish creation date.\n",
        "print(creation_dates_spanish[0])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "aFaEBlkkSbrs"
      },
      "source": [
        "## Bar Chart\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dYJ2zd4dShYh"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per calendar year, over every year seen in either language\n",
        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
        "english_counts = Counter(date.year for date in creation_dates_english)\n",
        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
        "english_per_year = [english_counts[year] for year in years]\n",
        "spanish_per_year = [spanish_counts[year] for year in years]\n",
        "\n",
        "# Stacked bars: for each year the Spanish bar sits on top of the English one\n",
        "plt.figure(figsize=(8, 5))\n",
        "plt.bar(years, english_per_year, width=0.4, label='English Datasets', color='blue')\n",
        "plt.bar(years, spanish_per_year, width=0.4, label='Spanish Datasets', color='orange', bottom=english_per_year)\n",
        "\n",
        "# Adding labels and title\n",
        "plt.xlabel('Year', fontsize=10)\n",
        "plt.ylabel('Number of Datasets', fontsize=10)\n",
        "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
        "plt.xticks(years, fontsize=10)\n",
        "plt.legend()\n",
        "\n",
        "plt.grid(True)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"bar_chart_vertical.png\")\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wViEE4wCUVgs"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per calendar year, over every year seen in either language\n",
        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
        "english_counts = Counter(date.year for date in creation_dates_english)\n",
        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
        "\n",
        "# Each year gets one slot on the x axis; the two bars are shifted half a bar\n",
        "# width to either side of the slot centre so they sit next to each other.\n",
        "bar_width = 0.4\n",
        "years_index = np.arange(len(years))\n",
        "\n",
        "# Plotting the side-by-side bar chart\n",
        "plt.figure(figsize=(8, 5))\n",
        "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
        "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
        "\n",
        "# Adding labels and title\n",
        "plt.xlabel('Year', fontsize=10)\n",
        "plt.ylabel('Number of Datasets', fontsize=10)\n",
        "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
        "plt.xticks(years_index, years, fontsize=10)\n",
        "plt.legend()\n",
        "\n",
        "plt.grid(True)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"bar_chart_horizontal.png\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Hp8vNA6LUA1E"
      },
      "source": [
        "## Stacked Area Chart\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CWgCunzGUCot"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per calendar year, over every year seen in either language\n",
        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
        "english_counts = Counter(date.year for date in creation_dates_english)\n",
        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
        "\n",
        "# Running totals: start from the per-year counts, then add each previous total in place\n",
        "english_datasets_cumulative = [english_counts[year] for year in years]\n",
        "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
        "for i in range(1, len(years)):\n",
        "    english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
        "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
        "\n",
        "# Plotting the stacked area chart\n",
        "plt.figure(figsize=(8, 5))\n",
        "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
        "\n",
        "# Adding labels and title\n",
        "plt.xlabel('Year', fontsize=10)\n",
        "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
        "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
        "plt.xticks(years, fontsize=10)\n",
        "plt.legend(loc='upper left')\n",
        "\n",
        "plt.grid(True)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"stack_area_1.png\")\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GwRpZwYWhau3"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per (year, month)\n",
        "english_counts = Counter((date.year, date.month) for date in creation_dates_english)\n",
        "spanish_counts = Counter((date.year, date.month) for date in creation_dates_spanish)\n",
        "\n",
        "# One single-column frame per language, indexed by (Year, Month)\n",
        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
        "\n",
        "# Union of months; a month missing for one language counts as 0. Sort explicitly,\n",
        "# because the cumulative sum below is only meaningful in chronological order.\n",
        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0).sort_index()\n",
        "\n",
        "# (Year, Month) -> first day of that month, written as an unambiguous ISO date\n",
        "df.index = pd.to_datetime([f'{year}-{month:02d}-01' for year, month in df.index])\n",
        "\n",
        "# Running total of datasets per language\n",
        "df_cumulative = df.cumsum()\n",
        "\n",
        "# Plotting the stacked area chart\n",
        "# NOTE(review): colours are swapped relative to the other charts (English is blue there) - confirm this is intended.\n",
        "plt.figure(figsize=(8, 5))\n",
        "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
        "\n",
        "# Adding labels and title\n",
        "plt.xlabel('Creation date', fontsize=10)\n",
        "plt.ylabel('Cumulative number of monolingual datasets', fontsize=10)\n",
        "#plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
        "\n",
        "plt.xticks(rotation=45, fontsize=10)\n",
        "plt.legend(loc='upper left')\n",
        "plt.grid(False)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"stack_area_2.png\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kJQ0OgRtglOQ"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per (year, month)\n",
        "english_counts = Counter((date.year, date.month) for date in creation_dates_english)\n",
        "spanish_counts = Counter((date.year, date.month) for date in creation_dates_spanish)\n",
        "\n",
        "# One single-column frame per language, indexed by (Year, Month)\n",
        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
        "\n",
        "# Union of months in chronological order; a month missing for one language counts as 0\n",
        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0).sort_index()\n",
        "\n",
        "# (Year, Month) -> first day of that month, written as an unambiguous ISO date\n",
        "df.index = pd.to_datetime([f'{year}-{month:02d}-01' for year, month in df.index])\n",
        "\n",
        "# Plotting the stacked area chart of per-month counts (no running total is taken here)\n",
        "plt.figure(figsize=(10, 6))\n",
        "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
        "\n",
        "# Adding labels; the y label names what is plotted: monthly counts, not a cumulative total\n",
        "plt.xlabel('Date', fontsize=10)\n",
        "plt.ylabel('Number of Datasets Created per Month', fontsize=10)\n",
        "#plt.title('Monthly Creation of Monolingual English and Spanish Datasets Over Time')\n",
        "\n",
        "plt.xticks(rotation=45, fontsize=10)\n",
        "plt.legend(loc='upper left')\n",
        "plt.grid(True)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"stack_area_3.png\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "IAnFHiPlgnRE"
      },
      "source": [
        "## Pie Chart"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8tKR1x-kVeZT"
      },
      "outputs": [],
      "source": [
        "# Monolingual datasets in any language other than the four broken out below\n",
        "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
        "\n",
        "# Pie chart data (labels, sizes and colours are matched by position)\n",
        "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
        "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
        "\n",
        "# Plotting the pie chart\n",
        "plt.figure(figsize=(8, 8))\n",
        "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
        "#plt.title('Distribution of Monolingual Datasets by Language')\n",
        "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"pie_chart.png\")\n",
        "plt.show()\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "z2xf8FrHROxy"
      },
      "source": [
        "## Time series plot"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DuPFSZKUhyQj"
      },
      "outputs": [],
      "source": [
        "# Cumulative number of monolingual Spanish datasets, by month of creation\n",
        "\n",
        "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
        "df[\"Count\"] = 1\n",
        "# Ensure the 'Date' column is of type datetime so it can be grouped by month\n",
        "df['Date'] = pd.to_datetime(df['Date'])\n",
        "# Month-start bins (\"MS\"): datasets created per month, then the running total\n",
        "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
        "\n",
        "# Plot the data\n",
        "plt.figure(figsize=(10, 6))\n",
        "plt.plot(\n",
        "    df.index,\n",
        "    df[\"Count\"],\n",
        "    color=\"g\"\n",
        ")\n",
        "# Title left unset, as in the other figures of this notebook; axis labels are in Spanish\n",
        "plt.xlabel(\"Fecha\", fontsize=10)\n",
        "plt.ylabel(\"Número de bases de datos\", fontsize=10)\n",
        "plt.grid(True)\n",
        "plt.xticks(rotation=45, fontsize=10)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"time_series_1.png\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-Vu3PIe2hITq"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per (year, month)\n",
        "english_counts = Counter((date.year, date.month) for date in creation_dates_english)\n",
        "spanish_counts = Counter((date.year, date.month) for date in creation_dates_spanish)\n",
        "\n",
        "# One single-column frame per language, indexed by (Year, Month)\n",
        "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
        "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
        "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
        "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
        "\n",
        "# Union of months; a month missing for one language counts as 0. Sort explicitly,\n",
        "# because the cumulative sum below is only meaningful in chronological order.\n",
        "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0).sort_index()\n",
        "\n",
        "# (Year, Month) -> first day of that month, written as an unambiguous ISO date\n",
        "df.index = pd.to_datetime([f'{year}-{month:02d}-01' for year, month in df.index])\n",
        "\n",
        "# Running total of datasets per language\n",
        "df_cumulative = df.cumsum()\n",
        "\n",
        "# Plotting the cumulative chart, one line per language\n",
        "plt.figure(figsize=(10, 6))\n",
        "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
        "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
        "\n",
        "# Adding labels and title\n",
        "plt.xlabel('Date', fontsize=10)\n",
        "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
        "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
        "\n",
        "plt.xticks(rotation=45, fontsize=10)\n",
        "plt.legend(loc='upper left')\n",
        "plt.grid(True)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"time_series_2.png\")\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KG__of2IfdHu"
      },
      "outputs": [],
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Creation dates of the monolingual English / Spanish datasets collected above\n",
        "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
        "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
        "\n",
        "# Datasets created per calendar year, over every year seen in either language\n",
        "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
        "english_counts = Counter(date.year for date in creation_dates_english)\n",
        "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
        "\n",
        "# Per-year counts as Series indexed by year (a year absent from a Counter reads as 0)\n",
        "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
        "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
        "\n",
        "# Plotting the time series\n",
        "plt.figure(figsize=(10, 6))\n",
        "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
        "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
        "\n",
        "# Adding labels and title\n",
        "#plt.title('Evolution of English and Spanish Datasets Over Time')\n",
        "plt.xlabel('Year', fontsize=10)\n",
        "plt.ylabel('Number of Datasets', fontsize=10)\n",
        "plt.legend()\n",
        "plt.grid(True)\n",
        "# Pin the ticks to the actual years, as the other yearly charts do, so the axis\n",
        "# does not show fractional years.\n",
        "plt.xticks(years, rotation=45, fontsize=10)\n",
        "plt.tight_layout()\n",
        "\n",
        "# Save before show(): in a notebook the figure is closed once it has been shown,\n",
        "# so calling savefig() afterwards would write an empty canvas.\n",
        "plt.savefig(\"time_series_3.png\")\n",
        "plt.show()"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}