Spaces:

mariagrandury
/

language-gap-in-hf-hub

Running

App Files Community

mariagrandury committed on May 29

Commit

7cb31c4

•

1 Parent(s): 1f70be8

create notebook and add plots

Browse files

Files changed (9) hide show

app.py +5 -8
numero_datasets_hub.ipynb → hub_datasets_by_language.ipynb +97 -108
numero_datasets_hub_output.ipynb +0 -918
plots/bar_plot_horizontal.png +0 -0
plots/bar_plot_vertical.png +0 -0
plots/datasets_hub.png +0 -0
plots/stack_area.png +0 -0
plots/stack_area_es.png +0 -0
plots/time_series.png +0 -0

app.py CHANGED Viewed

@@ -25,7 +25,6 @@ def create_app():
         run_button.click(run_notebook, outputs=output_label)
-        # Create a 2x2 grid for images
         with gr.Row():
             with gr.Column():
                 image1 = gr.Image(
@@ -49,23 +48,21 @@ def create_app():
                     label="Image 4",
                 )
-        # Description for images
         gr.Markdown("### Image Descriptions")
         gr.Markdown("Description for Image 1")
         gr.Markdown("Description for Image 2")
         gr.Markdown("Description for Image 3")
         gr.Markdown("Description for Image 4")
-        # Collapsible block for citation
         with gr.Accordion("Citation Information"):
             gr.Markdown(
                 """
-                        If you use the images or code please cite:
-                        ```
-                        fjdlsafd
-                        ```
-                        """
             )
     return app

         run_button.click(run_notebook, outputs=output_label)
         with gr.Row():
             with gr.Column():
                 image1 = gr.Image(
                     label="Image 4",
                 )
         gr.Markdown("### Image Descriptions")
         gr.Markdown("Description for Image 1")
         gr.Markdown("Description for Image 2")
         gr.Markdown("Description for Image 3")
         gr.Markdown("Description for Image 4")
         with gr.Accordion("Citation Information"):
             gr.Markdown(
                 """
+                If you use the images or code please cite:
+                ```
+                fjdlsafd
+                ```
+                """
             )
     return app

numero_datasets_hub.ipynb → hub_datasets_by_language.ipynb RENAMED Viewed

@@ -1,14 +1,27 @@
 {
   "cells": [
     {
       "cell_type": "code",
-      "execution_count": 1,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "bCPvBCk_VLoi",
-        "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba"
       },
       "outputs": [
         {
@@ -36,7 +49,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 2,
       "metadata": {
         "id": "NbQeXxudVJW9"
       },
@@ -51,13 +64,13 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 3,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "ogyTHBYJVZ8I",
-        "outputId": "f23a554a-7328-4e50-d87c-90368294467d"
       },
       "outputs": [
         {
@@ -76,7 +89,7 @@
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "145101\n"
           ]
         }
       ],
@@ -91,7 +104,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
       "metadata": {
         "id": "GXDMUU-4XmaI"
       },
@@ -112,20 +125,20 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "pjCvHVq_hChx",
-        "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "318\n"
           ]
         }
       ],
@@ -142,20 +155,20 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "WANGkTpGRw8t",
-        "outputId": "da8931bf-7ae2-438d-8188-20190f568193"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "8357\n"
           ]
         }
       ],
@@ -172,20 +185,20 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
       "metadata": {
         "colab": {
-          "base_uri": "https://localhost:8080/"
         },
         "id": "yPtF0G7SWS53",
-        "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "568\n"
           ]
         }
       ],
@@ -202,20 +215,20 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 8,
       "metadata": {
         "colab": {
-          "base_uri": "https://localhost:8080/"
         },
         "id": "RlxAlOOsW7p9",
-        "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "436\n"
           ]
         }
       ],
@@ -232,23 +245,11 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "OMQfBXjUYBPz",
-        "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed"
       },
-      "outputs": [
-        {
-          "name": "stdout",
-          "output_type": "stream",
-          "text": [
-            "13886\n"
-          ]
-        }
-      ],
       "source": [
         "hf_api = HfApi()\n",
         "\n",
@@ -262,36 +263,19 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
       "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 180
-        },
-        "id": "sTPechkdWmYS",
-        "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3"
       },
-      "outputs": [
-        {
-          "ename": "AssertionError",
-          "evalue": "",
-          "output_type": "error",
-          "traceback": [
-            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-            "\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
-            "\u001b[0;32m<ipython-input-10-da38b5a6b412>\u001b[0m in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mcreation_dates_english\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreated_at\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menglish_datasets\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreation_dates_english\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m8336\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-            "\u001b[0;31mAssertionError\u001b[0m: "
-          ]
-        }
-      ],
       "source": [
         "# Extract creation date\n",
         "\n",
         "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-        "assert len(creation_dates_spanish) == 318\n",
         "\n",
         "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-        "assert len(creation_dates_english) == 8336"
       ]
     },
     {
@@ -336,22 +320,23 @@
         "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
         "\n",
         "# Plotting the bar chart\n",
-        "plt.figure(figsize=(10, 6))\n",
         "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
         "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Year')\n",
-        "plt.ylabel('Number of Datasets')\n",
-        "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
-        "plt.xticks(years)\n",
         "plt.legend()\n",
         "\n",
         "# Display the plot\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
-        "plt.savefig(\"plots/bar_stack.png\")\n"
       ]
     },
     {
@@ -382,22 +367,23 @@
         "years_index = np.arange(len(years))\n",
         "\n",
         "# Plotting the side-by-side bar chart\n",
-        "plt.figure(figsize=(10, 6))\n",
         "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
         "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Year')\n",
-        "plt.ylabel('Number of Datasets')\n",
-        "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
-        "plt.xticks(years_index, years)\n",
         "plt.legend()\n",
         "\n",
         "# Display the plot\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
-        "plt.savefig(\"plots/bar_width.png\")"
       ]
     },
     {
@@ -437,14 +423,14 @@
         "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
         "\n",
         "# Plotting the stacked area chart\n",
-        "plt.figure(figsize=(10, 6))\n",
         "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Year')\n",
-        "plt.ylabel('Cumulative Number of Datasets')\n",
-        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
-        "plt.xticks(years)\n",
         "plt.legend(loc='upper left')\n",
         "\n",
         "# Display the plot\n",
@@ -452,7 +438,7 @@
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
-        "plt.savefig(\"plots/stack_area_1.png\")"
       ]
     },
     {
@@ -503,18 +489,18 @@
         "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Creation date')\n",
-        "plt.ylabel('Cumulative number of monolingual datasets')\n",
-        "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
         "\n",
         "# Display the plot\n",
-        "plt.xticks(rotation=45)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(False)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
-        "plt.savefig(\"plots/stack_area_2.png\")"
       ]
     },
     {
@@ -562,18 +548,18 @@
         "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Date')\n",
-        "plt.ylabel('Cumulative Number of Datasets')\n",
-        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
         "\n",
         "# Display the plot\n",
-        "plt.xticks(rotation=45)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
-        "plt.savefig(\"plots/stack_area_3.png\")"
       ]
     },
     {
@@ -606,13 +592,13 @@
         "# Plotting the pie chart\n",
         "plt.figure(figsize=(8, 8))\n",
         "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
-        "plt.title('Distribution of Monolingual Datasets by Language')\n",
         "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
         "\n",
         "# Display the plot\n",
         "plt.show()\n",
         "\n",
-        "plt.savefig(\"plots/pie_chart.png\")"
       ]
     },
     {
@@ -649,13 +635,15 @@
         "    #marker=\"o\",\n",
         "    color=\"g\"\n",
         ")\n",
-        "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
-        "plt.xlabel(\"Fecha\")\n",
-        "plt.ylabel(\"Número de bases de datos\")\n",
         "plt.grid(True)\n",
-        "plt.xticks(rotation=45)\n",
         "plt.tight_layout()\n",
-        "plt.show()"
       ]
     },
     {
@@ -707,16 +695,18 @@
         "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.xlabel('Date')\n",
-        "plt.ylabel('Cumulative Number of Datasets')\n",
-        "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
         "\n",
         "# Display the plot\n",
-        "plt.xticks(rotation=45)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
-        "plt.show()\n"
       ]
     },
     {
@@ -750,21 +740,21 @@
         "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
-        "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
-        "plt.xlabel('Year')\n",
-        "plt.ylabel('Number of Datasets')\n",
         "plt.legend()\n",
         "plt.grid(True)\n",
-        "plt.xticks(rotation=45)\n",
         "plt.tight_layout()\n",
-        "plt.show()\n"
       ]
     }
   ],
   "metadata": {
-    "accelerator": "GPU",
     "colab": {
-      "gpuType": "T4",
       "provenance": []
     },
     "kernelspec": {
@@ -772,10 +762,9 @@
       "name": "python3"
     },
     "language_info": {
-      "name": "python",
-      "version": "3.11.6"
     }
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}

 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# Language gap in the Hugging Face Hub\n",
+        "\n",
+        "<a target=\"_blank\" href=\"https://colab.research.google.com/drive/16KNpk25dQR9sdo7FSTONCIyS2Uvf0cOO?usp=sharing\">\n",
+        "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+        "</a>"
+      ],
+      "metadata": {
+        "id": "jgtFu9csb5kY"
+      }
+    },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "bCPvBCk_VLoi",
+        "outputId": "4e3e86c5-36bb-4f42-8777-9762373251ff"
       },
       "outputs": [
         {
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "NbQeXxudVJW9"
       },
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "ogyTHBYJVZ8I",
+        "outputId": "0590665f-c62d-4c2b-8195-1367995bc01a"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "146571\n"
           ]
         }
       ],
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "id": "GXDMUU-4XmaI"
       },
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "pjCvHVq_hChx",
+        "outputId": "d37a955e-9ee0-4d0f-e738-11a376377770"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "331\n"
           ]
         }
       ],
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
         },
         "id": "WANGkTpGRw8t",
+        "outputId": "0143ae40-510b-4da2-9e22-47f2af90759a"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "8442\n"
           ]
         }
       ],
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
+          "background_save": true
         },
         "id": "yPtF0G7SWS53",
+        "outputId": "18a9515e-eeb7-4eb8-f734-c195b15c011a"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "577\n"
           ]
         }
       ],
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
         "colab": {
+          "background_save": true
         },
         "id": "RlxAlOOsW7p9",
+        "outputId": "71ff74e7-cd4e-4b39-aa8b-a22e21130f4e"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
+            "438\n"
           ]
         }
       ],
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
+        "id": "OMQfBXjUYBPz"
       },
+      "outputs": [],
       "source": [
         "hf_api = HfApi()\n",
         "\n",
     },
     {
       "cell_type": "code",
+      "execution_count": null,
       "metadata": {
+        "id": "sTPechkdWmYS"
       },
+      "outputs": [],
       "source": [
         "# Extract creation date\n",
         "\n",
         "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
+        "#assert len(creation_dates_spanish) == 318\n",
         "\n",
         "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
+        "#assert len(creation_dates_english) == 8336"
       ]
     },
     {
         "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
         "\n",
         "# Plotting the bar chart\n",
+        "plt.figure(figsize=(8, 5))\n",
         "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
         "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Year', fontsize=10)\n",
+        "plt.ylabel('Number of Datasets', fontsize=10)\n",
+        "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+        "plt.xticks(years, fontsize=10)\n",
         "plt.legend()\n",
         "\n",
         "# Display the plot\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
+        "\n",
+        "plt.savefig(\"bar_chart_vertical.png\")\n"
       ]
     },
     {
         "years_index = np.arange(len(years))\n",
         "\n",
         "# Plotting the side-by-side bar chart\n",
+        "plt.figure(figsize=(8, 5))\n",
         "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
         "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Year', fontsize=10)\n",
+        "plt.ylabel('Number of Datasets', fontsize=10)\n",
+        "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
+        "plt.xticks(years_index, years, fontsize=10)\n",
         "plt.legend()\n",
         "\n",
         "# Display the plot\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
+        "\n",
+        "plt.savefig(\"bar_chart_horizontal.png\")"
       ]
     },
     {
         "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
         "\n",
         "# Plotting the stacked area chart\n",
+        "plt.figure(figsize=(8, 5))\n",
         "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Year', fontsize=10)\n",
+        "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
+        "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
+        "plt.xticks(years, fontsize=10)\n",
         "plt.legend(loc='upper left')\n",
         "\n",
         "# Display the plot\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
+        "plt.savefig(\"stack_area_1.png\")\n"
       ]
     },
     {
         "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Creation date', fontsize=10)\n",
+        "plt.ylabel('Cumulative number of monolingual datasets', fontsize=10)\n",
+        "#plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
         "\n",
         "# Display the plot\n",
+        "plt.xticks(rotation=45, fontsize=10)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(False)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
+        "plt.savefig(\"stack_area_2.png\")"
       ]
     },
     {
         "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Date', fontsize=10)\n",
+        "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
+        "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
         "\n",
         "# Display the plot\n",
+        "plt.xticks(rotation=45, fontsize=10)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
         "plt.show()\n",
         "\n",
+        "plt.savefig(\"stack_area_3.png\")"
       ]
     },
     {
         "# Plotting the pie chart\n",
         "plt.figure(figsize=(8, 8))\n",
         "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
+        "#plt.title('Distribution of Monolingual Datasets by Language')\n",
         "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
         "\n",
         "# Display the plot\n",
         "plt.show()\n",
         "\n",
+        "plt.savefig(\"pie_chart.png\")\n"
       ]
     },
     {
         "    #marker=\"o\",\n",
         "    color=\"g\"\n",
         ")\n",
+        "#plt.title(\"Evolución de bases de datos monolingües en español\")\n",
+        "plt.xlabel(\"Fecha\", fontsize=10)\n",
+        "plt.ylabel(\"Número de bases de datos\", fontsize=10)\n",
         "plt.grid(True)\n",
+        "plt.xticks(rotation=45, fontsize=10)\n",
         "plt.tight_layout()\n",
+        "plt.show()\n",
+        "\n",
+        "plt.savefig(\"time_series_1.png\")"
       ]
     },
     {
         "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
+        "plt.xlabel('Date', fontsize=10)\n",
+        "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
+        "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
         "\n",
         "# Display the plot\n",
+        "plt.xticks(rotation=45, fontsize=10)\n",
         "plt.legend(loc='upper left')\n",
         "plt.grid(True)\n",
         "plt.tight_layout()\n",
+        "plt.show()\n",
+        "\n",
+        "plt.savefig(\"time_series_2.png\")"
       ]
     },
     {
         "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
         "\n",
         "# Adding labels and title\n",
+        "#plt.title('Evolution of English and Spanish Datasets Over Time')\n",
+        "plt.xlabel('Year', fontsize=10)\n",
+        "plt.ylabel('Number of Datasets', fontsize=10)\n",
         "plt.legend()\n",
         "plt.grid(True)\n",
+        "plt.xticks(rotation=45, fontsize=10)\n",
         "plt.tight_layout()\n",
+        "plt.show()\n",
+        "\n",
+        "plt.savefig(\"time_series_3.png\")"
       ]
     }
   ],
   "metadata": {
     "colab": {
       "provenance": []
     },
     "kernelspec": {
       "name": "python3"
     },
     "language_info": {
+      "name": "python"
     }
   },
   "nbformat": 4,
   "nbformat_minor": 0
+}

numero_datasets_hub_output.ipynb DELETED Viewed

@@ -1,918 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9b494ecb",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "bCPvBCk_VLoi",
-    "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "!pip install huggingface_hub"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d736660e",
-   "metadata": {
-    "id": "NbQeXxudVJW9",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from datetime import datetime\n",
-    "\n",
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from huggingface_hub import HfApi\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8dc1a8d8",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "ogyTHBYJVZ8I",
-    "outputId": "f23a554a-7328-4e50-d87c-90368294467d",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "total_count = len(list(all_datasets))\n",
-    "print(total_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "299e6d56",
-   "metadata": {
-    "id": "GXDMUU-4XmaI",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
-    "\n",
-    "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
-    "\n",
-    "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
-    "\n",
-    "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
-    "\n",
-    "#for dataset in spanish_only_datasets:\n",
-    "#    print(dataset)\n",
-    "#    break"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "691d8f3a",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "pjCvHVq_hChx",
-    "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
-    "spanish_datasets = list(spanish_filter)\n",
-    "spanish_count = len(list(spanish_datasets))\n",
-    "print(spanish_count)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c9676c89",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "WANGkTpGRw8t",
-    "outputId": "da8931bf-7ae2-438d-8188-20190f568193",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
-    "english_datasets = list(english_filter)\n",
-    "english_count = len(list(english_datasets))\n",
-    "print(english_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf300ce6",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "yPtF0G7SWS53",
-    "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
-    "chinese_datasets = list(chinese_filter)\n",
-    "chinese_count = len(list(chinese_datasets))\n",
-    "print(chinese_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "407c46fc",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "RlxAlOOsW7p9",
-    "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
-    "french_datasets = list(french_filter)\n",
-    "french_count = len(list(french_datasets))\n",
-    "print(french_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a7d82d5d",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "OMQfBXjUYBPz",
-    "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "hf_api = HfApi()\n",
-    "\n",
-    "all_datasets = hf_api.list_datasets(full=True)\n",
-    "\n",
-    "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
-    "mono_datasets = list(mono_filter)\n",
-    "mono_count = len(list(mono_datasets))\n",
-    "print(mono_count)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6dc0ac68",
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 180
-    },
-    "id": "sTPechkdWmYS",
-    "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# Extract creation date\n",
-    "\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "assert len(creation_dates_spanish) == 318\n",
-    "\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "assert len(creation_dates_english) == 8336"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "57d206ec",
-   "metadata": {
-    "id": "hefZVynDSjjE",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "print(creation_dates_spanish[0])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b80e411d",
-   "metadata": {
-    "id": "aFaEBlkkSbrs",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "source": [
-    "## Bar Chart\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "96652421",
-   "metadata": {
-    "id": "dYJ2zd4dShYh",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract years from the creation dates\n",
-    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
-    "english_counts = Counter(date.year for date in creation_dates_english)\n",
-    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
-    "\n",
-    "# Plotting the bar chart\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
-    "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Year')\n",
-    "plt.ylabel('Number of Datasets')\n",
-    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
-    "plt.xticks(years)\n",
-    "plt.legend()\n",
-    "\n",
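-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",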
-    "plt.grid(True)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "plt.savefig(\"plots/bar_stack.png\")\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2d1ae015",
-   "metadata": {
-    "id": "wViEE4wCUVgs",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract years from the creation dates\n",
-    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
-    "english_counts = Counter(date.year for date in creation_dates_english)\n",
-    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
-    "\n",
-    "# Define the width of each bar\n",
-    "bar_width = 0.4\n",
-    "\n",
-    "# Define the x-coordinates for the bars\n",
-    "years_index = np.arange(len(years))\n",
-    "\n",
-    "# Plotting the side-by-side bar chart\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
-    "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Year')\n",
-    "plt.ylabel('Number of Datasets')\n",
-    "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
-    "plt.xticks(years_index, years)\n",
-    "plt.legend()\n",
-    "\n",
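-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",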
-    "plt.grid(True)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "plt.savefig(\"plots/bar_width.png\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cddf7237",
-   "metadata": {
-    "id": "Hp8vNA6LUA1E",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "source": [
-    "## Stacked Area Chart\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "68255399",
-   "metadata": {
-    "id": "CWgCunzGUCot",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract years from the creation dates\n",
-    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
-    "english_counts = Counter(date.year for date in creation_dates_english)\n",
-    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
-    "\n",
-    "# Calculate cumulative counts\n",
-    "english_datasets_cumulative = [english_counts[year] for year in years]\n",
-    "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
-    "for i in range(1, len(years)):\n",
-    "    english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
-    "    spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
-    "\n",
-    "# Plotting the stacked area chart\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Year')\n",
-    "plt.ylabel('Cumulative Number of Datasets')\n",
-    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
-    "plt.xticks(years)\n",
-    "plt.legend(loc='upper left')\n",
-    "\n",
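-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",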
-    "plt.grid(True)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "\n",
-    "plt.savefig(\"plots/stack_area_1.png\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4ba74cf5",
-   "metadata": {
-    "id": "GwRpZwYWhau3",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract months from the creation dates\n",
-    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
-    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
-    "\n",
-    "# Count the occurrences of each month\n",
-    "english_counts = Counter(months_english)\n",
-    "spanish_counts = Counter(months_spanish)\n",
-    "\n",
-    "# Create a DataFrame for English datasets\n",
-    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
-    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
-    "df_english = df_english.sort_index()\n",
-    "\n",
-    "# Create a DataFrame for Spanish datasets\n",
-    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
-    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
-    "df_spanish = df_spanish.sort_index()\n",
-    "\n",
-    "# Merge the DataFrames\n",
-    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
-    "\n",
-    "# Convert index to datetime\n",
-    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
-    "\n",
-    "# Calculate cumulative sum\n",
-    "df_cumulative = df.cumsum()\n",
-    "\n",
-    "# Plotting the stacked area chart\n",
-    "plt.figure(figsize=(8, 5))\n",
-    "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Creation date')\n",
-    "plt.ylabel('Cumulative number of monolingual datasets')\n",
-    "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
-    "\n",
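-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",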
-    "plt.xticks(rotation=45)\n",
-    "plt.legend(loc='upper left')\n",
-    "plt.grid(False)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "\n",
-    "plt.savefig(\"plots/stack_area_2.png\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d96225ce",
-   "metadata": {
-    "id": "kJQ0OgRtglOQ",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract months from the creation dates\n",
-    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
-    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
-    "\n",
-    "# Count the occurrences of each month\n",
-    "english_counts = Counter(months_english)\n",
-    "spanish_counts = Counter(months_spanish)\n",
-    "\n",
-    "# Create a DataFrame for English datasets\n",
-    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
-    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
-    "df_english = df_english.sort_index()\n",
-    "\n",
-    "# Create a DataFrame for Spanish datasets\n",
-    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
-    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
-    "df_spanish = df_spanish.sort_index()\n",
-    "\n",
-    "# Merge the DataFrames\n",
-    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
-    "\n",
-    "# Convert index to datetime\n",
-    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
-    "\n",
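-    "# Plotting the stacked area chart (monthly counts; no cumulative sum is applied in this cell, although the labels below say 'Cumulative')\n",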
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Date')\n",
-    "plt.ylabel('Cumulative Number of Datasets')\n",
-    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
-    "\n",
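-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",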
-    "plt.xticks(rotation=45)\n",
-    "plt.legend(loc='upper left')\n",
-    "plt.grid(True)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n",
-    "\n",
-    "plt.savefig(\"plots/stack_area_3.png\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7bbec0ac",
-   "metadata": {
-    "id": "IAnFHiPlgnRE",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "source": [
-    "## Pie Chart"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7c3dd684",
-   "metadata": {
-    "id": "8tKR1x-kVeZT",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Calculate the count of \"other\" datasets\n",
-    "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
-    "\n",
-    "# Pie chart data\n",
-    "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
-    "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
-    "\n",
-    "# Plotting the pie chart\n",
-    "plt.figure(figsize=(8, 8))\n",
-    "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
-    "plt.title('Distribution of Monolingual Datasets by Language')\n",
-    "plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.\n",
-    "\n",
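-    "# Display the plot (NOTE: plt.savefig() runs after plt.show() below, which can write an empty image; save before showing)\n",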
-    "plt.show()\n",
-    "\n",
-    "plt.savefig(\"plots/pie_chart.png\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "11c1c9c8",
-   "metadata": {
-    "id": "z2xf8FrHROxy",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "source": [
-    "## Time Series Plot"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1bb6a676",
-   "metadata": {
-    "id": "DuPFSZKUhyQj",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "# Prepare data for plotting\n",
-    "\n",
-    "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
-    "df[\"Count\"] = 1\n",
-    "# Ensure the 'Date' column is of type datetime\n",
-    "df['Date'] = pd.to_datetime(df['Date'])\n",
-    "# Group by month and calculate cumulative sum\n",
-    "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
-    "\n",
-    "# Plot the data\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.plot(\n",
-    "    df.index,\n",
-    "    df[\"Count\"],\n",
-    "    #marker=\"o\",\n",
-    "    color=\"g\"\n",
-    ")\n",
-    "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
-    "plt.xlabel(\"Fecha\")\n",
-    "plt.ylabel(\"Número de bases de datos\")\n",
-    "plt.grid(True)\n",
-    "plt.xticks(rotation=45)\n",
-    "plt.tight_layout()\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2fc77d7f",
-   "metadata": {
-    "id": "-Vu3PIe2hITq",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract months from the creation dates\n",
-    "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
-    "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
-    "\n",
-    "# Count the occurrences of each month\n",
-    "english_counts = Counter(months_english)\n",
-    "spanish_counts = Counter(months_spanish)\n",
-    "\n",
-    "# Create a DataFrame for English datasets\n",
-    "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
-    "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
-    "df_english = df_english.sort_index()\n",
-    "\n",
-    "# Create a DataFrame for Spanish datasets\n",
-    "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
-    "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
-    "df_spanish = df_spanish.sort_index()\n",
-    "\n",
-    "# Merge the DataFrames\n",
-    "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
-    "\n",
-    "# Convert index to datetime\n",
-    "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
-    "\n",
-    "# Calculate cumulative sum\n",
-    "df_cumulative = df.cumsum()\n",
-    "\n",
-    "# Plotting the cumulative chart\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
-    "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.xlabel('Date')\n",
-    "plt.ylabel('Cumulative Number of Datasets')\n",
-    "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
-    "\n",
-    "# Display the plot\n",
-    "plt.xticks(rotation=45)\n",
-    "plt.legend(loc='upper left')\n",
-    "plt.grid(True)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6c0d23ac",
-   "metadata": {
-    "id": "KG__of2IfdHu",
-    "papermill": {
-     "duration": null,
-     "end_time": null,
-     "exception": null,
-     "start_time": null,
-     "status": "completed"
-    },
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import pandas as pd\n",
-    "from collections import Counter\n",
-    "\n",
-    "# Creation dates of the English and Spanish datasets selected in the earlier cells\n",
-    "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
-    "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
-    "\n",
-    "# Extract years from the creation dates\n",
-    "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
-    "english_counts = Counter(date.year for date in creation_dates_english)\n",
-    "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
-    "\n",
-    "# Prepare data for plotting\n",
-    "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
-    "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
-    "\n",
-    "# Plotting the time series\n",
-    "plt.figure(figsize=(10, 6))\n",
-    "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
-    "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
-    "\n",
-    "# Adding labels and title\n",
-    "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
-    "plt.xlabel('Year')\n",
-    "plt.ylabel('Number of Datasets')\n",
-    "plt.legend()\n",
-    "plt.grid(True)\n",
-    "plt.xticks(rotation=45)\n",
-    "plt.tight_layout()\n",
-    "plt.show()\n"
-   ]
-  }
- ],
- "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "gpuType": "T4",
-   "provenance": []
-  },
-  "kernelspec": {
-   "display_name": "Python 3",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.11.6"
-  },
-  "papermill": {
-   "default_parameters": {},
-   "duration": 0.047858,
-   "end_time": "2024-05-15T09:04:29.634379",
-   "environment_variables": {},
-   "exception": null,
-   "input_path": "numero_datasets_hub.ipynb",
-   "output_path": "numero_datasets_hub_output.ipynb",
-   "parameters": {},
-   "start_time": "2024-05-15T09:04:29.586521",
-   "version": "2.6.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

plots/bar_plot_horizontal.png ADDED Viewed

plots/bar_plot_vertical.png ADDED Viewed

plots/datasets_hub.png DELETED Viewed

Binary file (46.9 kB)

plots/stack_area.png ADDED Viewed

plots/stack_area_es.png ADDED Viewed

plots/time_series.png ADDED Viewed