mariagrandury committed on
Commit
7cb31c4
1 Parent(s): 1f70be8

create notebook and add plots

Browse files
app.py CHANGED
@@ -25,7 +25,6 @@ def create_app():
25
 
26
  run_button.click(run_notebook, outputs=output_label)
27
 
28
- # Create a 2x2 grid for images
29
  with gr.Row():
30
  with gr.Column():
31
  image1 = gr.Image(
@@ -49,23 +48,21 @@ def create_app():
49
  label="Image 4",
50
  )
51
 
52
- # Description for images
53
  gr.Markdown("### Image Descriptions")
54
  gr.Markdown("Description for Image 1")
55
  gr.Markdown("Description for Image 2")
56
  gr.Markdown("Description for Image 3")
57
  gr.Markdown("Description for Image 4")
58
 
59
- # Collapsible block for citation
60
  with gr.Accordion("Citation Information"):
61
  gr.Markdown(
62
  """
63
- If you use the images or code please cite:
64
 
65
- ```
66
- fjdlsafd
67
- ```
68
- """
69
  )
70
 
71
  return app
 
25
 
26
  run_button.click(run_notebook, outputs=output_label)
27
 
 
28
  with gr.Row():
29
  with gr.Column():
30
  image1 = gr.Image(
 
48
  label="Image 4",
49
  )
50
 
 
51
  gr.Markdown("### Image Descriptions")
52
  gr.Markdown("Description for Image 1")
53
  gr.Markdown("Description for Image 2")
54
  gr.Markdown("Description for Image 3")
55
  gr.Markdown("Description for Image 4")
56
 
 
57
  with gr.Accordion("Citation Information"):
58
  gr.Markdown(
59
  """
60
+ If you use the images or code please cite:
61
 
62
+ ```
63
+ fjdlsafd
64
+ ```
65
+ """
66
  )
67
 
68
  return app
numero_datasets_hub.ipynb → hub_datasets_by_language.ipynb RENAMED
@@ -1,14 +1,27 @@
1
  {
2
  "cells": [
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  {
4
  "cell_type": "code",
5
- "execution_count": 1,
6
  "metadata": {
7
  "colab": {
8
  "base_uri": "https://localhost:8080/"
9
  },
10
  "id": "bCPvBCk_VLoi",
11
- "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba"
12
  },
13
  "outputs": [
14
  {
@@ -36,7 +49,7 @@
36
  },
37
  {
38
  "cell_type": "code",
39
- "execution_count": 2,
40
  "metadata": {
41
  "id": "NbQeXxudVJW9"
42
  },
@@ -51,13 +64,13 @@
51
  },
52
  {
53
  "cell_type": "code",
54
- "execution_count": 3,
55
  "metadata": {
56
  "colab": {
57
  "base_uri": "https://localhost:8080/"
58
  },
59
  "id": "ogyTHBYJVZ8I",
60
- "outputId": "f23a554a-7328-4e50-d87c-90368294467d"
61
  },
62
  "outputs": [
63
  {
@@ -76,7 +89,7 @@
76
  "name": "stdout",
77
  "output_type": "stream",
78
  "text": [
79
- "145101\n"
80
  ]
81
  }
82
  ],
@@ -91,7 +104,7 @@
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 4,
95
  "metadata": {
96
  "id": "GXDMUU-4XmaI"
97
  },
@@ -112,20 +125,20 @@
112
  },
113
  {
114
  "cell_type": "code",
115
- "execution_count": 5,
116
  "metadata": {
117
  "colab": {
118
  "base_uri": "https://localhost:8080/"
119
  },
120
  "id": "pjCvHVq_hChx",
121
- "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8"
122
  },
123
  "outputs": [
124
  {
125
  "name": "stdout",
126
  "output_type": "stream",
127
  "text": [
128
- "318\n"
129
  ]
130
  }
131
  ],
@@ -142,20 +155,20 @@
142
  },
143
  {
144
  "cell_type": "code",
145
- "execution_count": 6,
146
  "metadata": {
147
  "colab": {
148
  "base_uri": "https://localhost:8080/"
149
  },
150
  "id": "WANGkTpGRw8t",
151
- "outputId": "da8931bf-7ae2-438d-8188-20190f568193"
152
  },
153
  "outputs": [
154
  {
155
  "name": "stdout",
156
  "output_type": "stream",
157
  "text": [
158
- "8357\n"
159
  ]
160
  }
161
  ],
@@ -172,20 +185,20 @@
172
  },
173
  {
174
  "cell_type": "code",
175
- "execution_count": 7,
176
  "metadata": {
177
  "colab": {
178
- "base_uri": "https://localhost:8080/"
179
  },
180
  "id": "yPtF0G7SWS53",
181
- "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69"
182
  },
183
  "outputs": [
184
  {
185
  "name": "stdout",
186
  "output_type": "stream",
187
  "text": [
188
- "568\n"
189
  ]
190
  }
191
  ],
@@ -202,20 +215,20 @@
202
  },
203
  {
204
  "cell_type": "code",
205
- "execution_count": 8,
206
  "metadata": {
207
  "colab": {
208
- "base_uri": "https://localhost:8080/"
209
  },
210
  "id": "RlxAlOOsW7p9",
211
- "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322"
212
  },
213
  "outputs": [
214
  {
215
  "name": "stdout",
216
  "output_type": "stream",
217
  "text": [
218
- "436\n"
219
  ]
220
  }
221
  ],
@@ -232,23 +245,11 @@
232
  },
233
  {
234
  "cell_type": "code",
235
- "execution_count": 9,
236
  "metadata": {
237
- "colab": {
238
- "base_uri": "https://localhost:8080/"
239
- },
240
- "id": "OMQfBXjUYBPz",
241
- "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed"
242
  },
243
- "outputs": [
244
- {
245
- "name": "stdout",
246
- "output_type": "stream",
247
- "text": [
248
- "13886\n"
249
- ]
250
- }
251
- ],
252
  "source": [
253
  "hf_api = HfApi()\n",
254
  "\n",
@@ -262,36 +263,19 @@
262
  },
263
  {
264
  "cell_type": "code",
265
- "execution_count": 10,
266
  "metadata": {
267
- "colab": {
268
- "base_uri": "https://localhost:8080/",
269
- "height": 180
270
- },
271
- "id": "sTPechkdWmYS",
272
- "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3"
273
  },
274
- "outputs": [
275
- {
276
- "ename": "AssertionError",
277
- "evalue": "",
278
- "output_type": "error",
279
- "traceback": [
280
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
281
- "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
282
- "\u001b[0;32m<ipython-input-10-da38b5a6b412>\u001b[0m in \u001b[0;36m<cell line: 7>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mcreation_dates_english\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0md\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreated_at\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0md\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menglish_datasets\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcreation_dates_english\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m8336\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
283
- "\u001b[0;31mAssertionError\u001b[0m: "
284
- ]
285
- }
286
- ],
287
  "source": [
288
  "# Extract creation date\n",
289
  "\n",
290
  "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
291
- "assert len(creation_dates_spanish) == 318\n",
292
  "\n",
293
  "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
294
- "assert len(creation_dates_english) == 8336"
295
  ]
296
  },
297
  {
@@ -336,22 +320,23 @@
336
  "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
337
  "\n",
338
  "# Plotting the bar chart\n",
339
- "plt.figure(figsize=(10, 6))\n",
340
  "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
341
  "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
342
  "\n",
343
  "# Adding labels and title\n",
344
- "plt.xlabel('Year')\n",
345
- "plt.ylabel('Number of Datasets')\n",
346
- "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
347
- "plt.xticks(years)\n",
348
  "plt.legend()\n",
349
  "\n",
350
  "# Display the plot\n",
351
  "plt.grid(True)\n",
352
  "plt.tight_layout()\n",
353
  "plt.show()\n",
354
- "plt.savefig(\"plots/bar_stack.png\")\n"
 
355
  ]
356
  },
357
  {
@@ -382,22 +367,23 @@
382
  "years_index = np.arange(len(years))\n",
383
  "\n",
384
  "# Plotting the side-by-side bar chart\n",
385
- "plt.figure(figsize=(10, 6))\n",
386
  "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
387
  "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
388
  "\n",
389
  "# Adding labels and title\n",
390
- "plt.xlabel('Year')\n",
391
- "plt.ylabel('Number of Datasets')\n",
392
- "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
393
- "plt.xticks(years_index, years)\n",
394
  "plt.legend()\n",
395
  "\n",
396
  "# Display the plot\n",
397
  "plt.grid(True)\n",
398
  "plt.tight_layout()\n",
399
  "plt.show()\n",
400
- "plt.savefig(\"plots/bar_width.png\")"
 
401
  ]
402
  },
403
  {
@@ -437,14 +423,14 @@
437
  " spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
438
  "\n",
439
  "# Plotting the stacked area chart\n",
440
- "plt.figure(figsize=(10, 6))\n",
441
  "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
442
  "\n",
443
  "# Adding labels and title\n",
444
- "plt.xlabel('Year')\n",
445
- "plt.ylabel('Cumulative Number of Datasets')\n",
446
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
447
- "plt.xticks(years)\n",
448
  "plt.legend(loc='upper left')\n",
449
  "\n",
450
  "# Display the plot\n",
@@ -452,7 +438,7 @@
452
  "plt.tight_layout()\n",
453
  "plt.show()\n",
454
  "\n",
455
- "plt.savefig(\"plots/stack_area_1.png\")"
456
  ]
457
  },
458
  {
@@ -503,18 +489,18 @@
503
  "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
504
  "\n",
505
  "# Adding labels and title\n",
506
- "plt.xlabel('Creation date')\n",
507
- "plt.ylabel('Cumulative number of monolingual datasets')\n",
508
- "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
509
  "\n",
510
  "# Display the plot\n",
511
- "plt.xticks(rotation=45)\n",
512
  "plt.legend(loc='upper left')\n",
513
  "plt.grid(False)\n",
514
  "plt.tight_layout()\n",
515
  "plt.show()\n",
516
  "\n",
517
- "plt.savefig(\"plots/stack_area_2.png\")"
518
  ]
519
  },
520
  {
@@ -562,18 +548,18 @@
562
  "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
563
  "\n",
564
  "# Adding labels and title\n",
565
- "plt.xlabel('Date')\n",
566
- "plt.ylabel('Cumulative Number of Datasets')\n",
567
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
568
  "\n",
569
  "# Display the plot\n",
570
- "plt.xticks(rotation=45)\n",
571
  "plt.legend(loc='upper left')\n",
572
  "plt.grid(True)\n",
573
  "plt.tight_layout()\n",
574
  "plt.show()\n",
575
  "\n",
576
- "plt.savefig(\"plots/stack_area_3.png\")"
577
  ]
578
  },
579
  {
@@ -606,13 +592,13 @@
606
  "# Plotting the pie chart\n",
607
  "plt.figure(figsize=(8, 8))\n",
608
  "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
609
- "plt.title('Distribution of Monolingual Datasets by Language')\n",
610
  "plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
611
  "\n",
612
  "# Display the plot\n",
613
  "plt.show()\n",
614
  "\n",
615
- "plt.savefig(\"plots/pie_chart.png\")"
616
  ]
617
  },
618
  {
@@ -649,13 +635,15 @@
649
  " #marker=\"o\",\n",
650
  " color=\"g\"\n",
651
  ")\n",
652
- "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
653
- "plt.xlabel(\"Fecha\")\n",
654
- "plt.ylabel(\"Número de bases de datos\")\n",
655
  "plt.grid(True)\n",
656
- "plt.xticks(rotation=45)\n",
657
  "plt.tight_layout()\n",
658
- "plt.show()"
 
 
659
  ]
660
  },
661
  {
@@ -707,16 +695,18 @@
707
  "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
708
  "\n",
709
  "# Adding labels and title\n",
710
- "plt.xlabel('Date')\n",
711
- "plt.ylabel('Cumulative Number of Datasets')\n",
712
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
713
  "\n",
714
  "# Display the plot\n",
715
- "plt.xticks(rotation=45)\n",
716
  "plt.legend(loc='upper left')\n",
717
  "plt.grid(True)\n",
718
  "plt.tight_layout()\n",
719
- "plt.show()\n"
 
 
720
  ]
721
  },
722
  {
@@ -750,21 +740,21 @@
750
  "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
751
  "\n",
752
  "# Adding labels and title\n",
753
- "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
754
- "plt.xlabel('Year')\n",
755
- "plt.ylabel('Number of Datasets')\n",
756
  "plt.legend()\n",
757
  "plt.grid(True)\n",
758
- "plt.xticks(rotation=45)\n",
759
  "plt.tight_layout()\n",
760
- "plt.show()\n"
 
 
761
  ]
762
  }
763
  ],
764
  "metadata": {
765
- "accelerator": "GPU",
766
  "colab": {
767
- "gpuType": "T4",
768
  "provenance": []
769
  },
770
  "kernelspec": {
@@ -772,10 +762,9 @@
772
  "name": "python3"
773
  },
774
  "language_info": {
775
- "name": "python",
776
- "version": "3.11.6"
777
  }
778
  },
779
  "nbformat": 4,
780
  "nbformat_minor": 0
781
- }
 
1
  {
2
  "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "source": [
6
+ "# Language gap in the Hugging Face Hub\n",
7
+ "\n",
8
+ "<a target=\"_blank\" href=\"https://colab.research.google.com/drive/16KNpk25dQR9sdo7FSTONCIyS2Uvf0cOO?usp=sharing\">\n",
9
+ " <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
10
+ "</a>"
11
+ ],
12
+ "metadata": {
13
+ "id": "jgtFu9csb5kY"
14
+ }
15
+ },
16
  {
17
  "cell_type": "code",
18
+ "execution_count": null,
19
  "metadata": {
20
  "colab": {
21
  "base_uri": "https://localhost:8080/"
22
  },
23
  "id": "bCPvBCk_VLoi",
24
+ "outputId": "4e3e86c5-36bb-4f42-8777-9762373251ff"
25
  },
26
  "outputs": [
27
  {
 
49
  },
50
  {
51
  "cell_type": "code",
52
+ "execution_count": null,
53
  "metadata": {
54
  "id": "NbQeXxudVJW9"
55
  },
 
64
  },
65
  {
66
  "cell_type": "code",
67
+ "execution_count": null,
68
  "metadata": {
69
  "colab": {
70
  "base_uri": "https://localhost:8080/"
71
  },
72
  "id": "ogyTHBYJVZ8I",
73
+ "outputId": "0590665f-c62d-4c2b-8195-1367995bc01a"
74
  },
75
  "outputs": [
76
  {
 
89
  "name": "stdout",
90
  "output_type": "stream",
91
  "text": [
92
+ "146571\n"
93
  ]
94
  }
95
  ],
 
104
  },
105
  {
106
  "cell_type": "code",
107
+ "execution_count": null,
108
  "metadata": {
109
  "id": "GXDMUU-4XmaI"
110
  },
 
125
  },
126
  {
127
  "cell_type": "code",
128
+ "execution_count": null,
129
  "metadata": {
130
  "colab": {
131
  "base_uri": "https://localhost:8080/"
132
  },
133
  "id": "pjCvHVq_hChx",
134
+ "outputId": "d37a955e-9ee0-4d0f-e738-11a376377770"
135
  },
136
  "outputs": [
137
  {
138
  "name": "stdout",
139
  "output_type": "stream",
140
  "text": [
141
+ "331\n"
142
  ]
143
  }
144
  ],
 
155
  },
156
  {
157
  "cell_type": "code",
158
+ "execution_count": null,
159
  "metadata": {
160
  "colab": {
161
  "base_uri": "https://localhost:8080/"
162
  },
163
  "id": "WANGkTpGRw8t",
164
+ "outputId": "0143ae40-510b-4da2-9e22-47f2af90759a"
165
  },
166
  "outputs": [
167
  {
168
  "name": "stdout",
169
  "output_type": "stream",
170
  "text": [
171
+ "8442\n"
172
  ]
173
  }
174
  ],
 
185
  },
186
  {
187
  "cell_type": "code",
188
+ "execution_count": null,
189
  "metadata": {
190
  "colab": {
191
+ "background_save": true
192
  },
193
  "id": "yPtF0G7SWS53",
194
+ "outputId": "18a9515e-eeb7-4eb8-f734-c195b15c011a"
195
  },
196
  "outputs": [
197
  {
198
  "name": "stdout",
199
  "output_type": "stream",
200
  "text": [
201
+ "577\n"
202
  ]
203
  }
204
  ],
 
215
  },
216
  {
217
  "cell_type": "code",
218
+ "execution_count": null,
219
  "metadata": {
220
  "colab": {
221
+ "background_save": true
222
  },
223
  "id": "RlxAlOOsW7p9",
224
+ "outputId": "71ff74e7-cd4e-4b39-aa8b-a22e21130f4e"
225
  },
226
  "outputs": [
227
  {
228
  "name": "stdout",
229
  "output_type": "stream",
230
  "text": [
231
+ "438\n"
232
  ]
233
  }
234
  ],
 
245
  },
246
  {
247
  "cell_type": "code",
248
+ "execution_count": null,
249
  "metadata": {
250
+ "id": "OMQfBXjUYBPz"
 
 
 
 
251
  },
252
+ "outputs": [],
 
 
 
 
 
 
 
 
253
  "source": [
254
  "hf_api = HfApi()\n",
255
  "\n",
 
263
  },
264
  {
265
  "cell_type": "code",
266
+ "execution_count": null,
267
  "metadata": {
268
+ "id": "sTPechkdWmYS"
 
 
 
 
 
269
  },
270
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
271
  "source": [
272
  "# Extract creation date\n",
273
  "\n",
274
  "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
275
+ "#assert len(creation_dates_spanish) == 318\n",
276
  "\n",
277
  "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
278
+ "#assert len(creation_dates_english) == 8336"
279
  ]
280
  },
281
  {
 
320
  "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
321
  "\n",
322
  "# Plotting the bar chart\n",
323
+ "plt.figure(figsize=(8, 5))\n",
324
  "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
325
  "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
326
  "\n",
327
  "# Adding labels and title\n",
328
+ "plt.xlabel('Year', fontsize=10)\n",
329
+ "plt.ylabel('Number of Datasets', fontsize=10)\n",
330
+ "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
331
+ "plt.xticks(years, fontsize=10)\n",
332
  "plt.legend()\n",
333
  "\n",
334
  "# Display the plot\n",
335
  "plt.grid(True)\n",
336
  "plt.tight_layout()\n",
337
  "plt.show()\n",
338
+ "\n",
339
+ "plt.savefig(\"bar_chart_vertical.png\")\n"
340
  ]
341
  },
342
  {
 
367
  "years_index = np.arange(len(years))\n",
368
  "\n",
369
  "# Plotting the side-by-side bar chart\n",
370
+ "plt.figure(figsize=(8, 5))\n",
371
  "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
372
  "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
373
  "\n",
374
  "# Adding labels and title\n",
375
+ "plt.xlabel('Year', fontsize=10)\n",
376
+ "plt.ylabel('Number of Datasets', fontsize=10)\n",
377
+ "#plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
378
+ "plt.xticks(years_index, years, fontsize=10)\n",
379
  "plt.legend()\n",
380
  "\n",
381
  "# Display the plot\n",
382
  "plt.grid(True)\n",
383
  "plt.tight_layout()\n",
384
  "plt.show()\n",
385
+ "\n",
386
+ "plt.savefig(\"bar_chart_horizontal.png\")"
387
  ]
388
  },
389
  {
 
423
  " spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
424
  "\n",
425
  "# Plotting the stacked area chart\n",
426
+ "plt.figure(figsize=(8, 5))\n",
427
  "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
428
  "\n",
429
  "# Adding labels and title\n",
430
+ "plt.xlabel('Year', fontsize=10)\n",
431
+ "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
432
+ "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
433
+ "plt.xticks(years, fontsize=10)\n",
434
  "plt.legend(loc='upper left')\n",
435
  "\n",
436
  "# Display the plot\n",
 
438
  "plt.tight_layout()\n",
439
  "plt.show()\n",
440
  "\n",
441
+ "plt.savefig(\"stack_area_1.png\")\n"
442
  ]
443
  },
444
  {
 
489
  "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
490
  "\n",
491
  "# Adding labels and title\n",
492
+ "plt.xlabel('Creation date', fontsize=10)\n",
493
+ "plt.ylabel('Cumulative number of monolingual datasets', fontsize=10)\n",
494
+ "#plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
495
  "\n",
496
  "# Display the plot\n",
497
+ "plt.xticks(rotation=45, fontsize=10)\n",
498
  "plt.legend(loc='upper left')\n",
499
  "plt.grid(False)\n",
500
  "plt.tight_layout()\n",
501
  "plt.show()\n",
502
  "\n",
503
+ "plt.savefig(\"stack_area_2.png\")"
504
  ]
505
  },
506
  {
 
548
  "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
549
  "\n",
550
  "# Adding labels and title\n",
551
+ "plt.xlabel('Date', fontsize=10)\n",
552
+ "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
553
+ "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
554
  "\n",
555
  "# Display the plot\n",
556
+ "plt.xticks(rotation=45, fontsize=10)\n",
557
  "plt.legend(loc='upper left')\n",
558
  "plt.grid(True)\n",
559
  "plt.tight_layout()\n",
560
  "plt.show()\n",
561
  "\n",
562
+ "plt.savefig(\"stack_area_3.png\")"
563
  ]
564
  },
565
  {
 
592
  "# Plotting the pie chart\n",
593
  "plt.figure(figsize=(8, 8))\n",
594
  "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
595
+ "#plt.title('Distribution of Monolingual Datasets by Language')\n",
596
  "plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
597
  "\n",
598
  "# Display the plot\n",
599
  "plt.show()\n",
600
  "\n",
601
+ "plt.savefig(\"pie_chart.png\")\n"
602
  ]
603
  },
604
  {
 
635
  " #marker=\"o\",\n",
636
  " color=\"g\"\n",
637
  ")\n",
638
+ "#plt.title(\"Evolución de bases de datos monolingües en español\")\n",
639
+ "plt.xlabel(\"Fecha\", fontsize=10)\n",
640
+ "plt.ylabel(\"Número de bases de datos\", fontsize=10)\n",
641
  "plt.grid(True)\n",
642
+ "plt.xticks(rotation=45, fontsize=10)\n",
643
  "plt.tight_layout()\n",
644
+ "plt.show()\n",
645
+ "\n",
646
+ "plt.savefig(\"time_series_1.png\")"
647
  ]
648
  },
649
  {
 
695
  "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
696
  "\n",
697
  "# Adding labels and title\n",
698
+ "plt.xlabel('Date', fontsize=10)\n",
699
+ "plt.ylabel('Cumulative Number of Datasets', fontsize=10)\n",
700
+ "#plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
701
  "\n",
702
  "# Display the plot\n",
703
+ "plt.xticks(rotation=45, fontsize=10)\n",
704
  "plt.legend(loc='upper left')\n",
705
  "plt.grid(True)\n",
706
  "plt.tight_layout()\n",
707
+ "plt.show()\n",
708
+ "\n",
709
+ "plt.savefig(\"time_series_2.png\")"
710
  ]
711
  },
712
  {
 
740
  "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
741
  "\n",
742
  "# Adding labels and title\n",
743
+ "#plt.title('Evolution of English and Spanish Datasets Over Time')\n",
744
+ "plt.xlabel('Year', fontsize=10)\n",
745
+ "plt.ylabel('Number of Datasets', fontsize=10)\n",
746
  "plt.legend()\n",
747
  "plt.grid(True)\n",
748
+ "plt.xticks(rotation=45, fontsize=10)\n",
749
  "plt.tight_layout()\n",
750
+ "plt.show()\n",
751
+ "\n",
752
+ "plt.savefig(\"time_series_3.png\")"
753
  ]
754
  }
755
  ],
756
  "metadata": {
 
757
  "colab": {
 
758
  "provenance": []
759
  },
760
  "kernelspec": {
 
762
  "name": "python3"
763
  },
764
  "language_info": {
765
+ "name": "python"
 
766
  }
767
  },
768
  "nbformat": 4,
769
  "nbformat_minor": 0
770
+ }
numero_datasets_hub_output.ipynb DELETED
@@ -1,918 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "id": "9b494ecb",
7
- "metadata": {
8
- "colab": {
9
- "base_uri": "https://localhost:8080/"
10
- },
11
- "id": "bCPvBCk_VLoi",
12
- "outputId": "48174b27-072f-4cf9-bfcc-2a7cb12f60ba",
13
- "papermill": {
14
- "duration": null,
15
- "end_time": null,
16
- "exception": null,
17
- "start_time": null,
18
- "status": "completed"
19
- },
20
- "tags": []
21
- },
22
- "outputs": [],
23
- "source": [
24
- "!pip install huggingface_hub"
25
- ]
26
- },
27
- {
28
- "cell_type": "code",
29
- "execution_count": null,
30
- "id": "d736660e",
31
- "metadata": {
32
- "id": "NbQeXxudVJW9",
33
- "papermill": {
34
- "duration": null,
35
- "end_time": null,
36
- "exception": null,
37
- "start_time": null,
38
- "status": "completed"
39
- },
40
- "tags": []
41
- },
42
- "outputs": [],
43
- "source": [
44
- "from datetime import datetime\n",
45
- "\n",
46
- "import matplotlib.pyplot as plt\n",
47
- "import pandas as pd\n",
48
- "from huggingface_hub import HfApi\n"
49
- ]
50
- },
51
- {
52
- "cell_type": "code",
53
- "execution_count": null,
54
- "id": "8dc1a8d8",
55
- "metadata": {
56
- "colab": {
57
- "base_uri": "https://localhost:8080/"
58
- },
59
- "id": "ogyTHBYJVZ8I",
60
- "outputId": "f23a554a-7328-4e50-d87c-90368294467d",
61
- "papermill": {
62
- "duration": null,
63
- "end_time": null,
64
- "exception": null,
65
- "start_time": null,
66
- "status": "completed"
67
- },
68
- "tags": []
69
- },
70
- "outputs": [],
71
- "source": [
72
- "hf_api = HfApi()\n",
73
- "\n",
74
- "all_datasets = hf_api.list_datasets(full=True)\n",
75
- "\n",
76
- "total_count = len(list(all_datasets))\n",
77
- "print(total_count)"
78
- ]
79
- },
80
- {
81
- "cell_type": "code",
82
- "execution_count": null,
83
- "id": "299e6d56",
84
- "metadata": {
85
- "id": "GXDMUU-4XmaI",
86
- "papermill": {
87
- "duration": null,
88
- "end_time": null,
89
- "exception": null,
90
- "start_time": null,
91
- "status": "completed"
92
- },
93
- "tags": []
94
- },
95
- "outputs": [],
96
- "source": [
97
- "# language_filter = filter(lambda dataset: 'language:es' in dataset.tags, all_datasets) # 882\n",
98
- "\n",
99
- "# spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
100
- "\n",
101
- "#filtered_datasets_2 = filter(lambda dataset: \"es\" in dataset.card_data.language, all_datasets) # 882\n",
102
- "\n",
103
- "#filtered_datasets_3 = filter(lambda dataset: dataset.card_data.language == [\"es\"], all_datasets) #\n",
104
- "\n",
105
- "#for dataset in spanish_only_datasets:\n",
106
- "# print(dataset)\n",
107
- "# break"
108
- ]
109
- },
110
- {
111
- "cell_type": "code",
112
- "execution_count": null,
113
- "id": "691d8f3a",
114
- "metadata": {
115
- "colab": {
116
- "base_uri": "https://localhost:8080/"
117
- },
118
- "id": "pjCvHVq_hChx",
119
- "outputId": "d279d0bc-a3c6-4994-f23c-a7274b1f4ee8",
120
- "papermill": {
121
- "duration": null,
122
- "end_time": null,
123
- "exception": null,
124
- "start_time": null,
125
- "status": "completed"
126
- },
127
- "tags": []
128
- },
129
- "outputs": [],
130
- "source": [
131
- "hf_api = HfApi()\n",
132
- "\n",
133
- "all_datasets = hf_api.list_datasets(full=True)\n",
134
- "\n",
135
- "spanish_filter = filter(lambda d: \"language:es\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:es\" for tag in d.tags), all_datasets) # 317\n",
136
- "spanish_datasets = list(spanish_filter)\n",
137
- "spanish_count = len(list(spanish_datasets))\n",
138
- "print(spanish_count)\n"
139
- ]
140
- },
141
- {
142
- "cell_type": "code",
143
- "execution_count": null,
144
- "id": "c9676c89",
145
- "metadata": {
146
- "colab": {
147
- "base_uri": "https://localhost:8080/"
148
- },
149
- "id": "WANGkTpGRw8t",
150
- "outputId": "da8931bf-7ae2-438d-8188-20190f568193",
151
- "papermill": {
152
- "duration": null,
153
- "end_time": null,
154
- "exception": null,
155
- "start_time": null,
156
- "status": "completed"
157
- },
158
- "tags": []
159
- },
160
- "outputs": [],
161
- "source": [
162
- "hf_api = HfApi()\n",
163
- "\n",
164
- "all_datasets = hf_api.list_datasets(full=True)\n",
165
- "\n",
166
- "english_filter = filter(lambda d: \"language:en\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:en\" for tag in d.tags), all_datasets)\n",
167
- "english_datasets = list(english_filter)\n",
168
- "english_count = len(list(english_datasets))\n",
169
- "print(english_count)"
170
- ]
171
- },
172
- {
173
- "cell_type": "code",
174
- "execution_count": null,
175
- "id": "bf300ce6",
176
- "metadata": {
177
- "colab": {
178
- "base_uri": "https://localhost:8080/"
179
- },
180
- "id": "yPtF0G7SWS53",
181
- "outputId": "a2a51160-c803-4e7f-a6dc-8879eea1dd69",
182
- "papermill": {
183
- "duration": null,
184
- "end_time": null,
185
- "exception": null,
186
- "start_time": null,
187
- "status": "completed"
188
- },
189
- "tags": []
190
- },
191
- "outputs": [],
192
- "source": [
193
- "hf_api = HfApi()\n",
194
- "\n",
195
- "all_datasets = hf_api.list_datasets(full=True)\n",
196
- "\n",
197
- "chinese_filter = filter(lambda d: \"language:zh\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:zh\" for tag in d.tags), all_datasets)\n",
198
- "chinese_datasets = list(chinese_filter)\n",
199
- "chinese_count = len(list(chinese_datasets))\n",
200
- "print(chinese_count)"
201
- ]
202
- },
203
- {
204
- "cell_type": "code",
205
- "execution_count": null,
206
- "id": "407c46fc",
207
- "metadata": {
208
- "colab": {
209
- "base_uri": "https://localhost:8080/"
210
- },
211
- "id": "RlxAlOOsW7p9",
212
- "outputId": "f1c12edd-5502-4018-b9a7-149f9fc29322",
213
- "papermill": {
214
- "duration": null,
215
- "end_time": null,
216
- "exception": null,
217
- "start_time": null,
218
- "status": "completed"
219
- },
220
- "tags": []
221
- },
222
- "outputs": [],
223
- "source": [
224
- "hf_api = HfApi()\n",
225
- "\n",
226
- "all_datasets = hf_api.list_datasets(full=True)\n",
227
- "\n",
228
- "french_filter = filter(lambda d: \"language:fr\" in d.tags and not any(tag.startswith(\"language:\") and tag != \"language:fr\" for tag in d.tags), all_datasets)\n",
229
- "french_datasets = list(french_filter)\n",
230
- "french_count = len(list(french_datasets))\n",
231
- "print(french_count)"
232
- ]
233
- },
234
- {
235
- "cell_type": "code",
236
- "execution_count": null,
237
- "id": "a7d82d5d",
238
- "metadata": {
239
- "colab": {
240
- "base_uri": "https://localhost:8080/"
241
- },
242
- "id": "OMQfBXjUYBPz",
243
- "outputId": "8cd3fdb9-0bc8-4d82-d25b-fb9eef7118ed",
244
- "papermill": {
245
- "duration": null,
246
- "end_time": null,
247
- "exception": null,
248
- "start_time": null,
249
- "status": "completed"
250
- },
251
- "tags": []
252
- },
253
- "outputs": [],
254
- "source": [
255
- "hf_api = HfApi()\n",
256
- "\n",
257
- "all_datasets = hf_api.list_datasets(full=True)\n",
258
- "\n",
259
- "mono_filter = filter(lambda dataset: sum(tag.startswith('language:') for tag in dataset.tags) == 1, all_datasets)\n",
260
- "mono_datasets = list(mono_filter)\n",
261
- "mono_count = len(list(mono_datasets))\n",
262
- "print(mono_count)"
263
- ]
264
- },
265
- {
266
- "cell_type": "code",
267
- "execution_count": null,
268
- "id": "6dc0ac68",
269
- "metadata": {
270
- "colab": {
271
- "base_uri": "https://localhost:8080/",
272
- "height": 180
273
- },
274
- "id": "sTPechkdWmYS",
275
- "outputId": "bb49f9f4-150b-4a29-d58e-faff4f88cce3",
276
- "papermill": {
277
- "duration": null,
278
- "end_time": null,
279
- "exception": null,
280
- "start_time": null,
281
- "status": "completed"
282
- },
283
- "tags": []
284
- },
285
- "outputs": [],
286
- "source": [
287
- "# Extract creation date\n",
288
- "\n",
289
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
290
- "assert len(creation_dates_spanish) == 318\n",
291
- "\n",
292
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
293
- "assert len(creation_dates_english) == 8336"
294
- ]
295
- },
296
- {
297
- "cell_type": "code",
298
- "execution_count": null,
299
- "id": "57d206ec",
300
- "metadata": {
301
- "id": "hefZVynDSjjE",
302
- "papermill": {
303
- "duration": null,
304
- "end_time": null,
305
- "exception": null,
306
- "start_time": null,
307
- "status": "completed"
308
- },
309
- "tags": []
310
- },
311
- "outputs": [],
312
- "source": [
313
- "print(creation_dates_spanish[0])"
314
- ]
315
- },
316
- {
317
- "cell_type": "markdown",
318
- "id": "b80e411d",
319
- "metadata": {
320
- "id": "aFaEBlkkSbrs",
321
- "papermill": {
322
- "duration": null,
323
- "end_time": null,
324
- "exception": null,
325
- "start_time": null,
326
- "status": "completed"
327
- },
328
- "tags": []
329
- },
330
- "source": [
331
- "## Bar Chart\n",
332
- "\n"
333
- ]
334
- },
335
- {
336
- "cell_type": "code",
337
- "execution_count": null,
338
- "id": "96652421",
339
- "metadata": {
340
- "id": "dYJ2zd4dShYh",
341
- "papermill": {
342
- "duration": null,
343
- "end_time": null,
344
- "exception": null,
345
- "start_time": null,
346
- "status": "completed"
347
- },
348
- "tags": []
349
- },
350
- "outputs": [],
351
- "source": [
352
- "import matplotlib.pyplot as plt\n",
353
- "from collections import Counter\n",
354
- "\n",
355
- "# Sample data (replace with your actual data)\n",
356
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
357
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
358
- "\n",
359
- "# Extract years from the creation dates\n",
360
- "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
361
- "english_counts = Counter(date.year for date in creation_dates_english)\n",
362
- "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
363
- "\n",
364
- "# Plotting the bar chart\n",
365
- "plt.figure(figsize=(10, 6))\n",
366
- "plt.bar(years, [english_counts[year] for year in years], width=0.4, label='English Datasets', color='blue')\n",
367
- "plt.bar(years, [spanish_counts[year] for year in years], width=0.4, label='Spanish Datasets', color='orange', bottom=[english_counts[year] for year in years])\n",
368
- "\n",
369
- "# Adding labels and title\n",
370
- "plt.xlabel('Year')\n",
371
- "plt.ylabel('Number of Datasets')\n",
372
- "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
373
- "plt.xticks(years)\n",
374
- "plt.legend()\n",
375
- "\n",
376
- "# Display the plot\n",
377
- "plt.grid(True)\n",
378
- "plt.tight_layout()\n",
379
- "plt.show()\n",
380
- "plt.savefig(\"plots/bar_stack.png\")\n"
381
- ]
382
- },
383
- {
384
- "cell_type": "code",
385
- "execution_count": null,
386
- "id": "2d1ae015",
387
- "metadata": {
388
- "id": "wViEE4wCUVgs",
389
- "papermill": {
390
- "duration": null,
391
- "end_time": null,
392
- "exception": null,
393
- "start_time": null,
394
- "status": "completed"
395
- },
396
- "tags": []
397
- },
398
- "outputs": [],
399
- "source": [
400
- "import matplotlib.pyplot as plt\n",
401
- "import numpy as np\n",
402
- "from collections import Counter\n",
403
- "\n",
404
- "# Sample data (replace with your actual data)\n",
405
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
406
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
407
- "\n",
408
- "# Extract years from the creation dates\n",
409
- "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
410
- "english_counts = Counter(date.year for date in creation_dates_english)\n",
411
- "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
412
- "\n",
413
- "# Define the width of each bar\n",
414
- "bar_width = 0.4\n",
415
- "\n",
416
- "# Define the x-coordinates for the bars\n",
417
- "years_index = np.arange(len(years))\n",
418
- "\n",
419
- "# Plotting the side-by-side bar chart\n",
420
- "plt.figure(figsize=(10, 6))\n",
421
- "plt.bar(years_index - bar_width/2, [english_counts[year] for year in years], width=bar_width, label='English Datasets', color='blue')\n",
422
- "plt.bar(years_index + bar_width/2, [spanish_counts[year] for year in years], width=bar_width, label='Spanish Datasets', color='orange')\n",
423
- "\n",
424
- "# Adding labels and title\n",
425
- "plt.xlabel('Year')\n",
426
- "plt.ylabel('Number of Datasets')\n",
427
- "plt.title('Distribution of Monolingual English and Spanish Datasets by Year')\n",
428
- "plt.xticks(years_index, years)\n",
429
- "plt.legend()\n",
430
- "\n",
431
- "# Display the plot\n",
432
- "plt.grid(True)\n",
433
- "plt.tight_layout()\n",
434
- "plt.show()\n",
435
- "plt.savefig(\"plots/bar_width.png\")"
436
- ]
437
- },
438
- {
439
- "cell_type": "markdown",
440
- "id": "cddf7237",
441
- "metadata": {
442
- "id": "Hp8vNA6LUA1E",
443
- "papermill": {
444
- "duration": null,
445
- "end_time": null,
446
- "exception": null,
447
- "start_time": null,
448
- "status": "completed"
449
- },
450
- "tags": []
451
- },
452
- "source": [
453
- "# Stacked Area Chart\n"
454
- ]
455
- },
456
- {
457
- "cell_type": "code",
458
- "execution_count": null,
459
- "id": "68255399",
460
- "metadata": {
461
- "id": "CWgCunzGUCot",
462
- "papermill": {
463
- "duration": null,
464
- "end_time": null,
465
- "exception": null,
466
- "start_time": null,
467
- "status": "completed"
468
- },
469
- "tags": []
470
- },
471
- "outputs": [],
472
- "source": [
473
- "import matplotlib.pyplot as plt\n",
474
- "from collections import Counter\n",
475
- "\n",
476
- "# Sample data (replace with your actual data)\n",
477
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
478
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
479
- "\n",
480
- "# Extract years from the creation dates\n",
481
- "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
482
- "english_counts = Counter(date.year for date in creation_dates_english)\n",
483
- "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
484
- "\n",
485
- "# Calculate cumulative counts\n",
486
- "english_datasets_cumulative = [english_counts[year] for year in years]\n",
487
- "spanish_datasets_cumulative = [spanish_counts[year] for year in years]\n",
488
- "for i in range(1, len(years)):\n",
489
- " english_datasets_cumulative[i] += english_datasets_cumulative[i-1]\n",
490
- " spanish_datasets_cumulative[i] += spanish_datasets_cumulative[i-1]\n",
491
- "\n",
492
- "# Plotting the stacked area chart\n",
493
- "plt.figure(figsize=(10, 6))\n",
494
- "plt.stackplot(years, english_datasets_cumulative, spanish_datasets_cumulative, labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
495
- "\n",
496
- "# Adding labels and title\n",
497
- "plt.xlabel('Year')\n",
498
- "plt.ylabel('Cumulative Number of Datasets')\n",
499
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
500
- "plt.xticks(years)\n",
501
- "plt.legend(loc='upper left')\n",
502
- "\n",
503
- "# Display the plot\n",
504
- "plt.grid(True)\n",
505
- "plt.tight_layout()\n",
506
- "plt.show()\n",
507
- "\n",
508
- "plt.savefig(\"plots/stack_area_1.png\")"
509
- ]
510
- },
511
- {
512
- "cell_type": "code",
513
- "execution_count": null,
514
- "id": "4ba74cf5",
515
- "metadata": {
516
- "id": "GwRpZwYWhau3",
517
- "papermill": {
518
- "duration": null,
519
- "end_time": null,
520
- "exception": null,
521
- "start_time": null,
522
- "status": "completed"
523
- },
524
- "tags": []
525
- },
526
- "outputs": [],
527
- "source": [
528
- "import matplotlib.pyplot as plt\n",
529
- "import pandas as pd\n",
530
- "from collections import Counter\n",
531
- "\n",
532
- "# Sample data (replace with your actual data)\n",
533
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
534
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
535
- "\n",
536
- "# Extract months from the creation dates\n",
537
- "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
538
- "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
539
- "\n",
540
- "# Count the occurrences of each month\n",
541
- "english_counts = Counter(months_english)\n",
542
- "spanish_counts = Counter(months_spanish)\n",
543
- "\n",
544
- "# Create a DataFrame for English datasets\n",
545
- "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
546
- "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
547
- "df_english = df_english.sort_index()\n",
548
- "\n",
549
- "# Create a DataFrame for Spanish datasets\n",
550
- "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
551
- "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
552
- "df_spanish = df_spanish.sort_index()\n",
553
- "\n",
554
- "# Merge the DataFrames\n",
555
- "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
556
- "\n",
557
- "# Convert index to datetime\n",
558
- "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
559
- "\n",
560
- "# Calculate cumulative sum\n",
561
- "df_cumulative = df.cumsum()\n",
562
- "\n",
563
- "# Plotting the stacked area chart\n",
564
- "plt.figure(figsize=(8, 5))\n",
565
- "plt.stackplot(df_cumulative.index, df_cumulative['English'], df_cumulative['Spanish'], labels=['English', 'Spanish'], colors=['orange', 'blue'])\n",
566
- "\n",
567
- "# Adding labels and title\n",
568
- "plt.xlabel('Creation date')\n",
569
- "plt.ylabel('Cumulative number of monolingual datasets')\n",
570
- "plt.title('Cumulative growth of monolingual English and Spanish datasets in the Hugging Face Hub over time')\n",
571
- "\n",
572
- "# Display the plot\n",
573
- "plt.xticks(rotation=45)\n",
574
- "plt.legend(loc='upper left')\n",
575
- "plt.grid(False)\n",
576
- "plt.tight_layout()\n",
577
- "plt.show()\n",
578
- "\n",
579
- "plt.savefig(\"plots/stack_area_2.png\")"
580
- ]
581
- },
582
- {
583
- "cell_type": "code",
584
- "execution_count": null,
585
- "id": "d96225ce",
586
- "metadata": {
587
- "id": "kJQ0OgRtglOQ",
588
- "papermill": {
589
- "duration": null,
590
- "end_time": null,
591
- "exception": null,
592
- "start_time": null,
593
- "status": "completed"
594
- },
595
- "tags": []
596
- },
597
- "outputs": [],
598
- "source": [
599
- "import matplotlib.pyplot as plt\n",
600
- "import pandas as pd\n",
601
- "from collections import Counter\n",
602
- "\n",
603
- "# Sample data (replace with your actual data)\n",
604
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
605
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
606
- "\n",
607
- "# Extract months from the creation dates\n",
608
- "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
609
- "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
610
- "\n",
611
- "# Count the occurrences of each month\n",
612
- "english_counts = Counter(months_english)\n",
613
- "spanish_counts = Counter(months_spanish)\n",
614
- "\n",
615
- "# Create a DataFrame for English datasets\n",
616
- "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
617
- "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
618
- "df_english = df_english.sort_index()\n",
619
- "\n",
620
- "# Create a DataFrame for Spanish datasets\n",
621
- "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
622
- "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
623
- "df_spanish = df_spanish.sort_index()\n",
624
- "\n",
625
- "# Merge the DataFrames\n",
626
- "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
627
- "\n",
628
- "# Convert index to datetime\n",
629
- "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
630
- "\n",
631
- "# Plotting the stacked area chart\n",
632
- "plt.figure(figsize=(10, 6))\n",
633
- "plt.stackplot(df.index, df['English'], df['Spanish'], labels=['English Datasets', 'Spanish Datasets'], colors=['blue', 'orange'])\n",
634
- "\n",
635
- "# Adding labels and title\n",
636
- "plt.xlabel('Date')\n",
637
- "plt.ylabel('Cumulative Number of Datasets')\n",
638
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
639
- "\n",
640
- "# Display the plot\n",
641
- "plt.xticks(rotation=45)\n",
642
- "plt.legend(loc='upper left')\n",
643
- "plt.grid(True)\n",
644
- "plt.tight_layout()\n",
645
- "plt.show()\n",
646
- "\n",
647
- "plt.savefig(\"plots/stack_area_3.png\")"
648
- ]
649
- },
650
- {
651
- "cell_type": "markdown",
652
- "id": "7bbec0ac",
653
- "metadata": {
654
- "id": "IAnFHiPlgnRE",
655
- "papermill": {
656
- "duration": null,
657
- "end_time": null,
658
- "exception": null,
659
- "start_time": null,
660
- "status": "completed"
661
- },
662
- "tags": []
663
- },
664
- "source": [
665
- "## Pie Chart"
666
- ]
667
- },
668
- {
669
- "cell_type": "code",
670
- "execution_count": null,
671
- "id": "7c3dd684",
672
- "metadata": {
673
- "id": "8tKR1x-kVeZT",
674
- "papermill": {
675
- "duration": null,
676
- "end_time": null,
677
- "exception": null,
678
- "start_time": null,
679
- "status": "completed"
680
- },
681
- "tags": []
682
- },
683
- "outputs": [],
684
- "source": [
685
- "import matplotlib.pyplot as plt\n",
686
- "from collections import Counter\n",
687
- "\n",
688
- "# Calculate the count of \"other\" datasets\n",
689
- "other_count = mono_count - (english_count + spanish_count + chinese_count + french_count)\n",
690
- "\n",
691
- "# Pie chart data\n",
692
- "labels = ['English', 'Chinese', 'French', 'Spanish', 'Other']\n",
693
- "sizes = [english_count, chinese_count, french_count, spanish_count, other_count]\n",
694
- "\n",
695
- "# Plotting the pie chart\n",
696
- "plt.figure(figsize=(8, 8))\n",
697
- "plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=180, colors=['blue', 'red', 'green', 'orange', 'purple'])\n",
698
- "plt.title('Distribution of Monolingual Datasets by Language')\n",
699
- "plt.axis('equal') # Equal aspect ratio ensures that pie is drawn as a circle.\n",
700
- "\n",
701
- "# Display the plot\n",
702
- "plt.show()\n",
703
- "\n",
704
- "plt.savefig(\"plots/pie_chart.png\")"
705
- ]
706
- },
707
- {
708
- "cell_type": "markdown",
709
- "id": "11c1c9c8",
710
- "metadata": {
711
- "id": "z2xf8FrHROxy",
712
- "papermill": {
713
- "duration": null,
714
- "end_time": null,
715
- "exception": null,
716
- "start_time": null,
717
- "status": "completed"
718
- },
719
- "tags": []
720
- },
721
- "source": [
722
- "# Time series plot"
723
- ]
724
- },
725
- {
726
- "cell_type": "code",
727
- "execution_count": null,
728
- "id": "1bb6a676",
729
- "metadata": {
730
- "id": "DuPFSZKUhyQj",
731
- "papermill": {
732
- "duration": null,
733
- "end_time": null,
734
- "exception": null,
735
- "start_time": null,
736
- "status": "completed"
737
- },
738
- "tags": []
739
- },
740
- "outputs": [],
741
- "source": [
742
- "# Prepare data for plotting\n",
743
- "\n",
744
- "df = pd.DataFrame(creation_dates_spanish, columns=[\"Date\"])\n",
745
- "df[\"Count\"] = 1\n",
746
- "# Ensure the 'Date' column is of type datetime\n",
747
- "df['Date'] = pd.to_datetime(df['Date'])\n",
748
- "# Group by month and calculate cumulative sum\n",
749
- "df = df.groupby(pd.Grouper(key=\"Date\", freq=\"MS\")).sum().cumsum()\n",
750
- "\n",
751
- "# Plot the data\n",
752
- "plt.figure(figsize=(10, 6))\n",
753
- "plt.plot(\n",
754
- " df.index,\n",
755
- " df[\"Count\"],\n",
756
- " #marker=\"o\",\n",
757
- " color=\"g\"\n",
758
- ")\n",
759
- "plt.title(\"Evolución de bases de datos monolingües en español\")\n",
760
- "plt.xlabel(\"Fecha\")\n",
761
- "plt.ylabel(\"Número de bases de datos\")\n",
762
- "plt.grid(True)\n",
763
- "plt.xticks(rotation=45)\n",
764
- "plt.tight_layout()\n",
765
- "plt.show()"
766
- ]
767
- },
768
- {
769
- "cell_type": "code",
770
- "execution_count": null,
771
- "id": "2fc77d7f",
772
- "metadata": {
773
- "id": "-Vu3PIe2hITq",
774
- "papermill": {
775
- "duration": null,
776
- "end_time": null,
777
- "exception": null,
778
- "start_time": null,
779
- "status": "completed"
780
- },
781
- "tags": []
782
- },
783
- "outputs": [],
784
- "source": [
785
- "import matplotlib.pyplot as plt\n",
786
- "import pandas as pd\n",
787
- "from collections import Counter\n",
788
- "\n",
789
- "# Sample data (replace with your actual data)\n",
790
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
791
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
792
- "\n",
793
- "# Extract months from the creation dates\n",
794
- "months_english = [(date.year, date.month) for date in creation_dates_english]\n",
795
- "months_spanish = [(date.year, date.month) for date in creation_dates_spanish]\n",
796
- "\n",
797
- "# Count the occurrences of each month\n",
798
- "english_counts = Counter(months_english)\n",
799
- "spanish_counts = Counter(months_spanish)\n",
800
- "\n",
801
- "# Create a DataFrame for English datasets\n",
802
- "df_english = pd.DataFrame.from_dict(english_counts, orient='index', columns=['English'])\n",
803
- "df_english.index = pd.MultiIndex.from_tuples(df_english.index, names=['Year', 'Month'])\n",
804
- "df_english = df_english.sort_index()\n",
805
- "\n",
806
- "# Create a DataFrame for Spanish datasets\n",
807
- "df_spanish = pd.DataFrame.from_dict(spanish_counts, orient='index', columns=['Spanish'])\n",
808
- "df_spanish.index = pd.MultiIndex.from_tuples(df_spanish.index, names=['Year', 'Month'])\n",
809
- "df_spanish = df_spanish.sort_index()\n",
810
- "\n",
811
- "# Merge the DataFrames\n",
812
- "df = pd.merge(df_english, df_spanish, how='outer', left_index=True, right_index=True).fillna(0)\n",
813
- "\n",
814
- "# Convert index to datetime\n",
815
- "df.index = pd.to_datetime(df.index.map(lambda x: f'{x[0]}-{x[1]}'))\n",
816
- "\n",
817
- "# Calculate cumulative sum\n",
818
- "df_cumulative = df.cumsum()\n",
819
- "\n",
820
- "# Plotting the cumulative chart\n",
821
- "plt.figure(figsize=(10, 6))\n",
822
- "plt.plot(df_cumulative.index, df_cumulative['English'], label='English', color='blue')\n",
823
- "plt.plot(df_cumulative.index, df_cumulative['Spanish'], label='Spanish', color='orange')\n",
824
- "\n",
825
- "# Adding labels and title\n",
826
- "plt.xlabel('Date')\n",
827
- "plt.ylabel('Cumulative Number of Datasets')\n",
828
- "plt.title('Cumulative Growth of Monolingual English and Spanish Datasets Over Time')\n",
829
- "\n",
830
- "# Display the plot\n",
831
- "plt.xticks(rotation=45)\n",
832
- "plt.legend(loc='upper left')\n",
833
- "plt.grid(True)\n",
834
- "plt.tight_layout()\n",
835
- "plt.show()\n"
836
- ]
837
- },
838
- {
839
- "cell_type": "code",
840
- "execution_count": null,
841
- "id": "6c0d23ac",
842
- "metadata": {
843
- "id": "KG__of2IfdHu",
844
- "papermill": {
845
- "duration": null,
846
- "end_time": null,
847
- "exception": null,
848
- "start_time": null,
849
- "status": "completed"
850
- },
851
- "tags": []
852
- },
853
- "outputs": [],
854
- "source": [
855
- "import matplotlib.pyplot as plt\n",
856
- "import pandas as pd\n",
857
- "from collections import Counter\n",
858
- "\n",
859
- "# Sample data (replace with your actual data)\n",
860
- "creation_dates_english = [d.created_at.date() for d in english_datasets]\n",
861
- "creation_dates_spanish = [d.created_at.date() for d in spanish_datasets]\n",
862
- "\n",
863
- "# Extract years from the creation dates\n",
864
- "years = sorted(set(date.year for date in creation_dates_english + creation_dates_spanish))\n",
865
- "english_counts = Counter(date.year for date in creation_dates_english)\n",
866
- "spanish_counts = Counter(date.year for date in creation_dates_spanish)\n",
867
- "\n",
868
- "# Prepare data for plotting\n",
869
- "english_series = pd.Series([english_counts[year] for year in years], index=years)\n",
870
- "spanish_series = pd.Series([spanish_counts[year] for year in years], index=years)\n",
871
- "\n",
872
- "# Plotting the time series\n",
873
- "plt.figure(figsize=(10, 6))\n",
874
- "plt.plot(english_series.index, english_series.values, label='English', color='blue')\n",
875
- "plt.plot(spanish_series.index, spanish_series.values, label='Spanish', color='orange')\n",
876
- "\n",
877
- "# Adding labels and title\n",
878
- "plt.title('Evolution of English and Spanish Datasets Over Time')\n",
879
- "plt.xlabel('Year')\n",
880
- "plt.ylabel('Number of Datasets')\n",
881
- "plt.legend()\n",
882
- "plt.grid(True)\n",
883
- "plt.xticks(rotation=45)\n",
884
- "plt.tight_layout()\n",
885
- "plt.show()\n"
886
- ]
887
- }
888
- ],
889
- "metadata": {
890
- "accelerator": "GPU",
891
- "colab": {
892
- "gpuType": "T4",
893
- "provenance": []
894
- },
895
- "kernelspec": {
896
- "display_name": "Python 3",
897
- "name": "python3"
898
- },
899
- "language_info": {
900
- "name": "python",
901
- "version": "3.11.6"
902
- },
903
- "papermill": {
904
- "default_parameters": {},
905
- "duration": 0.047858,
906
- "end_time": "2024-05-15T09:04:29.634379",
907
- "environment_variables": {},
908
- "exception": null,
909
- "input_path": "numero_datasets_hub.ipynb",
910
- "output_path": "numero_datasets_hub_output.ipynb",
911
- "parameters": {},
912
- "start_time": "2024-05-15T09:04:29.586521",
913
- "version": "2.6.0"
914
- }
915
- },
916
- "nbformat": 4,
917
- "nbformat_minor": 5
918
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
plots/bar_plot_horizontal.png ADDED
plots/bar_plot_vertical.png ADDED
plots/datasets_hub.png DELETED
Binary file (46.9 kB)
 
plots/stack_area.png ADDED
plots/stack_area_es.png ADDED
plots/time_series.png ADDED