Adding sampling to mc4
Browse files- bertin/__init__.py +0 -0
- mc4/README.md +525 -0
- mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
- mc4/mc4.py +394 -0
- run_mlm_flax.py +2 -2
- run_mlm_flax_stream.py +14 -63
- run_stream.sh +4 -3
bertin/__init__.py
DELETED
File without changes
|
mc4/README.md
ADDED
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
pretty_name: mC4
|
3 |
+
annotations_creators:
|
4 |
+
- no-annotation
|
5 |
+
language_creators:
|
6 |
+
- found
|
7 |
+
languages:
|
8 |
+
- af
|
9 |
+
- am
|
10 |
+
- ar
|
11 |
+
- az
|
12 |
+
- be
|
13 |
+
- bg
|
14 |
+
- bg-Latn
|
15 |
+
- bn
|
16 |
+
- ca
|
17 |
+
- ceb
|
18 |
+
- co
|
19 |
+
- cs
|
20 |
+
- cy
|
21 |
+
- da
|
22 |
+
- de
|
23 |
+
- el
|
24 |
+
- el-Latn
|
25 |
+
- en
|
26 |
+
- eo
|
27 |
+
- es
|
28 |
+
- et
|
29 |
+
- eu
|
30 |
+
- fa
|
31 |
+
- fi
|
32 |
+
- fil
|
33 |
+
- fr
|
34 |
+
- fy
|
35 |
+
- ga
|
36 |
+
- gd
|
37 |
+
- gl
|
38 |
+
- gu
|
39 |
+
- ha
|
40 |
+
- haw
|
41 |
+
- hi
|
42 |
+
- hi-Latn
|
43 |
+
- hmn
|
44 |
+
- ht
|
45 |
+
- hu
|
46 |
+
- hy
|
47 |
+
- id
|
48 |
+
- ig
|
49 |
+
- is
|
50 |
+
- it
|
51 |
+
- iw
|
52 |
+
- ja
|
53 |
+
- ja-Latn
|
54 |
+
- jv
|
55 |
+
- ka
|
56 |
+
- kk
|
57 |
+
- km
|
58 |
+
- kn
|
59 |
+
- ko
|
60 |
+
- ku
|
61 |
+
- ky
|
62 |
+
- la
|
63 |
+
- lb
|
64 |
+
- lo
|
65 |
+
- lt
|
66 |
+
- lv
|
67 |
+
- mg
|
68 |
+
- mi
|
69 |
+
- mk
|
70 |
+
- ml
|
71 |
+
- mn
|
72 |
+
- mr
|
73 |
+
- ms
|
74 |
+
- mt
|
75 |
+
- my
|
76 |
+
- ne
|
77 |
+
- nl
|
78 |
+
- "no"
|
79 |
+
- ny
|
80 |
+
- pa
|
81 |
+
- pl
|
82 |
+
- ps
|
83 |
+
- pt
|
84 |
+
- ro
|
85 |
+
- ru
|
86 |
+
- ru-Latn
|
87 |
+
- sd
|
88 |
+
- si
|
89 |
+
- sk
|
90 |
+
- sl
|
91 |
+
- sm
|
92 |
+
- sn
|
93 |
+
- so
|
94 |
+
- sq
|
95 |
+
- sr
|
96 |
+
- st
|
97 |
+
- su
|
98 |
+
- sv
|
99 |
+
- sw
|
100 |
+
- ta
|
101 |
+
- te
|
102 |
+
- tg
|
103 |
+
- th
|
104 |
+
- tr
|
105 |
+
- uk
|
106 |
+
- und
|
107 |
+
- ur
|
108 |
+
- uz
|
109 |
+
- vi
|
110 |
+
- xh
|
111 |
+
- yi
|
112 |
+
- yo
|
113 |
+
- zh
|
114 |
+
- zh-Latn
|
115 |
+
- zu
|
116 |
+
licenses:
|
117 |
+
- odc-by-1.0
|
118 |
+
multilinguality:
|
119 |
+
- multilingual
|
120 |
+
size_categories:
|
121 |
+
- n<1K
|
122 |
+
- 1K<n<10K
|
123 |
+
- 10K<n<100K
|
124 |
+
- 100K<n<1M
|
125 |
+
- 1M<n<10M
|
126 |
+
- 10M<n<100M
|
127 |
+
- 100M<n<1B
|
128 |
+
- 1B<n<10B
|
129 |
+
source_datasets:
|
130 |
+
- original
|
131 |
+
task_categories:
|
132 |
+
- sequence-modeling
|
133 |
+
task_ids:
|
134 |
+
- language-modeling
|
135 |
+
paperswithcode_id: mc4
|
136 |
+
---
|
137 |
+
|
138 |
+
# Dataset Card for mC4
|
139 |
+
|
140 |
+
## Table of Contents
|
141 |
+
|
142 |
+
- [Dataset Card for mC4](#dataset-card-for-mc4)
|
143 |
+
- [Table of Contents](#table-of-contents)
|
144 |
+
- [Dataset Description](#dataset-description)
|
145 |
+
- [Dataset Summary](#dataset-summary)
|
146 |
+
- [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
|
147 |
+
- [Languages](#languages)
|
148 |
+
- [Dataset Structure](#dataset-structure)
|
149 |
+
- [Data Instances](#data-instances)
|
150 |
+
- [Data Fields](#data-fields)
|
151 |
+
- [Data Splits](#data-splits)
|
152 |
+
- [Dataset Creation](#dataset-creation)
|
153 |
+
- [Curation Rationale](#curation-rationale)
|
154 |
+
- [Source Data](#source-data)
|
155 |
+
- [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
|
156 |
+
- [Who are the source language producers?](#who-are-the-source-language-producers)
|
157 |
+
- [Annotations](#annotations)
|
158 |
+
- [Annotation process](#annotation-process)
|
159 |
+
- [Who are the annotators?](#who-are-the-annotators)
|
160 |
+
- [Personal and Sensitive Information](#personal-and-sensitive-information)
|
161 |
+
- [Considerations for Using the Data](#considerations-for-using-the-data)
|
162 |
+
- [Social Impact of Dataset](#social-impact-of-dataset)
|
163 |
+
- [Discussion of Biases](#discussion-of-biases)
|
164 |
+
- [Other Known Limitations](#other-known-limitations)
|
165 |
+
- [Additional Information](#additional-information)
|
166 |
+
- [Dataset Curators](#dataset-curators)
|
167 |
+
- [Licensing Information](#licensing-information)
|
168 |
+
- [Citation Information](#citation-information)
|
169 |
+
- [Contributions](#contributions)
|
170 |
+
|
171 |
+
## Dataset Description
|
172 |
+
|
173 |
+
- **Homepage:** https://huggingface.co/datasets/allenai/c4
|
174 |
+
- **Paper:** https://arxiv.org/abs/1910.10683
|
175 |
+
|
176 |
+
### Dataset Summary
|
177 |
+
|
178 |
+
A colossal, cleaned, multilingual version of Common Crawl's web crawl corpus. Based on the Common Crawl dataset: "https://commoncrawl.org".
|
179 |
+
|
180 |
+
This is the version prepared by AllenAI, hosted at this address: https://huggingface.co/datasets/allenai/c4
|
181 |
+
|
182 |
+
108 languages are available and are reported in the table below.
|
183 |
+
|
184 |
+
Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
|
185 |
+
|
186 |
+
| language code | language name |
|
187 |
+
|:----------------|:---------------------|
|
188 |
+
| af | Afrikaans |
|
189 |
+
| am | Amharic |
|
190 |
+
| ar | Arabic |
|
191 |
+
| az | Azerbaijani |
|
192 |
+
| be | Belarusian |
|
193 |
+
| bg | Bulgarian |
|
194 |
+
| bg-Latn | Bulgarian (Latin) |
|
195 |
+
| bn | Bangla |
|
196 |
+
| ca | Catalan |
|
197 |
+
| ceb | Cebuano |
|
198 |
+
| co | Corsican |
|
199 |
+
| cs | Czech |
|
200 |
+
| cy | Welsh |
|
201 |
+
| da | Danish |
|
202 |
+
| de | German |
|
203 |
+
| el | Greek |
|
204 |
+
| el-Latn | Greek (Latin) |
|
205 |
+
| en | English |
|
206 |
+
| eo | Esperanto |
|
207 |
+
| es | Spanish |
|
208 |
+
| et | Estonian |
|
209 |
+
| eu | Basque |
|
210 |
+
| fa | Persian |
|
211 |
+
| fi | Finnish |
|
212 |
+
| fil | Filipino |
|
213 |
+
| fr | French |
|
214 |
+
| fy | Western Frisian |
|
215 |
+
| ga | Irish |
|
216 |
+
| gd | Scottish Gaelic |
|
217 |
+
| gl | Galician |
|
218 |
+
| gu | Gujarati |
|
219 |
+
| ha | Hausa |
|
220 |
+
| haw | Hawaiian |
|
221 |
+
| hi | Hindi |
|
222 |
+
| hi-Latn | Hindi (Latin script) |
|
223 |
+
| hmn | Hmong, Mong |
|
224 |
+
| ht | Haitian |
|
225 |
+
| hu | Hungarian |
|
226 |
+
| hy | Armenian |
|
227 |
+
| id | Indonesian |
|
228 |
+
| ig | Igbo |
|
229 |
+
| is | Icelandic |
|
230 |
+
| it | Italian |
|
231 |
+
| iw | Hebrew (former code) |
|
232 |
+
| ja | Japanese |
|
233 |
+
| ja-Latn | Japanese (Latin) |
|
234 |
+
| jv | Javanese |
|
235 |
+
| ka | Georgian |
|
236 |
+
| kk | Kazakh |
|
237 |
+
| km | Khmer |
|
238 |
+
| kn | Kannada |
|
239 |
+
| ko | Korean |
|
240 |
+
| ku | Kurdish |
|
241 |
+
| ky | Kyrgyz |
|
242 |
+
| la | Latin |
|
243 |
+
| lb | Luxembourgish |
|
244 |
+
| lo | Lao |
|
245 |
+
| lt | Lithuanian |
|
246 |
+
| lv | Latvian |
|
247 |
+
| mg | Malagasy |
|
248 |
+
| mi | Maori |
|
249 |
+
| mk | Macedonian |
|
250 |
+
| ml | Malayalam |
|
251 |
+
| mn | Mongolian |
|
252 |
+
| mr | Marathi |
|
253 |
+
| ms | Malay |
|
254 |
+
| mt | Maltese |
|
255 |
+
| my | Burmese |
|
256 |
+
| ne | Nepali |
|
257 |
+
| nl | Dutch |
|
258 |
+
| no | Norwegian |
|
259 |
+
| ny | Nyanja |
|
260 |
+
| pa | Punjabi |
|
261 |
+
| pl | Polish |
|
262 |
+
| ps | Pashto |
|
263 |
+
| pt | Portuguese |
|
264 |
+
| ro | Romanian |
|
265 |
+
| ru | Russian |
|
266 |
+
| ru-Latn | Russian (Latin) |
|
267 |
+
| sd | Sindhi |
|
268 |
+
| si | Sinhala |
|
269 |
+
| sk | Slovak |
|
270 |
+
| sl | Slovenian |
|
271 |
+
| sm | Samoan |
|
272 |
+
| sn | Shona |
|
273 |
+
| so | Somali |
|
274 |
+
| sq | Albanian |
|
275 |
+
| sr | Serbian |
|
276 |
+
| st | Southern Sotho |
|
277 |
+
| su | Sundanese |
|
278 |
+
| sv | Swedish |
|
279 |
+
| sw | Swahili |
|
280 |
+
| ta | Tamil |
|
281 |
+
| te | Telugu |
|
282 |
+
| tg | Tajik |
|
283 |
+
| th | Thai |
|
284 |
+
| tr | Turkish |
|
285 |
+
| uk | Ukrainian |
|
286 |
+
| und | Unknown language |
|
287 |
+
| ur | Urdu |
|
288 |
+
| uz | Uzbek |
|
289 |
+
| vi | Vietnamese |
|
290 |
+
| xh | Xhosa |
|
291 |
+
| yi | Yiddish |
|
292 |
+
| yo | Yoruba |
|
293 |
+
| zh | Chinese |
|
294 |
+
| zh-Latn | Chinese (Latin) |
|
295 |
+
| zu | Zulu |
|
296 |
+
|
297 |
+
You can load the mC4 subset of any language like this:
|
298 |
+
|
299 |
+
```python
|
300 |
+
from datasets import load_dataset
|
301 |
+
|
302 |
+
en_mc4 = load_dataset("mc4", "en")
|
303 |
+
```
|
304 |
+
|
305 |
+
You can even specify a list of languages:
|
306 |
+
|
307 |
+
```python
|
308 |
+
from datasets import load_dataset
|
309 |
+
|
310 |
+
mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
|
311 |
+
```
|
312 |
+
|
313 |
+
### Supported Tasks and Leaderboards
|
314 |
+
|
315 |
+
mC4 is mainly intended to pretrain language models and word representations.
|
316 |
+
|
317 |
+
### Languages
|
318 |
+
|
319 |
+
The dataset supports 108 languages.
|
320 |
+
|
321 |
+
## Dataset Structure
|
322 |
+
|
323 |
+
### Data Instances
|
324 |
+
|
325 |
+
An example from the `en` config is:
|
326 |
+
|
327 |
+
```
|
328 |
+
{'timestamp': '2018-06-24T01:32:39Z',
|
329 |
+
'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
|
330 |
+
'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
|
331 |
+
```
|
332 |
+
|
333 |
+
### Data Fields
|
334 |
+
|
335 |
+
The data have several fields:
|
336 |
+
|
337 |
+
- `url`: url of the source as a string
|
338 |
+
- `text`: text content as a string
|
339 |
+
- `timestamp`: timestamp as a string
|
340 |
+
|
341 |
+
### Data Splits
|
342 |
+
|
343 |
+
To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
|
344 |
+
|
345 |
+
| config | train | validation |
|
346 |
+
|:---------|:--------|:-------------|
|
347 |
+
| af | ? | ? |
|
348 |
+
| am | ? | ? |
|
349 |
+
| ar | ? | ? |
|
350 |
+
| az | ? | ? |
|
351 |
+
| be | ? | ? |
|
352 |
+
| bg | ? | ? |
|
353 |
+
| bg-Latn | ? | ? |
|
354 |
+
| bn | ? | ? |
|
355 |
+
| ca | ? | ? |
|
356 |
+
| ceb | ? | ? |
|
357 |
+
| co | ? | ? |
|
358 |
+
| cs | ? | ? |
|
359 |
+
| cy | ? | ? |
|
360 |
+
| da | ? | ? |
|
361 |
+
| de | ? | ? |
|
362 |
+
| el | ? | ? |
|
363 |
+
| el-Latn | ? | ? |
|
364 |
+
| en | ? | ? |
|
365 |
+
| eo | ? | ? |
|
366 |
+
| es | ? | ? |
|
367 |
+
| et | ? | ? |
|
368 |
+
| eu | ? | ? |
|
369 |
+
| fa | ? | ? |
|
370 |
+
| fi | ? | ? |
|
371 |
+
| fil | ? | ? |
|
372 |
+
| fr | ? | ? |
|
373 |
+
| fy | ? | ? |
|
374 |
+
| ga | ? | ? |
|
375 |
+
| gd | ? | ? |
|
376 |
+
| gl | ? | ? |
|
377 |
+
| gu | ? | ? |
|
378 |
+
| ha | ? | ? |
|
379 |
+
| haw | ? | ? |
|
380 |
+
| hi | ? | ? |
|
381 |
+
| hi-Latn | ? | ? |
|
382 |
+
| hmn | ? | ? |
|
383 |
+
| ht | ? | ? |
|
384 |
+
| hu | ? | ? |
|
385 |
+
| hy | ? | ? |
|
386 |
+
| id | ? | ? |
|
387 |
+
| ig | ? | ? |
|
388 |
+
| is | ? | ? |
|
389 |
+
| it | ? | ? |
|
390 |
+
| iw | ? | ? |
|
391 |
+
| ja | ? | ? |
|
392 |
+
| ja-Latn | ? | ? |
|
393 |
+
| jv | ? | ? |
|
394 |
+
| ka | ? | ? |
|
395 |
+
| kk | ? | ? |
|
396 |
+
| km | ? | ? |
|
397 |
+
| kn | ? | ? |
|
398 |
+
| ko | ? | ? |
|
399 |
+
| ku | ? | ? |
|
400 |
+
| ky | ? | ? |
|
401 |
+
| la | ? | ? |
|
402 |
+
| lb | ? | ? |
|
403 |
+
| lo | ? | ? |
|
404 |
+
| lt | ? | ? |
|
405 |
+
| lv | ? | ? |
|
406 |
+
| mg | ? | ? |
|
407 |
+
| mi | ? | ? |
|
408 |
+
| mk | ? | ? |
|
409 |
+
| ml | ? | ? |
|
410 |
+
| mn | ? | ? |
|
411 |
+
| mr | ? | ? |
|
412 |
+
| ms | ? | ? |
|
413 |
+
| mt | ? | ? |
|
414 |
+
| my | ? | ? |
|
415 |
+
| ne | ? | ? |
|
416 |
+
| nl | ? | ? |
|
417 |
+
| no | ? | ? |
|
418 |
+
| ny | ? | ? |
|
419 |
+
| pa | ? | ? |
|
420 |
+
| pl | ? | ? |
|
421 |
+
| ps | ? | ? |
|
422 |
+
| pt | ? | ? |
|
423 |
+
| ro | ? | ? |
|
424 |
+
| ru | ? | ? |
|
425 |
+
| ru-Latn | ? | ? |
|
426 |
+
| sd | ? | ? |
|
427 |
+
| si | ? | ? |
|
428 |
+
| sk | ? | ? |
|
429 |
+
| sl | ? | ? |
|
430 |
+
| sm | ? | ? |
|
431 |
+
| sn | ? | ? |
|
432 |
+
| so | ? | ? |
|
433 |
+
| sq | ? | ? |
|
434 |
+
| sr | ? | ? |
|
435 |
+
| st | ? | ? |
|
436 |
+
| su | ? | ? |
|
437 |
+
| sv | ? | ? |
|
438 |
+
| sw | ? | ? |
|
439 |
+
| ta | ? | ? |
|
440 |
+
| te | ? | ? |
|
441 |
+
| tg | ? | ? |
|
442 |
+
| th | ? | ? |
|
443 |
+
| tr | ? | ? |
|
444 |
+
| uk | ? | ? |
|
445 |
+
| und | ? | ? |
|
446 |
+
| ur | ? | ? |
|
447 |
+
| uz | ? | ? |
|
448 |
+
| vi | ? | ? |
|
449 |
+
| xh | ? | ? |
|
450 |
+
| yi | ? | ? |
|
451 |
+
| yo | ? | ? |
|
452 |
+
| zh | ? | ? |
|
453 |
+
| zh-Latn | ? | ? |
|
454 |
+
| zu | ? | ? |
|
455 |
+
|
456 |
+
## Dataset Creation
|
457 |
+
|
458 |
+
### Curation Rationale
|
459 |
+
|
460 |
+
[More Information Needed]
|
461 |
+
|
462 |
+
### Source Data
|
463 |
+
|
464 |
+
#### Initial Data Collection and Normalization
|
465 |
+
|
466 |
+
[More Information Needed]
|
467 |
+
|
468 |
+
#### Who are the source language producers?
|
469 |
+
|
470 |
+
[More Information Needed]
|
471 |
+
|
472 |
+
### Annotations
|
473 |
+
|
474 |
+
#### Annotation process
|
475 |
+
|
476 |
+
[More Information Needed]
|
477 |
+
|
478 |
+
#### Who are the annotators?
|
479 |
+
|
480 |
+
[More Information Needed]
|
481 |
+
|
482 |
+
### Personal and Sensitive Information
|
483 |
+
|
484 |
+
[More Information Needed]
|
485 |
+
|
486 |
+
## Considerations for Using the Data
|
487 |
+
|
488 |
+
### Social Impact of Dataset
|
489 |
+
|
490 |
+
[More Information Needed]
|
491 |
+
|
492 |
+
### Discussion of Biases
|
493 |
+
|
494 |
+
[More Information Needed]
|
495 |
+
|
496 |
+
### Other Known Limitations
|
497 |
+
|
498 |
+
[More Information Needed]
|
499 |
+
|
500 |
+
## Additional Information
|
501 |
+
|
502 |
+
### Dataset Curators
|
503 |
+
|
504 |
+
[More Information Needed]
|
505 |
+
|
506 |
+
### Licensing Information
|
507 |
+
|
508 |
+
AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
|
509 |
+
|
510 |
+
### Citation Information
|
511 |
+
|
512 |
+
```
|
513 |
+
@article{2019t5,
|
514 |
+
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
|
515 |
+
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
|
516 |
+
journal = {arXiv e-prints},
|
517 |
+
year = {2019},
|
518 |
+
archivePrefix = {arXiv},
|
519 |
+
eprint = {1910.10683},
|
520 |
+
}
|
521 |
+
```
|
522 |
+
|
523 |
+
### Contributions
|
524 |
+
|
525 |
+
Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
|
mc4/dummy/af/0.0.0/dummy_data.zip
ADDED
Binary file (8.54 kB). View file
|
|
mc4/mc4.py
ADDED
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""mC4 dataset based on Common Crawl."""
|
2 |
+
|
3 |
+
|
4 |
+
import gzip
|
5 |
+
import json
|
6 |
+
|
7 |
+
import datasets
|
8 |
+
import kenlm
|
9 |
+
import numpy as np
|
10 |
+
|
11 |
+
|
12 |
+
logger = datasets.logging.get_logger(__name__)
|
13 |
+
|
14 |
+
|
15 |
+
_DESCRIPTION = """\
|
16 |
+
A colossal, cleaned version of Common Crawl's web crawl corpus.
|
17 |
+
|
18 |
+
Based on Common Crawl dataset: "https://commoncrawl.org".
|
19 |
+
|
20 |
+
This is the processed version of Google's mC4 dataset by AllenAI.
|
21 |
+
"""
|
22 |
+
|
23 |
+
_CITATION = """
|
24 |
+
@article{2019t5,
|
25 |
+
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
|
26 |
+
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
|
27 |
+
journal = {arXiv e-prints},
|
28 |
+
year = {2019},
|
29 |
+
archivePrefix = {arXiv},
|
30 |
+
eprint = {1910.10683},
|
31 |
+
}
|
32 |
+
"""
|
33 |
+
|
34 |
+
_URL = "https://github.com/allenai/allennlp/discussions/5056"
|
35 |
+
|
36 |
+
_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
|
37 |
+
|
38 |
+
_LANGUAGES = [
|
39 |
+
"af",
|
40 |
+
"am",
|
41 |
+
"ar",
|
42 |
+
"az",
|
43 |
+
"be",
|
44 |
+
"bg",
|
45 |
+
"bg-Latn",
|
46 |
+
"bn",
|
47 |
+
"ca",
|
48 |
+
"ceb",
|
49 |
+
"co",
|
50 |
+
"cs",
|
51 |
+
"cy",
|
52 |
+
"da",
|
53 |
+
"de",
|
54 |
+
"el",
|
55 |
+
"el-Latn",
|
56 |
+
"en",
|
57 |
+
"eo",
|
58 |
+
"es",
|
59 |
+
"et",
|
60 |
+
"eu",
|
61 |
+
"fa",
|
62 |
+
"fi",
|
63 |
+
"fil",
|
64 |
+
"fr",
|
65 |
+
"fy",
|
66 |
+
"ga",
|
67 |
+
"gd",
|
68 |
+
"gl",
|
69 |
+
"gu",
|
70 |
+
"ha",
|
71 |
+
"haw",
|
72 |
+
"hi",
|
73 |
+
"hi-Latn",
|
74 |
+
"hmn",
|
75 |
+
"ht",
|
76 |
+
"hu",
|
77 |
+
"hy",
|
78 |
+
"id",
|
79 |
+
"ig",
|
80 |
+
"is",
|
81 |
+
"it",
|
82 |
+
"iw",
|
83 |
+
"ja",
|
84 |
+
"ja-Latn",
|
85 |
+
"jv",
|
86 |
+
"ka",
|
87 |
+
"kk",
|
88 |
+
"km",
|
89 |
+
"kn",
|
90 |
+
"ko",
|
91 |
+
"ku",
|
92 |
+
"ky",
|
93 |
+
"la",
|
94 |
+
"lb",
|
95 |
+
"lo",
|
96 |
+
"lt",
|
97 |
+
"lv",
|
98 |
+
"mg",
|
99 |
+
"mi",
|
100 |
+
"mk",
|
101 |
+
"ml",
|
102 |
+
"mn",
|
103 |
+
"mr",
|
104 |
+
"ms",
|
105 |
+
"mt",
|
106 |
+
"my",
|
107 |
+
"ne",
|
108 |
+
"nl",
|
109 |
+
"no",
|
110 |
+
"ny",
|
111 |
+
"pa",
|
112 |
+
"pl",
|
113 |
+
"ps",
|
114 |
+
"pt",
|
115 |
+
"ro",
|
116 |
+
"ru",
|
117 |
+
"ru-Latn",
|
118 |
+
"sd",
|
119 |
+
"si",
|
120 |
+
"sk",
|
121 |
+
"sl",
|
122 |
+
"sm",
|
123 |
+
"sn",
|
124 |
+
"so",
|
125 |
+
"sq",
|
126 |
+
"sr",
|
127 |
+
"st",
|
128 |
+
"su",
|
129 |
+
"sv",
|
130 |
+
"sw",
|
131 |
+
"ta",
|
132 |
+
"te",
|
133 |
+
"tg",
|
134 |
+
"th",
|
135 |
+
"tr",
|
136 |
+
"uk",
|
137 |
+
"und",
|
138 |
+
"ur",
|
139 |
+
"uz",
|
140 |
+
"vi",
|
141 |
+
"xh",
|
142 |
+
"yi",
|
143 |
+
"yo",
|
144 |
+
"zh",
|
145 |
+
"zh-Latn",
|
146 |
+
"zu",
|
147 |
+
]
|
148 |
+
|
149 |
+
_N_SHARDS_PER_SPLIT = {
|
150 |
+
"af": {"train": 64, "validation": 1},
|
151 |
+
"am": {"train": 16, "validation": 1},
|
152 |
+
"ar": {"train": 1024, "validation": 4},
|
153 |
+
"az": {"train": 256, "validation": 1},
|
154 |
+
"be": {"train": 128, "validation": 1},
|
155 |
+
"bg": {"train": 1024, "validation": 1},
|
156 |
+
"bg-Latn": {"train": 4, "validation": 1},
|
157 |
+
"bn": {"train": 512, "validation": 1},
|
158 |
+
"ca": {"train": 512, "validation": 1},
|
159 |
+
"ceb": {"train": 8, "validation": 1},
|
160 |
+
"co": {"train": 8, "validation": 1},
|
161 |
+
"cs": {"train": 1024, "validation": 2},
|
162 |
+
"cy": {"train": 256, "validation": 1},
|
163 |
+
"da": {"train": 1024, "validation": 1},
|
164 |
+
"de": {"train": 2048, "validation": 16},
|
165 |
+
"el": {"train": 1024, "validation": 2},
|
166 |
+
"el-Latn": {"train": 16, "validation": 1},
|
167 |
+
"en": {"train": 11264, "validation": 128},
|
168 |
+
"eo": {"train": 32, "validation": 1},
|
169 |
+
"es": {"train": 2048, "validation": 16},
|
170 |
+
"et": {"train": 256, "validation": 1},
|
171 |
+
"eu": {"train": 64, "validation": 1},
|
172 |
+
"fa": {"train": 1024, "validation": 2},
|
173 |
+
"fi": {"train": 1024, "validation": 1},
|
174 |
+
"fil": {"train": 64, "validation": 1},
|
175 |
+
"fr": {"train": 2048, "validation": 16},
|
176 |
+
"fy": {"train": 16, "validation": 1},
|
177 |
+
"ga": {"train": 16, "validation": 1},
|
178 |
+
"gd": {"train": 16, "validation": 1},
|
179 |
+
"gl": {"train": 128, "validation": 1},
|
180 |
+
"gu": {"train": 64, "validation": 1},
|
181 |
+
"ha": {"train": 8, "validation": 1},
|
182 |
+
"haw": {"train": 2, "validation": 1},
|
183 |
+
"hi": {"train": 1024, "validation": 2},
|
184 |
+
"hi-Latn": {"train": 16, "validation": 1},
|
185 |
+
"hmn": {"train": 8, "validation": 1},
|
186 |
+
"ht": {"train": 8, "validation": 1},
|
187 |
+
"hu": {"train": 1024, "validation": 2},
|
188 |
+
"hy": {"train": 128, "validation": 1},
|
189 |
+
"id": {"train": 1024, "validation": 4},
|
190 |
+
"ig": {"train": 4, "validation": 1},
|
191 |
+
"is": {"train": 128, "validation": 1},
|
192 |
+
"it": {"train": 1024, "validation": 8},
|
193 |
+
"iw": {"train": 1024, "validation": 1},
|
194 |
+
"ja": {"train": 1024, "validation": 8},
|
195 |
+
"ja-Latn": {"train": 8, "validation": 1},
|
196 |
+
"jv": {"train": 8, "validation": 1},
|
197 |
+
"ka": {"train": 256, "validation": 1},
|
198 |
+
"kk": {"train": 256, "validation": 1},
|
199 |
+
"km": {"train": 64, "validation": 1},
|
200 |
+
"kn": {"train": 64, "validation": 1},
|
201 |
+
"ko": {"train": 1024, "validation": 1},
|
202 |
+
"ku": {"train": 16, "validation": 1},
|
203 |
+
"ky": {"train": 64, "validation": 1},
|
204 |
+
"la": {"train": 64, "validation": 1},
|
205 |
+
"lb": {"train": 32, "validation": 1},
|
206 |
+
"lo": {"train": 8, "validation": 1},
|
207 |
+
"lt": {"train": 512, "validation": 1},
|
208 |
+
"lv": {"train": 256, "validation": 1},
|
209 |
+
"mg": {"train": 8, "validation": 1},
|
210 |
+
"mi": {"train": 4, "validation": 1},
|
211 |
+
"mk": {"train": 128, "validation": 1},
|
212 |
+
"ml": {"train": 128, "validation": 1},
|
213 |
+
"mn": {"train": 128, "validation": 1},
|
214 |
+
"mr": {"train": 1024, "validation": 1},
|
215 |
+
"ms": {"train": 512, "validation": 1},
|
216 |
+
"mt": {"train": 128, "validation": 1},
|
217 |
+
"my": {"train": 64, "validation": 1},
|
218 |
+
"ne": {"train": 256, "validation": 1},
|
219 |
+
"nl": {"train": 1024, "validation": 4},
|
220 |
+
"no": {"train": 1024, "validation": 1},
|
221 |
+
"ny": {"train": 4, "validation": 1},
|
222 |
+
"pa": {"train": 32, "validation": 1},
|
223 |
+
"pl": {"train": 1024, "validation": 4},
|
224 |
+
"ps": {"train": 16, "validation": 1},
|
225 |
+
"pt": {"train": 1024, "validation": 4},
|
226 |
+
"ro": {"train": 1024, "validation": 2},
|
227 |
+
"ru": {"train": 4096, "validation": 32},
|
228 |
+
"ru-Latn": {"train": 32, "validation": 1},
|
229 |
+
"sd": {"train": 64, "validation": 1},
|
230 |
+
"si": {"train": 64, "validation": 1},
|
231 |
+
"sk": {"train": 512, "validation": 1},
|
232 |
+
"sl": {"train": 256, "validation": 1},
|
233 |
+
"sm": {"train": 4, "validation": 1},
|
234 |
+
"sn": {"train": 8, "validation": 1},
|
235 |
+
"so": {"train": 64, "validation": 1},
|
236 |
+
"sq": {"train": 128, "validation": 1},
|
237 |
+
"sr": {"train": 256, "validation": 1},
|
238 |
+
"st": {"train": 2, "validation": 1},
|
239 |
+
"su": {"train": 4, "validation": 1},
|
240 |
+
"sv": {"train": 1024, "validation": 2},
|
241 |
+
"sw": {"train": 32, "validation": 1},
|
242 |
+
"ta": {"train": 256, "validation": 1},
|
243 |
+
"te": {"train": 128, "validation": 1},
|
244 |
+
"tg": {"train": 64, "validation": 1},
|
245 |
+
"th": {"train": 1024, "validation": 1},
|
246 |
+
"tr": {"train": 1024, "validation": 4},
|
247 |
+
"uk": {"train": 1024, "validation": 2},
|
248 |
+
"und": {"train": 3072, "validation": 32},
|
249 |
+
"ur": {"train": 128, "validation": 1},
|
250 |
+
"uz": {"train": 32, "validation": 1},
|
251 |
+
"vi": {"train": 1024, "validation": 4},
|
252 |
+
"xh": {"train": 2, "validation": 1},
|
253 |
+
"yi": {"train": 16, "validation": 1},
|
254 |
+
"yo": {"train": 2, "validation": 1},
|
255 |
+
"zh": {"train": 1024, "validation": 2},
|
256 |
+
"zh-Latn": {"train": 8, "validation": 1},
|
257 |
+
"zu": {"train": 8, "validation": 1},
|
258 |
+
}
|
259 |
+
|
260 |
+
|
261 |
+
class Mc4Config(datasets.BuilderConfig):
    """Builder configuration selecting which mC4 languages to load."""

    def __init__(self, *args, languages, **kwargs):
        """Create a configuration for one or more mC4 languages.

        Args:
            *args: positional arguments forwarded to the parent constructor.
            languages (:obj:`List[str]`): language codes to load; the config
                name is these codes joined with ``+``.
            **kwargs: keyword arguments forwarded to the parent constructor.
        """
        config_name = "+".join(languages)
        super().__init__(*args, name=config_name, **kwargs)
        self.languages = languages
|
276 |
+
|
277 |
+
|
278 |
+
class Mc4(datasets.GeneratorBasedBuilder):
|
279 |
+
"""mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
|
280 |
+
|
281 |
+
BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
|
282 |
+
BUILDER_CONFIG_CLASS = Mc4Config
|
283 |
+
|
284 |
+
def __init__(self, *args, writer_batch_size=None, **kwargs):
    """Configure optional perplexity-based sampling, then build the dataset builder.

    The sampling options are removed from ``kwargs`` before the remaining
    arguments are forwarded to the parent constructor:

    - ``sampling_method``: absent or falsy disables sampling; ``"gaussian"``
      selects ``_should_keep_doc_gaussian``; any other truthy value selects
      ``_should_keep_doc_step``.
    - ``perplexity_model``: value handed to ``kenlm.Model`` (required when
      sampling is enabled).
    - ``sampling_factor``: stored for use as the sampler's ``factor``
      (required when sampling is enabled).
    - ``boundaries``: stored for use as the sampler's ``boundaries``;
      optional, both samplers accept ``None``.

    Raises:
        KeyError: if sampling is enabled but ``perplexity_model`` or
            ``sampling_factor`` is missing.
    """
    # Sampling is opt-in: a missing "sampling_method" means "no sampling"
    # rather than a KeyError.
    self.sampling_method = kwargs.pop("sampling_method", None)
    if self.sampling_method:
        self.perplexity_model = kwargs.pop("perplexity_model")
        self.sampling_factor = kwargs.pop("sampling_factor")
        self.boundaries = kwargs.pop("boundaries", None)
        # Loading 5-gram model
        # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
        logger.info("loading model = %s", self.perplexity_model)
        self.pp_model = kenlm.Model(self.perplexity_model)
        if self.sampling_method == "gaussian":
            self.should_keep_doc = self._should_keep_doc_gaussian
        else:
            # Every non-gaussian method uses the step sampler (this branch
            # previously re-selected the gaussian sampler by mistake).
            self.should_keep_doc = self._should_keep_doc_step
    else:
        # Sampling-only options are still consumed here so that they are
        # not forwarded to the parent constructor.
        self.perplexity_model = kwargs.pop("perplexity_model", None)
        self.sampling_factor = kwargs.pop("sampling_factor", None)
        self.boundaries = kwargs.pop("boundaries", None)

    super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)


def get_perplexity(self, doc):
    """Return the perplexity of ``doc`` under ``self.pp_model``.

    The document is split on newlines; each line is scored with
    ``self.pp_model.score`` and the scores are summed. The length of a line
    is its whitespace-separated token count plus one (presumably for an
    end-of-sentence token -- confirm against the language model in use).

    Args:
        doc (str): document text.

    Returns:
        float: ``10 ** (-total_score / total_length)``.
    """
    lines = doc.split("\n")
    doc_log_score = sum(self.pp_model.score(line) for line in lines)
    doc_length = sum(len(line.split()) + 1 for line in lines)
    return 10.0 ** (-doc_log_score / doc_length)


def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
    """Randomly decide whether to keep ``doc`` using a stepwise probability.

    The perplexity of ``doc`` is placed into one of four buckets delimited
    by the three ``boundaries`` values; the keep probability is ``factor``
    divided by the width of that bucket (the last, open-ended bucket uses
    ``100 * boundaries[2]`` as its width).

    Args:
        doc (str): document text.
        factor (float): numerator of the keep probability.
        boundaries (Optional[Sequence[float]]): three increasing perplexity
            thresholds; built-in defaults are used when ``None``.

    Returns:
        bool: True if the document should be kept.
    """
    perplexity = self.get_perplexity(doc)
    if boundaries is None:
        boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
    # The chain is exhaustive: a perplexity exactly equal to boundaries[1]
    # (or NaN) previously matched no branch and left quartile_range unbound.
    if perplexity <= boundaries[0]:
        quartile_range = boundaries[0]
    elif perplexity <= boundaries[1]:
        quartile_range = boundaries[1] - boundaries[0]
    elif perplexity < boundaries[2]:
        quartile_range = boundaries[2] - boundaries[1]
    else:
        quartile_range = 100 * boundaries[2]
    probability = factor / quartile_range
    # np.random is a module and cannot be called; draw from U[0, 1) instead.
    return np.random.uniform() < probability
|
325 |
+
|
326 |
+
def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
    """Randomly decide whether to keep ``doc`` using a bell-shaped probability.

    The keep probability is ``factor * exp(-9/2 * ((p - m) / m) ** 2)``,
    where ``p`` is the perplexity of ``doc`` and ``m`` is ``boundaries[1]``
    (or a built-in default when ``boundaries`` is ``None``), so it peaks at
    ``factor`` when ``p == m``.

    Args:
        doc (str): document text.
        factor (float): peak keep probability.
        boundaries (Optional[Sequence[float]]): only index 1 is read.

    Returns:
        bool: True if the document should be kept.
    """
    score = self.get_perplexity(doc)
    center = 662247.50212365 if boundaries is None else boundaries[1]
    relative_offset = (score - center) / center
    keep_probability = factor * np.exp(-9 / 2 * relative_offset ** 2)
    return np.random.uniform() < keep_probability
|
334 |
+
|
335 |
+
def _info(self):
    """Describe the dataset: three string columns plus homepage and citation."""
    string_columns = ("text", "timestamp", "url")
    features = datasets.Features(
        {column: datasets.Value("string") for column in string_columns}
    )
    return datasets.DatasetInfo(
        description=_DESCRIPTION,
        features=features,
        supervised_keys=None,
        homepage=_URL,
        citation=_CITATION,
    )
|
349 |
+
|
350 |
+
def _split_generators(self, dl_manager):
    """Build the shard URLs for every configured language and download them.

    For each split, one URL is produced per (language, shard index) pair,
    using the shard counts in ``_N_SHARDS_PER_SPLIT``.

    Args:
        dl_manager: download manager; its ``download`` method is called once
            per split with the list of shard URLs.

    Returns:
        list: a train and a validation ``datasets.SplitGenerator``, each
        carrying the downloaded paths under the ``filepaths`` keyword.
    """
    split_suffixes = {"train": "", "validation": "-validation"}
    data_urls = {
        split: [
            _DATA_URL.format(
                # Use the per-shard language code: the config name is the
                # "+"-joined list of languages and only matches a real file
                # name when a single language is configured.
                language=lang,
                split_suffix=suffix,
                index=index,
                n_shards=_N_SHARDS_PER_SPLIT[lang][split],
            )
            for lang in self.config.languages
            for index in range(_N_SHARDS_PER_SPLIT[lang][split])
        ]
        for split, suffix in split_suffixes.items()
    }
    train_downloaded_files = dl_manager.download(data_urls["train"])
    validation_downloaded_files = dl_manager.download(data_urls["validation"])
    return [
        datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
        datasets.SplitGenerator(
            name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
        ),
    ]
|
371 |
+
|
372 |
+
def _generate_examples(self, filepaths):
    """Yield ``(id, example)`` pairs from gzip-compressed JSON-lines files.

    Every non-empty line of every file is parsed with ``json.loads``. When
    ``self.sampling_method`` is truthy, an example is only yielded if
    ``self.should_keep_doc`` accepts its ``"text"`` field (called with
    ``self.sampling_factor`` and ``self.boundaries``); otherwise every
    example is yielded. Ids are consecutive integers across all files and
    count only the examples actually yielded.

    Args:
        filepaths: iterable of paths to the gzip files to read.
    """
    id_ = 0
    for filepath in filepaths:
        logger.info("generating examples from = %s", filepath)
        # gzip does not close a file object that was handed to it, so the raw
        # handle is managed explicitly to avoid leaking one descriptor per file.
        with open(filepath, "rb") as raw, gzip.open(raw, "rt", encoding="utf-8") as f:
            sampling = bool(self.sampling_method)
            if sampling:
                logger.info("sampling method = %s", self.sampling_method)
            for line in f:
                if not line:
                    continue
                example = json.loads(line)
                if sampling and not self.should_keep_doc(
                    example["text"],
                    factor=self.sampling_factor,
                    boundaries=self.boundaries,
                ):
                    # Rejected by the sampler: skip without consuming an id.
                    continue
                yield id_, example
                id_ += 1
|
run_mlm_flax.py
CHANGED
@@ -456,8 +456,6 @@ if __name__ == "__main__":
|
|
456 |
has_tensorboard = is_tensorboard_available()
|
457 |
if has_tensorboard and jax.process_index() == 0:
|
458 |
try:
|
459 |
-
from flax.metrics.tensorboard import SummaryWriter
|
460 |
-
summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
|
461 |
# Enable Weight&Biases
|
462 |
import wandb
|
463 |
wandb.init(
|
@@ -468,6 +466,8 @@ if __name__ == "__main__":
|
|
468 |
wandb.config.update(training_args)
|
469 |
wandb.config.update(model_args)
|
470 |
wandb.config.update(data_args)
|
|
|
|
|
471 |
except ImportError as ie:
|
472 |
has_tensorboard = False
|
473 |
logger.warning(
|
|
|
456 |
has_tensorboard = is_tensorboard_available()
|
457 |
if has_tensorboard and jax.process_index() == 0:
|
458 |
try:
|
|
|
|
|
459 |
# Enable Weight&Biases
|
460 |
import wandb
|
461 |
wandb.init(
|
|
|
466 |
wandb.config.update(training_args)
|
467 |
wandb.config.update(model_args)
|
468 |
wandb.config.update(data_args)
|
469 |
+
from flax.metrics.tensorboard import SummaryWriter
|
470 |
+
summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
|
471 |
except ImportError as ie:
|
472 |
has_tensorboard = False
|
473 |
logger.warning(
|
run_mlm_flax_stream.py
CHANGED
@@ -272,12 +272,12 @@ class SamplingArguments:
|
|
272 |
sampling_factor: Optional[int] = field(
|
273 |
default=1, metadata={"help": "Sampling factor. Integers for step function, decimals for gaussian."}
|
274 |
)
|
275 |
-
|
276 |
default="536394.99320948,662247.50212365,919250.87225178", metadata={"help": "Quartile boundaries"}
|
277 |
)
|
278 |
|
279 |
def __post_init__(self):
|
280 |
-
self.
|
281 |
|
282 |
|
283 |
def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
|
@@ -393,6 +393,10 @@ if __name__ == "__main__":
|
|
393 |
cache_dir=model_args.cache_dir,
|
394 |
streaming=True,
|
395 |
split="train",
|
|
|
|
|
|
|
|
|
396 |
)
|
397 |
|
398 |
if model_args.config_name:
|
@@ -417,67 +421,14 @@ if __name__ == "__main__":
|
|
417 |
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
418 |
)
|
419 |
|
420 |
-
#
|
421 |
-
#
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
log_score = pp_model.score(line)
|
429 |
-
length = len(line.split()) + 1
|
430 |
-
doc_log_score += log_score
|
431 |
-
doc_length += length
|
432 |
-
return 10.0 ** (-doc_log_score / doc_length)
|
433 |
-
|
434 |
-
def should_keep_doc_step(doc, factor=1, boundaires=None):
|
435 |
-
perplexity = get_perplexity(doc)
|
436 |
-
if boundaires is None:
|
437 |
-
boundaires = [536394.99320948, 662247.50212365, 919250.87225178]
|
438 |
-
if perplexity <= boundaires[0]:
|
439 |
-
quartile_range = boundaires[0]
|
440 |
-
elif boundaires[0] < perplexity < boundaires[1]:
|
441 |
-
quartile_range = boundaires[1] - boundaires[0]
|
442 |
-
elif boundaires[1] < perplexity < boundaires[2]:
|
443 |
-
quartile_range = boundaires[2] - boundaires[1]
|
444 |
-
elif perplexity >= boundaires[2]:
|
445 |
-
quartile_range = 100 * boundaires[2]
|
446 |
-
probability = factor / quartile_range
|
447 |
-
return np.random() < probability
|
448 |
-
|
449 |
-
def should_keep_doc_gaussian(doc, factor=0.4, boundaires=None):
|
450 |
-
perplexity = get_perplexity(doc)
|
451 |
-
if boundaires is not None:
|
452 |
-
m = boundaires[1]
|
453 |
-
else:
|
454 |
-
m = 662247.50212365
|
455 |
-
weighted_perplexity = factor*np.exp(-9/2*((perplexity-m)/m)**2)
|
456 |
-
return np.random.uniform() < weighted_perplexity
|
457 |
-
|
458 |
-
if sampling_args.sampling_method == "gaussian":
|
459 |
-
should_keep_doc = should_keep_doc_gaussian
|
460 |
-
else:
|
461 |
-
should_keep_doc = should_keep_doc_gaussian
|
462 |
-
|
463 |
-
def tokenize_function(examples):
|
464 |
-
return tokenizer([
|
465 |
-
example for example in examples[data_args.text_column_name]
|
466 |
-
if should_keep_doc(
|
467 |
-
example,
|
468 |
-
factor=sampling_args.sampling_factor,
|
469 |
-
boundaries=sampling_args.boundaries
|
470 |
-
)
|
471 |
-
], return_special_tokens_mask=True)
|
472 |
-
else:
|
473 |
-
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
474 |
-
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
475 |
-
# efficient when it receives the `special_tokens_mask`.
|
476 |
-
def tokenize_function(examples):
|
477 |
-
return tokenizer(
|
478 |
-
examples[data_args.text_column_name],
|
479 |
-
return_special_tokens_mask=True
|
480 |
-
)
|
481 |
|
482 |
tokenized_datasets = dataset.map(
|
483 |
tokenize_function,
|
|
|
272 |
sampling_factor: Optional[int] = field(
|
273 |
default=1, metadata={"help": "Sampling factor. Integers for step function, decimals for gaussian."}
|
274 |
)
|
275 |
+
boundaries: Optional[str] = field(
|
276 |
default="536394.99320948,662247.50212365,919250.87225178", metadata={"help": "Quartile boundaries"}
|
277 |
)
|
278 |
|
279 |
def __post_init__(self):
|
280 |
+
self.boundaries = [float(q) for q in self.boundaries.split(",")]
|
281 |
|
282 |
|
283 |
def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
|
|
|
393 |
cache_dir=model_args.cache_dir,
|
394 |
streaming=True,
|
395 |
split="train",
|
396 |
+
sampling_method=sampling_args.sampling_method,
|
397 |
+
sampling_factor=sampling_args.sampling_factor,
|
398 |
+
boundaries=sampling_args.boundaries,
|
399 |
+
perplexity_model=sampling_args.perplexity_model,
|
400 |
)
|
401 |
|
402 |
if model_args.config_name:
|
|
|
421 |
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
422 |
)
|
423 |
|
424 |
+
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
425 |
+
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
426 |
+
# efficient when it receives the `special_tokens_mask`.
|
427 |
+
def tokenize_function(examples):
    """Tokenize the configured text column of ``examples``, requesting the special-tokens mask."""
    texts = examples[data_args.text_column_name]
    return tokenizer(texts, return_special_tokens_mask=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
432 |
|
433 |
tokenized_datasets = dataset.map(
|
434 |
tokenize_function,
|
run_stream.sh
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
# From https://arxiv.org/pdf/1907.11692.pdf for base model
|
2 |
python -c "import jax; print('TPUs', jax.device_count())"
|
3 |
-
./run_mlm_flax_stream.py \
|
4 |
-
--output_dir="./" \
|
5 |
--model_type="roberta" \
|
6 |
--config_name="./config-base.json" \
|
7 |
--tokenizer_name="./" \
|
8 |
-
--dataset_name="mc4" \
|
9 |
--dataset_config_name="es" \
|
10 |
--max_seq_length="128" \
|
11 |
--pad_to_max_length \
|
@@ -24,4 +24,5 @@ python -c "import jax; print('TPUs', jax.device_count())"
|
|
24 |
--num_train_steps="500000" \
|
25 |
--eval_steps="1000" \
|
26 |
--dtype="bfloat16" \
|
|
|
27 |
--logging_steps="500" 2>&1 | tee run_stream.log
|
|
|
1 |
# From https://arxiv.org/pdf/1907.11692.pdf for base model
|
2 |
python -c "import jax; print('TPUs', jax.device_count())"
|
3 |
+
python ./run_mlm_flax_stream.py \
|
4 |
+
--output_dir="./outputs" \
|
5 |
--model_type="roberta" \
|
6 |
--config_name="./config-base.json" \
|
7 |
--tokenizer_name="./" \
|
8 |
+
--dataset_name="./mc4" \
|
9 |
--dataset_config_name="es" \
|
10 |
--max_seq_length="128" \
|
11 |
--pad_to_max_length \
|
|
|
24 |
--num_train_steps="500000" \
|
25 |
--eval_steps="1000" \
|
26 |
--dtype="bfloat16" \
|
27 |
+
--sampling_method="steps" \
|
28 |
--logging_steps="500" 2>&1 | tee run_stream.log
|