versae committed on
Commit
c725c08
1 Parent(s): 8045dd9

Step... (240001/250000 | Loss: 2.1932833194732666, Acc: 0.5893170833587646): 4%|▉ | 10063/250000 [3:28:31<88:33:32, 1.33s/it]

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .gitignore +4 -0
  3. config.json +25 -0
  4. configs/base/config.json +25 -0
  5. configs/base/tokenizer.json +0 -0
  6. configs/large/config.json +25 -0
  7. configs/large/tokenizer.json +0 -0
  8. convert.py +29 -0
  9. flax_model.msgpack +3 -0
  10. mc4/README.md +525 -0
  11. mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
  12. mc4/mc4.py +426 -0
  13. mc4/mc4.py.lock +0 -0
  14. merges.txt +0 -0
  15. outputs/checkpoints/checkpoint-236000/config.json +25 -0
  16. outputs/checkpoints/checkpoint-236000/data_collator.joblib +3 -0
  17. outputs/checkpoints/checkpoint-236000/flax_model.msgpack +3 -0
  18. outputs/checkpoints/checkpoint-236000/optimizer_state.msgpack +3 -0
  19. outputs/checkpoints/checkpoint-236000/training_args.joblib +3 -0
  20. outputs/checkpoints/checkpoint-236000/training_state.json +1 -0
  21. outputs/checkpoints/checkpoint-237000/config.json +25 -0
  22. outputs/checkpoints/checkpoint-237000/data_collator.joblib +3 -0
  23. outputs/checkpoints/checkpoint-237000/flax_model.msgpack +3 -0
  24. outputs/checkpoints/checkpoint-237000/optimizer_state.msgpack +3 -0
  25. outputs/checkpoints/checkpoint-237000/training_args.joblib +3 -0
  26. outputs/checkpoints/checkpoint-237000/training_state.json +1 -0
  27. outputs/checkpoints/checkpoint-238000/config.json +25 -0
  28. outputs/checkpoints/checkpoint-238000/data_collator.joblib +3 -0
  29. outputs/checkpoints/checkpoint-238000/flax_model.msgpack +3 -0
  30. outputs/checkpoints/checkpoint-238000/optimizer_state.msgpack +3 -0
  31. outputs/checkpoints/checkpoint-238000/training_args.joblib +3 -0
  32. outputs/checkpoints/checkpoint-238000/training_state.json +1 -0
  33. outputs/checkpoints/checkpoint-239000/config.json +25 -0
  34. outputs/checkpoints/checkpoint-239000/data_collator.joblib +3 -0
  35. outputs/checkpoints/checkpoint-239000/flax_model.msgpack +3 -0
  36. outputs/checkpoints/checkpoint-239000/optimizer_state.msgpack +3 -0
  37. outputs/checkpoints/checkpoint-239000/training_args.joblib +3 -0
  38. outputs/checkpoints/checkpoint-239000/training_state.json +1 -0
  39. outputs/checkpoints/checkpoint-240000/config.json +25 -0
  40. outputs/checkpoints/checkpoint-240000/data_collator.joblib +3 -0
  41. outputs/checkpoints/checkpoint-240000/flax_model.msgpack +3 -0
  42. outputs/checkpoints/checkpoint-240000/optimizer_state.msgpack +3 -0
  43. outputs/checkpoints/checkpoint-240000/training_args.joblib +3 -0
  44. outputs/checkpoints/checkpoint-240000/training_state.json +1 -0
  45. outputs/config.json +25 -0
  46. outputs/data_collator.joblib +3 -0
  47. outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 +3 -0
  48. outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 +3 -0
  49. outputs/events.out.tfevents.1626535665.tablespoon.2656403.3.v2 +3 -0
  50. outputs/events.out.tfevents.1626537915.tablespoon.2714825.3.v2 +3 -0
.gitattributes CHANGED
@@ -15,3 +15,5 @@
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
15
  *.pt filter=lfs diff=lfs merge=lfs -text
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
+ *.log filter=lfs diff=lfs merge=lfs -text
19
+ *.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ #run*.log
2
+ debug*.log
3
+ run*.wandb
4
+ wandb/
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
configs/base/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
configs/base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
configs/large/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
configs/large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
convert.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ import tempfile
3
+
4
+ import jax
5
+ from jax import numpy as jnp
6
+ from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
7
+
8
+
9
def to_f32(t):
    """Return pytree ``t`` with every bfloat16 leaf cast to float32.

    Leaves of any other dtype are returned unchanged.

    Args:
        t: a JAX pytree (e.g. a nested dict of arrays) whose leaves expose
            ``dtype`` and ``astype``.

    Returns:
        A pytree with the same structure as ``t``.
    """

    def _upcast(leaf):
        return leaf.astype(jnp.float32) if leaf.dtype == jnp.bfloat16 else leaf

    # ``jax.tree_map`` was a deprecated alias and is gone from newer JAX
    # releases; ``jax.tree_util.tree_map`` works on old and new versions alike.
    return jax.tree_util.tree_map(_upcast, t)
11
+
12
+
13
def main():
    """Convert the Flax checkpoint in the current directory to PyTorch.

    Steps:
      1. Reload and re-save the tokenizer in ``./`` so the extra tokenizer
         files are written next to ``config.json`` / ``tokenizer.json``.
      2. Load the Flax model from ``./``, cast bfloat16 parameters to float32
         and save that float32 copy to a temporary directory.
      3. Load the temporary float32 Flax checkpoint as a PyTorch model and
         save it to ``./`` without overwriting the existing config.
    """
    # Saving extra files from config.json and tokenizer.json files
    tokenizer = AutoTokenizer.from_pretrained("./")
    tokenizer.save_pretrained("./")

    flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
    flax_model.params = to_f32(flax_model.params)

    # The float32 copy is only an intermediate artifact: use a context manager
    # so the directory (a full copy of the weights) is deleted afterwards
    # instead of being leaked as with a bare ``mkdtemp()``.
    with tempfile.TemporaryDirectory() as tmp:
        flax_model.save_pretrained(tmp)
        # Converting float32 Flax to PyTorch
        model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)

    model.save_pretrained("./", save_config=False)
26
+
27
+
28
+ if __name__ == "__main__":
29
+ main()
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ae5937c808e6e457c600f1da0eb5f8f38f6a3137b2d59828bc675b0103214ca
3
+ size 249750019
mc4/README.md ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pretty_name: mC4
3
+ annotations_creators:
4
+ - no-annotation
5
+ language_creators:
6
+ - found
7
+ languages:
8
+ - af
9
+ - am
10
+ - ar
11
+ - az
12
+ - be
13
+ - bg
14
+ - bg-Latn
15
+ - bn
16
+ - ca
17
+ - ceb
18
+ - co
19
+ - cs
20
+ - cy
21
+ - da
22
+ - de
23
+ - el
24
+ - el-Latn
25
+ - en
26
+ - eo
27
+ - es
28
+ - et
29
+ - eu
30
+ - fa
31
+ - fi
32
+ - fil
33
+ - fr
34
+ - fy
35
+ - ga
36
+ - gd
37
+ - gl
38
+ - gu
39
+ - ha
40
+ - haw
41
+ - hi
42
+ - hi-Latn
43
+ - hmn
44
+ - ht
45
+ - hu
46
+ - hy
47
+ - id
48
+ - ig
49
+ - is
50
+ - it
51
+ - iw
52
+ - ja
53
+ - ja-Latn
54
+ - jv
55
+ - ka
56
+ - kk
57
+ - km
58
+ - kn
59
+ - ko
60
+ - ku
61
+ - ky
62
+ - la
63
+ - lb
64
+ - lo
65
+ - lt
66
+ - lv
67
+ - mg
68
+ - mi
69
+ - mk
70
+ - ml
71
+ - mn
72
+ - mr
73
+ - ms
74
+ - mt
75
+ - my
76
+ - ne
77
+ - nl
78
+ - "no"
79
+ - ny
80
+ - pa
81
+ - pl
82
+ - ps
83
+ - pt
84
+ - ro
85
+ - ru
86
+ - ru-Latn
87
+ - sd
88
+ - si
89
+ - sk
90
+ - sl
91
+ - sm
92
+ - sn
93
+ - so
94
+ - sq
95
+ - sr
96
+ - st
97
+ - su
98
+ - sv
99
+ - sw
100
+ - ta
101
+ - te
102
+ - tg
103
+ - th
104
+ - tr
105
+ - uk
106
+ - und
107
+ - ur
108
+ - uz
109
+ - vi
110
+ - xh
111
+ - yi
112
+ - yo
113
+ - zh
114
+ - zh-Latn
115
+ - zu
116
+ licenses:
117
+ - odc-by-1.0
118
+ multilinguality:
119
+ - multilingual
120
+ size_categories:
121
+ - n<1K
122
+ - 1K<n<10K
123
+ - 10K<n<100K
124
+ - 100K<n<1M
125
+ - 1M<n<10M
126
+ - 10M<n<100M
127
+ - 100M<n<1B
128
+ - 1B<n<10B
129
+ source_datasets:
130
+ - original
131
+ task_categories:
132
+ - sequence-modeling
133
+ task_ids:
134
+ - language-modeling
135
+ paperswithcode_id: mc4
136
+ ---
137
+
138
+ # Dataset Card for mC4
139
+
140
+ ## Table of Contents
141
+
142
+ - [Dataset Card for mC4](#dataset-card-for-mc4)
143
+ - [Table of Contents](#table-of-contents)
144
+ - [Dataset Description](#dataset-description)
145
+ - [Dataset Summary](#dataset-summary)
146
+ - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
147
+ - [Languages](#languages)
148
+ - [Dataset Structure](#dataset-structure)
149
+ - [Data Instances](#data-instances)
150
+ - [Data Fields](#data-fields)
151
+ - [Data Splits](#data-splits)
152
+ - [Dataset Creation](#dataset-creation)
153
+ - [Curation Rationale](#curation-rationale)
154
+ - [Source Data](#source-data)
155
+ - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
156
+ - [Who are the source language producers?](#who-are-the-source-language-producers)
157
+ - [Annotations](#annotations)
158
+ - [Annotation process](#annotation-process)
159
+ - [Who are the annotators?](#who-are-the-annotators)
160
+ - [Personal and Sensitive Information](#personal-and-sensitive-information)
161
+ - [Considerations for Using the Data](#considerations-for-using-the-data)
162
+ - [Social Impact of Dataset](#social-impact-of-dataset)
163
+ - [Discussion of Biases](#discussion-of-biases)
164
+ - [Other Known Limitations](#other-known-limitations)
165
+ - [Additional Information](#additional-information)
166
+ - [Dataset Curators](#dataset-curators)
167
+ - [Licensing Information](#licensing-information)
168
+ - [Citation Information](#citation-information)
169
+ - [Contributions](#contributions)
170
+
171
+ ## Dataset Description
172
+
173
+ - **Homepage:** https://huggingface.co/datasets/allenai/c4
174
+ - **Paper:** https://arxiv.org/abs/1910.10683
175
+
176
+ ### Dataset Summary
177
+
178
+ A colossal, multilingual, cleaned version of Common Crawl's web crawl corpus, based on the Common Crawl dataset: "https://commoncrawl.org".
179
+
180
+ This is the version prepared by AllenAI, hosted at this address: https://huggingface.co/datasets/allenai/c4
181
+
182
+ 108 languages are available and are reported in the table below.
183
+
184
+ Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
185
+
186
+ | language code | language name |
187
+ |:----------------|:---------------------|
188
+ | af | Afrikaans |
189
+ | am | Amharic |
190
+ | ar | Arabic |
191
+ | az | Azerbaijani |
192
+ | be | Belarusian |
193
+ | bg | Bulgarian |
194
+ | bg-Latn | Bulgarian (Latin) |
195
+ | bn | Bangla |
196
+ | ca | Catalan |
197
+ | ceb | Cebuano |
198
+ | co | Corsican |
199
+ | cs | Czech |
200
+ | cy | Welsh |
201
+ | da | Danish |
202
+ | de | German |
203
+ | el | Greek |
204
+ | el-Latn | Greek (Latin) |
205
+ | en | English |
206
+ | eo | Esperanto |
207
+ | es | Spanish |
208
+ | et | Estonian |
209
+ | eu | Basque |
210
+ | fa | Persian |
211
+ | fi | Finnish |
212
+ | fil | Filipino |
213
+ | fr | French |
214
+ | fy | Western Frisian |
215
+ | ga | Irish |
216
+ | gd | Scottish Gaelic |
217
+ | gl | Galician |
218
+ | gu | Gujarati |
219
+ | ha | Hausa |
220
+ | haw | Hawaiian |
221
+ | hi | Hindi |
222
+ | hi-Latn | Hindi (Latin script) |
223
+ | hmn | Hmong, Mong |
224
+ | ht | Haitian |
225
+ | hu | Hungarian |
226
+ | hy | Armenian |
227
+ | id | Indonesian |
228
+ | ig | Igbo |
229
+ | is | Icelandic |
230
+ | it | Italian |
231
+ | iw | former Hebrew |
232
+ | ja | Japanese |
233
+ | ja-Latn | Japanese (Latin) |
234
+ | jv | Javanese |
235
+ | ka | Georgian |
236
+ | kk | Kazakh |
237
+ | km | Khmer |
238
+ | kn | Kannada |
239
+ | ko | Korean |
240
+ | ku | Kurdish |
241
+ | ky | Kyrgyz |
242
+ | la | Latin |
243
+ | lb | Luxembourgish |
244
+ | lo | Lao |
245
+ | lt | Lithuanian |
246
+ | lv | Latvian |
247
+ | mg | Malagasy |
248
+ | mi | Maori |
249
+ | mk | Macedonian |
250
+ | ml | Malayalam |
251
+ | mn | Mongolian |
252
+ | mr | Marathi |
253
+ | ms | Malay |
254
+ | mt | Maltese |
255
+ | my | Burmese |
256
+ | ne | Nepali |
257
+ | nl | Dutch |
258
+ | no | Norwegian |
259
+ | ny | Nyanja |
260
+ | pa | Punjabi |
261
+ | pl | Polish |
262
+ | ps | Pashto |
263
+ | pt | Portuguese |
264
+ | ro | Romanian |
265
+ | ru | Russian |
266
+ | ru-Latn | Russian (Latin) |
267
+ | sd | Sindhi |
268
+ | si | Sinhala |
269
+ | sk | Slovak |
270
+ | sl | Slovenian |
271
+ | sm | Samoan |
272
+ | sn | Shona |
273
+ | so | Somali |
274
+ | sq | Albanian |
275
+ | sr | Serbian |
276
+ | st | Southern Sotho |
277
+ | su | Sundanese |
278
+ | sv | Swedish |
279
+ | sw | Swahili |
280
+ | ta | Tamil |
281
+ | te | Telugu |
282
+ | tg | Tajik |
283
+ | th | Thai |
284
+ | tr | Turkish |
285
+ | uk | Ukrainian |
286
+ | und | Unknown language |
287
+ | ur | Urdu |
288
+ | uz | Uzbek |
289
+ | vi | Vietnamese |
290
+ | xh | Xhosa |
291
+ | yi | Yiddish |
292
+ | yo | Yoruba |
293
+ | zh | Chinese |
294
+ | zh-Latn | Chinese (Latin) |
295
+ | zu | Zulu |
296
+
297
+ You can load the mC4 subset of any language like this:
298
+
299
+ ```python
300
+ from datasets import load_dataset
301
+
302
+ en_mc4 = load_dataset("mc4", "en")
303
+ ```
304
+
305
+ You can even specify a list of languages:
306
+
307
+ ```python
308
+ from datasets import load_dataset
309
+
310
+ mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
311
+ ```
312
+
313
+ ### Supported Tasks and Leaderboards
314
+
315
+ mC4 is mainly intended to pretrain language models and word representations.
316
+
317
+ ### Languages
318
+
319
+ The dataset supports 108 languages.
320
+
321
+ ## Dataset Structure
322
+
323
+ ### Data Instances
324
+
325
+ An example from the `en` config is:
326
+
327
+ ```
328
+ {'timestamp': '2018-06-24T01:32:39Z',
329
+ 'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
330
+ 'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
331
+ ```
332
+
333
+ ### Data Fields
334
+
335
+ The data have several fields:
336
+
337
+ - `url`: url of the source as a string
338
+ - `text`: text content as a string
339
+ - `timestamp`: timestamp as a string
340
+
341
+ ### Data Splits
342
+
343
+ To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
344
+
345
+ | config | train | validation |
346
+ |:---------|:--------|:-------------|
347
+ | af | ? | ? |
348
+ | am | ? | ? |
349
+ | ar | ? | ? |
350
+ | az | ? | ? |
351
+ | be | ? | ? |
352
+ | bg | ? | ? |
353
+ | bg-Latn | ? | ? |
354
+ | bn | ? | ? |
355
+ | ca | ? | ? |
356
+ | ceb | ? | ? |
357
+ | co | ? | ? |
358
+ | cs | ? | ? |
359
+ | cy | ? | ? |
360
+ | da | ? | ? |
361
+ | de | ? | ? |
362
+ | el | ? | ? |
363
+ | el-Latn | ? | ? |
364
+ | en | ? | ? |
365
+ | eo | ? | ? |
366
+ | es | ? | ? |
367
+ | et | ? | ? |
368
+ | eu | ? | ? |
369
+ | fa | ? | ? |
370
+ | fi | ? | ? |
371
+ | fil | ? | ? |
372
+ | fr | ? | ? |
373
+ | fy | ? | ? |
374
+ | ga | ? | ? |
375
+ | gd | ? | ? |
376
+ | gl | ? | ? |
377
+ | gu | ? | ? |
378
+ | ha | ? | ? |
379
+ | haw | ? | ? |
380
+ | hi | ? | ? |
381
+ | hi-Latn | ? | ? |
382
+ | hmn | ? | ? |
383
+ | ht | ? | ? |
384
+ | hu | ? | ? |
385
+ | hy | ? | ? |
386
+ | id | ? | ? |
387
+ | ig | ? | ? |
388
+ | is | ? | ? |
389
+ | it | ? | ? |
390
+ | iw | ? | ? |
391
+ | ja | ? | ? |
392
+ | ja-Latn | ? | ? |
393
+ | jv | ? | ? |
394
+ | ka | ? | ? |
395
+ | kk | ? | ? |
396
+ | km | ? | ? |
397
+ | kn | ? | ? |
398
+ | ko | ? | ? |
399
+ | ku | ? | ? |
400
+ | ky | ? | ? |
401
+ | la | ? | ? |
402
+ | lb | ? | ? |
403
+ | lo | ? | ? |
404
+ | lt | ? | ? |
405
+ | lv | ? | ? |
406
+ | mg | ? | ? |
407
+ | mi | ? | ? |
408
+ | mk | ? | ? |
409
+ | ml | ? | ? |
410
+ | mn | ? | ? |
411
+ | mr | ? | ? |
412
+ | ms | ? | ? |
413
+ | mt | ? | ? |
414
+ | my | ? | ? |
415
+ | ne | ? | ? |
416
+ | nl | ? | ? |
417
+ | no | ? | ? |
418
+ | ny | ? | ? |
419
+ | pa | ? | ? |
420
+ | pl | ? | ? |
421
+ | ps | ? | ? |
422
+ | pt | ? | ? |
423
+ | ro | ? | ? |
424
+ | ru | ? | ? |
425
+ | ru-Latn | ? | ? |
426
+ | sd | ? | ? |
427
+ | si | ? | ? |
428
+ | sk | ? | ? |
429
+ | sl | ? | ? |
430
+ | sm | ? | ? |
431
+ | sn | ? | ? |
432
+ | so | ? | ? |
433
+ | sq | ? | ? |
434
+ | sr | ? | ? |
435
+ | st | ? | ? |
436
+ | su | ? | ? |
437
+ | sv | ? | ? |
438
+ | sw | ? | ? |
439
+ | ta | ? | ? |
440
+ | te | ? | ? |
441
+ | tg | ? | ? |
442
+ | th | ? | ? |
443
+ | tr | ? | ? |
444
+ | uk | ? | ? |
445
+ | und | ? | ? |
446
+ | ur | ? | ? |
447
+ | uz | ? | ? |
448
+ | vi | ? | ? |
449
+ | xh | ? | ? |
450
+ | yi | ? | ? |
451
+ | yo | ? | ? |
452
+ | zh | ? | ? |
453
+ | zh-Latn | ? | ? |
454
+ | zu | ? | ? |
455
+
456
+ ## Dataset Creation
457
+
458
+ ### Curation Rationale
459
+
460
+ [More Information Needed]
461
+
462
+ ### Source Data
463
+
464
+ #### Initial Data Collection and Normalization
465
+
466
+ [More Information Needed]
467
+
468
+ #### Who are the source language producers?
469
+
470
+ [More Information Needed]
471
+
472
+ ### Annotations
473
+
474
+ #### Annotation process
475
+
476
+ [More Information Needed]
477
+
478
+ #### Who are the annotators?
479
+
480
+ [More Information Needed]
481
+
482
+ ### Personal and Sensitive Information
483
+
484
+ [More Information Needed]
485
+
486
+ ## Considerations for Using the Data
487
+
488
+ ### Social Impact of Dataset
489
+
490
+ [More Information Needed]
491
+
492
+ ### Discussion of Biases
493
+
494
+ [More Information Needed]
495
+
496
+ ### Other Known Limitations
497
+
498
+ [More Information Needed]
499
+
500
+ ## Additional Information
501
+
502
+ ### Dataset Curators
503
+
504
+ [More Information Needed]
505
+
506
+ ### Licensing Information
507
+
508
+ AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
509
+
510
+ ### Citation Information
511
+
512
+ ```
513
+ @article{2019t5,
514
+ author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
515
+ title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
516
+ journal = {arXiv e-prints},
517
+ year = {2019},
518
+ archivePrefix = {arXiv},
519
+ eprint = {1910.10683},
520
+ }
521
+ ```
522
+
523
+ ### Contributions
524
+
525
+ Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
mc4/dummy/af/0.0.0/dummy_data.zip ADDED
Binary file (8.54 kB). View file
 
mc4/mc4.py ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """mC4 dataset based on Common Crawl."""
2
+
3
+
4
+ import gzip
5
+ import json
6
+
7
+ import datasets
8
+ import kenlm
9
+ import numpy as np
10
+ from numpy.random import default_rng
11
+
12
+
13
+ logger = datasets.logging.get_logger(__name__)
14
+
15
+
16
+ _DESCRIPTION = """\
17
+ A colossal, cleaned version of Common Crawl's web crawl corpus.
18
+
19
+ Based on Common Crawl dataset: "https://commoncrawl.org".
20
+
21
+ This is the processed version of Google's mC4 dataset by AllenAI.
22
+ """
23
+
24
+ _CITATION = """
25
+ @article{2019t5,
26
+ author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
27
+ title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
28
+ journal = {arXiv e-prints},
29
+ year = {2019},
30
+ archivePrefix = {arXiv},
31
+ eprint = {1910.10683},
32
+ }
33
+ """
34
+
35
+ _URL = "https://github.com/allenai/allennlp/discussions/5056"
36
+
37
+ _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
38
+
39
+ _LANGUAGES = [
40
+ "af",
41
+ "am",
42
+ "ar",
43
+ "az",
44
+ "be",
45
+ "bg",
46
+ "bg-Latn",
47
+ "bn",
48
+ "ca",
49
+ "ceb",
50
+ "co",
51
+ "cs",
52
+ "cy",
53
+ "da",
54
+ "de",
55
+ "el",
56
+ "el-Latn",
57
+ "en",
58
+ "eo",
59
+ "es",
60
+ "et",
61
+ "eu",
62
+ "fa",
63
+ "fi",
64
+ "fil",
65
+ "fr",
66
+ "fy",
67
+ "ga",
68
+ "gd",
69
+ "gl",
70
+ "gu",
71
+ "ha",
72
+ "haw",
73
+ "hi",
74
+ "hi-Latn",
75
+ "hmn",
76
+ "ht",
77
+ "hu",
78
+ "hy",
79
+ "id",
80
+ "ig",
81
+ "is",
82
+ "it",
83
+ "iw",
84
+ "ja",
85
+ "ja-Latn",
86
+ "jv",
87
+ "ka",
88
+ "kk",
89
+ "km",
90
+ "kn",
91
+ "ko",
92
+ "ku",
93
+ "ky",
94
+ "la",
95
+ "lb",
96
+ "lo",
97
+ "lt",
98
+ "lv",
99
+ "mg",
100
+ "mi",
101
+ "mk",
102
+ "ml",
103
+ "mn",
104
+ "mr",
105
+ "ms",
106
+ "mt",
107
+ "my",
108
+ "ne",
109
+ "nl",
110
+ "no",
111
+ "ny",
112
+ "pa",
113
+ "pl",
114
+ "ps",
115
+ "pt",
116
+ "ro",
117
+ "ru",
118
+ "ru-Latn",
119
+ "sd",
120
+ "si",
121
+ "sk",
122
+ "sl",
123
+ "sm",
124
+ "sn",
125
+ "so",
126
+ "sq",
127
+ "sr",
128
+ "st",
129
+ "su",
130
+ "sv",
131
+ "sw",
132
+ "ta",
133
+ "te",
134
+ "tg",
135
+ "th",
136
+ "tr",
137
+ "uk",
138
+ "und",
139
+ "ur",
140
+ "uz",
141
+ "vi",
142
+ "xh",
143
+ "yi",
144
+ "yo",
145
+ "zh",
146
+ "zh-Latn",
147
+ "zu",
148
+ ]
149
+
150
+ _N_SHARDS_PER_SPLIT = {
151
+ "af": {"train": 64, "validation": 1},
152
+ "am": {"train": 16, "validation": 1},
153
+ "ar": {"train": 1024, "validation": 4},
154
+ "az": {"train": 256, "validation": 1},
155
+ "be": {"train": 128, "validation": 1},
156
+ "bg": {"train": 1024, "validation": 1},
157
+ "bg-Latn": {"train": 4, "validation": 1},
158
+ "bn": {"train": 512, "validation": 1},
159
+ "ca": {"train": 512, "validation": 1},
160
+ "ceb": {"train": 8, "validation": 1},
161
+ "co": {"train": 8, "validation": 1},
162
+ "cs": {"train": 1024, "validation": 2},
163
+ "cy": {"train": 256, "validation": 1},
164
+ "da": {"train": 1024, "validation": 1},
165
+ "de": {"train": 2048, "validation": 16},
166
+ "el": {"train": 1024, "validation": 2},
167
+ "el-Latn": {"train": 16, "validation": 1},
168
+ "en": {"train": 11264, "validation": 128},
169
+ "eo": {"train": 32, "validation": 1},
170
+ "es": {"train": 2048, "validation": 16},
171
+ "et": {"train": 256, "validation": 1},
172
+ "eu": {"train": 64, "validation": 1},
173
+ "fa": {"train": 1024, "validation": 2},
174
+ "fi": {"train": 1024, "validation": 1},
175
+ "fil": {"train": 64, "validation": 1},
176
+ "fr": {"train": 2048, "validation": 16},
177
+ "fy": {"train": 16, "validation": 1},
178
+ "ga": {"train": 16, "validation": 1},
179
+ "gd": {"train": 16, "validation": 1},
180
+ "gl": {"train": 128, "validation": 1},
181
+ "gu": {"train": 64, "validation": 1},
182
+ "ha": {"train": 8, "validation": 1},
183
+ "haw": {"train": 2, "validation": 1},
184
+ "hi": {"train": 1024, "validation": 2},
185
+ "hi-Latn": {"train": 16, "validation": 1},
186
+ "hmn": {"train": 8, "validation": 1},
187
+ "ht": {"train": 8, "validation": 1},
188
+ "hu": {"train": 1024, "validation": 2},
189
+ "hy": {"train": 128, "validation": 1},
190
+ "id": {"train": 1024, "validation": 4},
191
+ "ig": {"train": 4, "validation": 1},
192
+ "is": {"train": 128, "validation": 1},
193
+ "it": {"train": 1024, "validation": 8},
194
+ "iw": {"train": 1024, "validation": 1},
195
+ "ja": {"train": 1024, "validation": 8},
196
+ "ja-Latn": {"train": 8, "validation": 1},
197
+ "jv": {"train": 8, "validation": 1},
198
+ "ka": {"train": 256, "validation": 1},
199
+ "kk": {"train": 256, "validation": 1},
200
+ "km": {"train": 64, "validation": 1},
201
+ "kn": {"train": 64, "validation": 1},
202
+ "ko": {"train": 1024, "validation": 1},
203
+ "ku": {"train": 16, "validation": 1},
204
+ "ky": {"train": 64, "validation": 1},
205
+ "la": {"train": 64, "validation": 1},
206
+ "lb": {"train": 32, "validation": 1},
207
+ "lo": {"train": 8, "validation": 1},
208
+ "lt": {"train": 512, "validation": 1},
209
+ "lv": {"train": 256, "validation": 1},
210
+ "mg": {"train": 8, "validation": 1},
211
+ "mi": {"train": 4, "validation": 1},
212
+ "mk": {"train": 128, "validation": 1},
213
+ "ml": {"train": 128, "validation": 1},
214
+ "mn": {"train": 128, "validation": 1},
215
+ "mr": {"train": 1024, "validation": 1},
216
+ "ms": {"train": 512, "validation": 1},
217
+ "mt": {"train": 128, "validation": 1},
218
+ "my": {"train": 64, "validation": 1},
219
+ "ne": {"train": 256, "validation": 1},
220
+ "nl": {"train": 1024, "validation": 4},
221
+ "no": {"train": 1024, "validation": 1},
222
+ "ny": {"train": 4, "validation": 1},
223
+ "pa": {"train": 32, "validation": 1},
224
+ "pl": {"train": 1024, "validation": 4},
225
+ "ps": {"train": 16, "validation": 1},
226
+ "pt": {"train": 1024, "validation": 4},
227
+ "ro": {"train": 1024, "validation": 2},
228
+ "ru": {"train": 4096, "validation": 32},
229
+ "ru-Latn": {"train": 32, "validation": 1},
230
+ "sd": {"train": 64, "validation": 1},
231
+ "si": {"train": 64, "validation": 1},
232
+ "sk": {"train": 512, "validation": 1},
233
+ "sl": {"train": 256, "validation": 1},
234
+ "sm": {"train": 4, "validation": 1},
235
+ "sn": {"train": 8, "validation": 1},
236
+ "so": {"train": 64, "validation": 1},
237
+ "sq": {"train": 128, "validation": 1},
238
+ "sr": {"train": 256, "validation": 1},
239
+ "st": {"train": 2, "validation": 1},
240
+ "su": {"train": 4, "validation": 1},
241
+ "sv": {"train": 1024, "validation": 2},
242
+ "sw": {"train": 32, "validation": 1},
243
+ "ta": {"train": 256, "validation": 1},
244
+ "te": {"train": 128, "validation": 1},
245
+ "tg": {"train": 64, "validation": 1},
246
+ "th": {"train": 1024, "validation": 1},
247
+ "tr": {"train": 1024, "validation": 4},
248
+ "uk": {"train": 1024, "validation": 2},
249
+ "und": {"train": 3072, "validation": 32},
250
+ "ur": {"train": 128, "validation": 1},
251
+ "uz": {"train": 32, "validation": 1},
252
+ "vi": {"train": 1024, "validation": 4},
253
+ "xh": {"train": 2, "validation": 1},
254
+ "yi": {"train": 16, "validation": 1},
255
+ "yo": {"train": 2, "validation": 1},
256
+ "zh": {"train": 1024, "validation": 2},
257
+ "zh-Latn": {"train": 8, "validation": 1},
258
+ "zu": {"train": 8, "validation": 1},
259
+ }
260
+
261
+
262
class Mc4Config(datasets.BuilderConfig):
    """Builder configuration describing which mC4 languages to load."""

    def __init__(self, *args, languages, **kwargs):
        """Create a configuration for the given languages.

        The configuration name is the language codes joined with ``+``.

        Args:
            languages (:obj:`List[str]`): list of languages to load
            **kwargs: keyword arguments forwarded to super.
        """
        config_name = "+".join(languages)
        super().__init__(*args, name=config_name, **kwargs)
        self.languages = languages
277
+
278
+
279
+ class Mc4(datasets.GeneratorBasedBuilder):
280
+ """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
281
+
282
+ BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
283
+ BUILDER_CONFIG_CLASS = Mc4Config
284
+
285
def __init__(self, *args, writer_batch_size=None, **kwargs):
    """Set up the builder and, optionally, a document sampling strategy.

    The sampling-related options are popped from ``kwargs`` before the rest
    is forwarded to the parent builder:

    - ``data_files`` (default ``{}``)
    - ``sampling_method``: falsy disables sampling; ``"random"`` and
      ``"gaussian"`` select those strategies; any other truthy value selects
      the step strategy.
    - ``perplexity_model``: path handed to ``kenlm.Model`` for the
      perplexity-based strategies.
    - ``sampling_factor``, ``boundaries``, ``seed``: stored on the instance.
    """
    option_defaults = (
        ("data_files", {}),
        ("sampling_method", None),
        ("perplexity_model", None),
        ("sampling_factor", None),
        ("boundaries", None),
        ("seed", None),
    )
    for option, default in option_defaults:
        setattr(self, option, kwargs.pop(option, default))

    method = self.sampling_method
    if method:
        # A seed of None makes default_rng draw fresh entropy, which is the
        # same as calling it without arguments.
        self.rng = default_rng(self.seed)
        if method == "random":
            self.should_keep_doc = self._should_keep_doc_random
        else:
            # Loading 5-gram model
            # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
            logger.info("loading model = %s", self.perplexity_model)
            self.pp_model = kenlm.Model(self.perplexity_model)
            self.should_keep_doc = (
                self._should_keep_doc_gaussian
                if method == "gaussian"
                else self._should_keep_doc_step
            )

    super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
309
+
310
+ def get_perplexity(self, doc):
311
+ doc_log_score, doc_length = 0, 0
312
+ for line in doc.split("\n"):
313
+ log_score = self.pp_model.score(line)
314
+ length = len(line.split()) + 1
315
+ doc_log_score += log_score
316
+ doc_length += length
317
+ return 10.0 ** (-doc_log_score / doc_length)
318
+
319
+ def _should_keep_doc_step(self, doc, factor=1.5e5, boundaries=None):
320
+ perplexity = self.get_perplexity(doc)
321
+ if boundaries is None:
322
+ boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
323
+ if perplexity <= boundaries[0]:
324
+ quartile_range = boundaries[0]
325
+ elif boundaries[0] < perplexity < boundaries[1]:
326
+ quartile_range = boundaries[1] - boundaries[0]
327
+ elif boundaries[1] < perplexity < boundaries[2]:
328
+ quartile_range = boundaries[2] - boundaries[1]
329
+ elif perplexity >= boundaries[2]:
330
+ quartile_range = 10 * boundaries[2]
331
+ probability = factor / quartile_range
332
+ return self.rng.uniform() < probability
333
+
334
+ def _should_keep_doc_gaussian(self, doc, factor=0.78, boundaries=None):
335
+ perplexity = self.get_perplexity(doc)
336
+ if boundaries is not None:
337
+ m = boundaries[1]
338
+ else:
339
+ m = 662247.50212365
340
+ exponential = np.exp(-9/2 * ((perplexity - m) / m) ** 2)
341
+ weighted_perplexity = factor * exponential
342
+ return self.rng.uniform() < weighted_perplexity
343
+
344
+ def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
345
+ if factor is None:
346
+ factor = 0.5
347
+ return self.rng.uniform() <= factor
348
+
349
+ def _info(self):
350
+ return datasets.DatasetInfo(
351
+ description=_DESCRIPTION,
352
+ features=datasets.Features(
353
+ {
354
+ "text": datasets.Value("string"),
355
+ "timestamp": datasets.Value("string"),
356
+ "url": datasets.Value("string"),
357
+ }
358
+ ),
359
+ supervised_keys=None,
360
+ homepage=_URL,
361
+ citation=_CITATION,
362
+ )
363
+
364
+ def _split_generators(self, dl_manager):
365
+ data_urls = {}
366
+ for split in ["train", "validation"]:
367
+ data_urls[split] = [
368
+ _DATA_URL.format(
369
+ language=self.config.name,
370
+ split_suffix="-validation" if split == "validation" else "",
371
+ index=index,
372
+ n_shards=_N_SHARDS_PER_SPLIT[lang][split],
373
+ )
374
+ for lang in self.config.languages
375
+ for index in range(_N_SHARDS_PER_SPLIT[lang][split])
376
+ ]
377
+ if "train" in self.data_files:
378
+ train_downloaded_files = self.data_files["train"]
379
+ if not isinstance(train_downloaded_files, (tuple, list)):
380
+ train_downloaded_files = [train_downloaded_files]
381
+ else:
382
+ train_downloaded_files = dl_manager.download(data_urls["train"])
383
+ if "validation" in self.data_files:
384
+ validation_downloaded_files = self.data_files["validation"]
385
+ if not isinstance(validation_downloaded_files, (tuple, list)):
386
+ validation_downloaded_files = [validation_downloaded_files]
387
+ else:
388
+ validation_downloaded_files = dl_manager.download(data_urls["validation"])
389
+ return [
390
+ datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
391
+ datasets.SplitGenerator(
392
+ name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
393
+ ),
394
+ ]
395
+
396
+ def _generate_examples(self, filepaths):
397
+ """This function returns the examples in the raw (text) form by iterating on all the files."""
398
+ id_ = 0
399
+ for filepath in filepaths:
400
+ logger.info("generating examples from = %s", filepath)
401
+ if filepath.endswith("jsonl"):
402
+ with open(filepath, "r", encoding="utf-8") as f:
403
+ for line in f:
404
+ if line:
405
+ example = json.loads(line)
406
+ yield id_, example
407
+ id_ += 1
408
+ else:
409
+ with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
410
+ if self.sampling_method:
411
+ logger.info("sampling method = %s", self.sampling_method)
412
+ for line in f:
413
+ if line:
414
+ example = json.loads(line)
415
+ if self.should_keep_doc(
416
+ example["text"],
417
+ factor=self.sampling_factor,
418
+ boundaries=self.boundaries):
419
+ yield id_, example
420
+ id_ += 1
421
+ else:
422
+ for line in f:
423
+ if line:
424
+ example = json.loads(line)
425
+ yield id_, example
426
+ id_ += 1
mc4/mc4.py.lock ADDED
File without changes
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
outputs/checkpoints/checkpoint-236000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-236000/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/checkpoints/checkpoint-236000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99845155f064d68ed7235bec28f18280d889893696b16d39d5060ccc1565fe7c
3
+ size 249750019
outputs/checkpoints/checkpoint-236000/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f1bb7accdbbac41048ab318cdbd0a0298793a5ca4a203aad1e0404dd23dd7bf
3
+ size 499500278
outputs/checkpoints/checkpoint-236000/training_args.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
3
+ size 1871
outputs/checkpoints/checkpoint-236000/training_state.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 236001}
outputs/checkpoints/checkpoint-237000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-237000/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/checkpoints/checkpoint-237000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:755f3c2edda86bdb78a5b091d753ed5c1a3eee3a4bc01fc92e97a4e00d44666b
3
+ size 249750019
outputs/checkpoints/checkpoint-237000/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b092b38f151ded8fc7c3904381f4043b42d21461bc277acfb423ff6c288c662
3
+ size 499500278
outputs/checkpoints/checkpoint-237000/training_args.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
3
+ size 1871
outputs/checkpoints/checkpoint-237000/training_state.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 237001}
outputs/checkpoints/checkpoint-238000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-238000/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/checkpoints/checkpoint-238000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2318ad2b972dbe362e6ed41efb0d5d26e3fa8b0d854d6f396f8133203977fc13
3
+ size 249750019
outputs/checkpoints/checkpoint-238000/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a6fc7fda05af98fa8d25993f5adf8e54f8ba210dc0f0c455e55706c5b9e932c
3
+ size 499500278
outputs/checkpoints/checkpoint-238000/training_args.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
3
+ size 1871
outputs/checkpoints/checkpoint-238000/training_state.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 238001}
outputs/checkpoints/checkpoint-239000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-239000/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/checkpoints/checkpoint-239000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a91241143d68cab5eaabcd673afdbeb921b9bd7421f64e916068a9bbbb9f3f61
3
+ size 249750019
outputs/checkpoints/checkpoint-239000/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da4afed84b087814ce34440c407482a6d72fd5b561c6f9abc97c6a9169702c6a
3
+ size 499500278
outputs/checkpoints/checkpoint-239000/training_args.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
3
+ size 1871
outputs/checkpoints/checkpoint-239000/training_state.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 239001}
outputs/checkpoints/checkpoint-240000/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-240000/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/checkpoints/checkpoint-240000/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ae5937c808e6e457c600f1da0eb5f8f38f6a3137b2d59828bc675b0103214ca
3
+ size 249750019
outputs/checkpoints/checkpoint-240000/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88c8e3403699be77d3def6ff91b6fa612b9df84c14ca7c447c1fefbc9c626b17
3
+ size 499500278
outputs/checkpoints/checkpoint-240000/training_args.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
3
+ size 1871
outputs/checkpoints/checkpoint-240000/training_state.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"step": 240001}
outputs/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
3
+ size 1471424
outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dcf653e7864d18167096d734dbd860d15fbba06384015f694de1099fc39f95de
3
+ size 40
outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb1b3e5c4ef3d4d9f4f8c03d897f931b27c715e76908f1738efc5b18d5684daf
3
+ size 34421733
outputs/events.out.tfevents.1626535665.tablespoon.2656403.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95635a084003eb01fefc99ecfe5a77a4c924fd2cee2f1a511c504dc1d63feaf5
3
+ size 40
outputs/events.out.tfevents.1626537915.tablespoon.2714825.3.v2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09c2bcb1c66711eed2ae348defb56f15368a5da3205c64a1b686d29c595ad408
3
+ size 40