versae committed on
Commit
d6c5011
0 Parent(s):

Model at 210k steps, mlm acc 0.6509

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +19 -0
  2. .gitignore +4 -0
  3. config.json +25 -0
  4. configs/base/config.json +25 -0
  5. configs/base/tokenizer.json +0 -0
  6. configs/large/config.json +25 -0
  7. configs/large/tokenizer.json +0 -0
  8. convert.py +29 -0
  9. flax_model.msgpack +3 -0
  10. mc4/README.md +525 -0
  11. mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
  12. mc4/mc4.py +426 -0
  13. mc4/mc4.py.lock +0 -0
  14. merges.txt +0 -0
  15. outputs/checkpoints/checkpoint-170001/config.json +25 -0
  16. outputs/checkpoints/checkpoint-170001/data_collator.joblib +3 -0
  17. outputs/checkpoints/checkpoint-170001/flax_model.msgpack +3 -0
  18. outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack +3 -0
  19. outputs/checkpoints/checkpoint-170001/training_args.joblib +3 -0
  20. outputs/checkpoints/checkpoint-170001/training_state.json +1 -0
  21. outputs/checkpoints/checkpoint-180001/config.json +25 -0
  22. outputs/checkpoints/checkpoint-180001/data_collator.joblib +3 -0
  23. outputs/checkpoints/checkpoint-180001/flax_model.msgpack +3 -0
  24. outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack +3 -0
  25. outputs/checkpoints/checkpoint-180001/training_args.joblib +3 -0
  26. outputs/checkpoints/checkpoint-180001/training_state.json +1 -0
  27. outputs/checkpoints/checkpoint-190001/config.json +25 -0
  28. outputs/checkpoints/checkpoint-190001/data_collator.joblib +3 -0
  29. outputs/checkpoints/checkpoint-190001/flax_model.msgpack +3 -0
  30. outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack +3 -0
  31. outputs/checkpoints/checkpoint-190001/training_args.joblib +3 -0
  32. outputs/checkpoints/checkpoint-190001/training_state.json +1 -0
  33. outputs/checkpoints/checkpoint-200001/config.json +25 -0
  34. outputs/checkpoints/checkpoint-200001/data_collator.joblib +3 -0
  35. outputs/checkpoints/checkpoint-200001/flax_model.msgpack +3 -0
  36. outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack +3 -0
  37. outputs/checkpoints/checkpoint-200001/training_args.joblib +3 -0
  38. outputs/checkpoints/checkpoint-200001/training_state.json +1 -0
  39. outputs/checkpoints/checkpoint-210001/config.json +25 -0
  40. outputs/checkpoints/checkpoint-210001/data_collator.joblib +3 -0
  41. outputs/checkpoints/checkpoint-210001/flax_model.msgpack +3 -0
  42. outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack +3 -0
  43. outputs/checkpoints/checkpoint-210001/training_args.joblib +3 -0
  44. outputs/checkpoints/checkpoint-210001/training_state.json +1 -0
  45. outputs/config.json +25 -0
  46. outputs/data_collator.joblib +3 -0
  47. outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 +3 -0
  48. outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 +3 -0
  49. outputs/flax_model.msgpack +3 -0
  50. outputs/optimizer_state.msgpack +3 -0
.gitattributes ADDED
@@ -0,0 +1,19 @@
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.log filter=lfs diff=lfs merge=lfs -text
+ *.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ run*.log
+ debug*.log
+ run*.wandb
+ wandb/
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
configs/base/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
configs/base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
configs/large/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
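The two configs differ only in model size: base is 12 layers, hidden size 768, 12 heads (roughly 125M parameters), while large is 24 layers, hidden size 1024, 16 heads. As a minimal sketch (not part of this commit), a freshly initialized Flax model can be built from either config:

```python
from transformers import RobertaConfig, FlaxRobertaForMaskedLM

# Load the RoBERTa-base sized config committed under configs/base/
config = RobertaConfig.from_pretrained("./configs/base")
model = FlaxRobertaForMaskedLM(config, seed=0)  # randomly initialized weights
```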
configs/large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
convert.py ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python
+ import tempfile
+ 
+ import jax
+ from jax import numpy as jnp
+ from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
+ 
+ 
+ def to_f32(t):
+     # Cast any bfloat16 leaves of the parameter tree to float32
+     return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
+ 
+ 
+ def main():
+     # Save the tokenizer files derived from the local config.json and tokenizer.json
+     tokenizer = AutoTokenizer.from_pretrained("./")
+     tokenizer.save_pretrained("./")
+ 
+     # Temporarily save the bfloat16 Flax model as float32
+     tmp = tempfile.mkdtemp()
+     flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
+     flax_model.params = to_f32(flax_model.params)
+     flax_model.save_pretrained(tmp)
+     # Convert the float32 Flax model to PyTorch
+     model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)
+     model.save_pretrained("./", save_config=False)
+ 
+ 
+ if __name__ == "__main__":
+     main()
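After running convert.py, the repo root should hold both the Flax checkpoint and a float32 PyTorch checkpoint. A quick sanity check (not part of this commit; assumes the script was run from the repo root):

```python
import torch
from transformers import RobertaForMaskedLM

# All exported PyTorch weights should be float32 after conversion
model = RobertaForMaskedLM.from_pretrained("./")
assert all(p.dtype == torch.float32 for p in model.parameters())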
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
mc4/README.md ADDED
@@ -0,0 +1,525 @@
+ ---
+ pretty_name: mC4
+ annotations_creators:
+ - no-annotation
+ language_creators:
+ - found
+ languages:
+ - af
+ - am
+ - ar
+ - az
+ - be
+ - bg
+ - bg-Latn
+ - bn
+ - ca
+ - ceb
+ - co
+ - cs
+ - cy
+ - da
+ - de
+ - el
+ - el-Latn
+ - en
+ - eo
+ - es
+ - et
+ - eu
+ - fa
+ - fi
+ - fil
+ - fr
+ - fy
+ - ga
+ - gd
+ - gl
+ - gu
+ - ha
+ - haw
+ - hi
+ - hi-Latn
+ - hmn
+ - ht
+ - hu
+ - hy
+ - id
+ - ig
+ - is
+ - it
+ - iw
+ - ja
+ - ja-Latn
+ - jv
+ - ka
+ - kk
+ - km
+ - kn
+ - ko
+ - ku
+ - ky
+ - la
+ - lb
+ - lo
+ - lt
+ - lv
+ - mg
+ - mi
+ - mk
+ - ml
+ - mn
+ - mr
+ - ms
+ - mt
+ - my
+ - ne
+ - nl
+ - "no"
+ - ny
+ - pa
+ - pl
+ - ps
+ - pt
+ - ro
+ - ru
+ - ru-Latn
+ - sd
+ - si
+ - sk
+ - sl
+ - sm
+ - sn
+ - so
+ - sq
+ - sr
+ - st
+ - su
+ - sv
+ - sw
+ - ta
+ - te
+ - tg
+ - th
+ - tr
+ - uk
+ - und
+ - ur
+ - uz
+ - vi
+ - xh
+ - yi
+ - yo
+ - zh
+ - zh-Latn
+ - zu
+ licenses:
+ - odc-by-1.0
+ multilinguality:
+ - multilingual
+ size_categories:
+ - n<1K
+ - 1K<n<10K
+ - 10K<n<100K
+ - 100K<n<1M
+ - 1M<n<10M
+ - 10M<n<100M
+ - 100M<n<1B
+ - 1B<n<10B
+ source_datasets:
+ - original
+ task_categories:
+ - sequence-modeling
+ task_ids:
+ - language-modeling
+ paperswithcode_id: mc4
+ ---
+ 
+ # Dataset Card for mC4
+ 
+ ## Table of Contents
+ 
+ - [Dataset Card for mC4](#dataset-card-for-mc4)
+   - [Table of Contents](#table-of-contents)
+   - [Dataset Description](#dataset-description)
+     - [Dataset Summary](#dataset-summary)
+     - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+     - [Languages](#languages)
+   - [Dataset Structure](#dataset-structure)
+     - [Data Instances](#data-instances)
+     - [Data Fields](#data-fields)
+     - [Data Splits](#data-splits)
+   - [Dataset Creation](#dataset-creation)
+     - [Curation Rationale](#curation-rationale)
+     - [Source Data](#source-data)
+       - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
+       - [Who are the source language producers?](#who-are-the-source-language-producers)
+     - [Annotations](#annotations)
+       - [Annotation process](#annotation-process)
+       - [Who are the annotators?](#who-are-the-annotators)
+     - [Personal and Sensitive Information](#personal-and-sensitive-information)
+   - [Considerations for Using the Data](#considerations-for-using-the-data)
+     - [Social Impact of Dataset](#social-impact-of-dataset)
+     - [Discussion of Biases](#discussion-of-biases)
+     - [Other Known Limitations](#other-known-limitations)
+   - [Additional Information](#additional-information)
+     - [Dataset Curators](#dataset-curators)
+     - [Licensing Information](#licensing-information)
+     - [Citation Information](#citation-information)
+     - [Contributions](#contributions)
+ 
+ ## Dataset Description
+ 
+ - **Homepage:** https://huggingface.co/datasets/allenai/c4
+ - **Paper:** https://arxiv.org/abs/1910.10683
+ 
+ ### Dataset Summary
+ 
+ A colossal, cleaned, multilingual version of Common Crawl's web crawl corpus, based on the Common Crawl dataset: https://commoncrawl.org.
+ 
+ This is the version prepared by AllenAI, hosted at https://huggingface.co/datasets/allenai/c4.
+ 
+ 108 languages are available and are reported in the table below.
+ 
+ Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
+ 
+ | language code | language name |
+ |:--------------|:--------------|
+ | af | Afrikaans |
+ | am | Amharic |
+ | ar | Arabic |
+ | az | Azerbaijani |
+ | be | Belarusian |
+ | bg | Bulgarian |
+ | bg-Latn | Bulgarian (Latin) |
+ | bn | Bangla |
+ | ca | Catalan |
+ | ceb | Cebuano |
+ | co | Corsican |
+ | cs | Czech |
+ | cy | Welsh |
+ | da | Danish |
+ | de | German |
+ | el | Greek |
+ | el-Latn | Greek (Latin) |
+ | en | English |
+ | eo | Esperanto |
+ | es | Spanish |
+ | et | Estonian |
+ | eu | Basque |
+ | fa | Persian |
+ | fi | Finnish |
+ | fil | Filipino |
+ | fr | French |
+ | fy | Western Frisian |
+ | ga | Irish |
+ | gd | Scottish Gaelic |
+ | gl | Galician |
+ | gu | Gujarati |
+ | ha | Hausa |
+ | haw | Hawaiian |
+ | hi | Hindi |
+ | hi-Latn | Hindi (Latin) |
+ | hmn | Hmong, Mong |
+ | ht | Haitian |
+ | hu | Hungarian |
+ | hy | Armenian |
+ | id | Indonesian |
+ | ig | Igbo |
+ | is | Icelandic |
+ | it | Italian |
+ | iw | Hebrew (former code) |
+ | ja | Japanese |
+ | ja-Latn | Japanese (Latin) |
+ | jv | Javanese |
+ | ka | Georgian |
+ | kk | Kazakh |
+ | km | Khmer |
+ | kn | Kannada |
+ | ko | Korean |
+ | ku | Kurdish |
+ | ky | Kyrgyz |
+ | la | Latin |
+ | lb | Luxembourgish |
+ | lo | Lao |
+ | lt | Lithuanian |
+ | lv | Latvian |
+ | mg | Malagasy |
+ | mi | Maori |
+ | mk | Macedonian |
+ | ml | Malayalam |
+ | mn | Mongolian |
+ | mr | Marathi |
+ | ms | Malay |
+ | mt | Maltese |
+ | my | Burmese |
+ | ne | Nepali |
+ | nl | Dutch |
+ | no | Norwegian |
+ | ny | Nyanja |
+ | pa | Punjabi |
+ | pl | Polish |
+ | ps | Pashto |
+ | pt | Portuguese |
+ | ro | Romanian |
+ | ru | Russian |
+ | ru-Latn | Russian (Latin) |
+ | sd | Sindhi |
+ | si | Sinhala |
+ | sk | Slovak |
+ | sl | Slovenian |
+ | sm | Samoan |
+ | sn | Shona |
+ | so | Somali |
+ | sq | Albanian |
+ | sr | Serbian |
+ | st | Southern Sotho |
+ | su | Sundanese |
+ | sv | Swedish |
+ | sw | Swahili |
+ | ta | Tamil |
+ | te | Telugu |
+ | tg | Tajik |
+ | th | Thai |
+ | tr | Turkish |
+ | uk | Ukrainian |
+ | und | Unknown language |
+ | ur | Urdu |
+ | uz | Uzbek |
+ | vi | Vietnamese |
+ | xh | Xhosa |
+ | yi | Yiddish |
+ | yo | Yoruba |
+ | zh | Chinese |
+ | zh-Latn | Chinese (Latin) |
+ | zu | Zulu |
+ 
+ You can load the mC4 subset of any language like this:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ en_mc4 = load_dataset("mc4", "en")
+ ```
+ 
+ You can even specify a list of languages:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
+ ```
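+ 
+ The copy of the loader shipped in this repository (`mc4/mc4.py`) additionally accepts sampling keyword arguments (`sampling_method`, `sampling_factor`, `boundaries`, `perplexity_model`, `seed`, `data_files`), which are consumed in `Mc4.__init__`. A minimal sketch, assuming the script is loaded from its local path and a KenLM model file is available locally (the `es.arpa.bin` path below is illustrative):
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ # Perplexity-based Gaussian sampling of the Spanish subset
+ mc4_es_sampled = load_dataset(
+     "./mc4",
+     "es",
+     split="train",
+     sampling_method="gaussian",
+     sampling_factor=0.78,
+     perplexity_model="./es.arpa.bin",
+ )
+ ```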
+ 
+ ### Supported Tasks and Leaderboards
+ 
+ mC4 is mainly intended to pretrain language models and word representations.
+ 
+ ### Languages
+ 
+ The dataset supports 108 languages.
+ 
+ ## Dataset Structure
+ 
+ ### Data Instances
+ 
+ An example from the `en` config is:
+ 
+ ```
+ {'timestamp': '2018-06-24T01:32:39Z',
+ 'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
+ 'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
+ ```
+ 
+ ### Data Fields
+ 
+ The data have several fields:
+ 
+ - `url`: url of the source as a string
+ - `text`: text content as a string
+ - `timestamp`: timestamp as a string
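+ 
+ For illustration only (not part of the original card), the fields can be inspected by streaming a few documents, assuming a `datasets` version with streaming support:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ af_mc4 = load_dataset("mc4", "af", split="validation", streaming=True)
+ for i, example in enumerate(af_mc4):
+     # Each example is a dict with exactly these three string fields
+     print(example["url"], example["timestamp"], example["text"][:80])
+     if i == 2:
+         break
+ ```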
+ 
+ ### Data Splits
+ 
+ To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
+ 
+ | config | train | validation |
+ |:-------|:------|:-----------|
+ | af | ? | ? |
+ | am | ? | ? |
+ | ar | ? | ? |
+ | az | ? | ? |
+ | be | ? | ? |
+ | bg | ? | ? |
+ | bg-Latn | ? | ? |
+ | bn | ? | ? |
+ | ca | ? | ? |
+ | ceb | ? | ? |
+ | co | ? | ? |
+ | cs | ? | ? |
+ | cy | ? | ? |
+ | da | ? | ? |
+ | de | ? | ? |
+ | el | ? | ? |
+ | el-Latn | ? | ? |
+ | en | ? | ? |
+ | eo | ? | ? |
+ | es | ? | ? |
+ | et | ? | ? |
+ | eu | ? | ? |
+ | fa | ? | ? |
+ | fi | ? | ? |
+ | fil | ? | ? |
+ | fr | ? | ? |
+ | fy | ? | ? |
+ | ga | ? | ? |
+ | gd | ? | ? |
+ | gl | ? | ? |
+ | gu | ? | ? |
+ | ha | ? | ? |
+ | haw | ? | ? |
+ | hi | ? | ? |
+ | hi-Latn | ? | ? |
+ | hmn | ? | ? |
+ | ht | ? | ? |
+ | hu | ? | ? |
+ | hy | ? | ? |
+ | id | ? | ? |
+ | ig | ? | ? |
+ | is | ? | ? |
+ | it | ? | ? |
+ | iw | ? | ? |
+ | ja | ? | ? |
+ | ja-Latn | ? | ? |
+ | jv | ? | ? |
+ | ka | ? | ? |
+ | kk | ? | ? |
+ | km | ? | ? |
+ | kn | ? | ? |
+ | ko | ? | ? |
+ | ku | ? | ? |
+ | ky | ? | ? |
+ | la | ? | ? |
+ | lb | ? | ? |
+ | lo | ? | ? |
+ | lt | ? | ? |
+ | lv | ? | ? |
+ | mg | ? | ? |
+ | mi | ? | ? |
+ | mk | ? | ? |
+ | ml | ? | ? |
+ | mn | ? | ? |
+ | mr | ? | ? |
+ | ms | ? | ? |
+ | mt | ? | ? |
+ | my | ? | ? |
+ | ne | ? | ? |
+ | nl | ? | ? |
+ | no | ? | ? |
+ | ny | ? | ? |
+ | pa | ? | ? |
+ | pl | ? | ? |
+ | ps | ? | ? |
+ | pt | ? | ? |
+ | ro | ? | ? |
+ | ru | ? | ? |
+ | ru-Latn | ? | ? |
+ | sd | ? | ? |
+ | si | ? | ? |
+ | sk | ? | ? |
+ | sl | ? | ? |
+ | sm | ? | ? |
+ | sn | ? | ? |
+ | so | ? | ? |
+ | sq | ? | ? |
+ | sr | ? | ? |
+ | st | ? | ? |
+ | su | ? | ? |
+ | sv | ? | ? |
+ | sw | ? | ? |
+ | ta | ? | ? |
+ | te | ? | ? |
+ | tg | ? | ? |
+ | th | ? | ? |
+ | tr | ? | ? |
+ | uk | ? | ? |
+ | und | ? | ? |
+ | ur | ? | ? |
+ | uz | ? | ? |
+ | vi | ? | ? |
+ | xh | ? | ? |
+ | yi | ? | ? |
+ | yo | ? | ? |
+ | zh | ? | ? |
+ | zh-Latn | ? | ? |
+ | zu | ? | ? |
+ 
+ ## Dataset Creation
+ 
+ ### Curation Rationale
+ 
+ [More Information Needed]
+ 
+ ### Source Data
+ 
+ #### Initial Data Collection and Normalization
+ 
+ [More Information Needed]
+ 
+ #### Who are the source language producers?
+ 
+ [More Information Needed]
+ 
+ ### Annotations
+ 
+ #### Annotation process
+ 
+ [More Information Needed]
+ 
+ #### Who are the annotators?
+ 
+ [More Information Needed]
+ 
+ ### Personal and Sensitive Information
+ 
+ [More Information Needed]
+ 
+ ## Considerations for Using the Data
+ 
+ ### Social Impact of Dataset
+ 
+ [More Information Needed]
+ 
+ ### Discussion of Biases
+ 
+ [More Information Needed]
+ 
+ ### Other Known Limitations
+ 
+ [More Information Needed]
+ 
+ ## Additional Information
+ 
+ ### Dataset Curators
+ 
+ [More Information Needed]
+ 
+ ### Licensing Information
+ 
+ AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
+ 
+ ### Citation Information
+ 
+ ```
+ @article{2019t5,
+     author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+     title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+     journal = {arXiv e-prints},
+     year = {2019},
+     archivePrefix = {arXiv},
+     eprint = {1910.10683},
+ }
+ ```
+ 
+ ### Contributions
+ 
+ Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
mc4/dummy/af/0.0.0/dummy_data.zip ADDED
Binary file (8.54 kB).
 
mc4/mc4.py ADDED
@@ -0,0 +1,426 @@
+ """mC4 dataset based on Common Crawl."""
+ 
+ 
+ import gzip
+ import json
+ 
+ import datasets
+ import kenlm
+ import numpy as np
+ from numpy.random import default_rng
+ 
+ 
+ logger = datasets.logging.get_logger(__name__)
+ 
+ 
+ _DESCRIPTION = """\
+ A colossal, cleaned version of Common Crawl's web crawl corpus.
+ 
+ Based on Common Crawl dataset: "https://commoncrawl.org".
+ 
+ This is the processed version of Google's mC4 dataset by AllenAI.
+ """
+ 
+ _CITATION = """
+ @article{2019t5,
+     author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+     title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+     journal = {arXiv e-prints},
+     year = {2019},
+     archivePrefix = {arXiv},
+     eprint = {1910.10683},
+ }
+ """
+ 
+ _URL = "https://github.com/allenai/allennlp/discussions/5056"
+ 
+ _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
+ 
+ _LANGUAGES = [
+     "af",
+     "am",
+     "ar",
+     "az",
+     "be",
+     "bg",
+     "bg-Latn",
+     "bn",
+     "ca",
+     "ceb",
+     "co",
+     "cs",
+     "cy",
+     "da",
+     "de",
+     "el",
+     "el-Latn",
+     "en",
+     "eo",
+     "es",
+     "et",
+     "eu",
+     "fa",
+     "fi",
+     "fil",
+     "fr",
+     "fy",
+     "ga",
+     "gd",
+     "gl",
+     "gu",
+     "ha",
+     "haw",
+     "hi",
+     "hi-Latn",
+     "hmn",
+     "ht",
+     "hu",
+     "hy",
+     "id",
+     "ig",
+     "is",
+     "it",
+     "iw",
+     "ja",
+     "ja-Latn",
+     "jv",
+     "ka",
+     "kk",
+     "km",
+     "kn",
+     "ko",
+     "ku",
+     "ky",
+     "la",
+     "lb",
+     "lo",
+     "lt",
+     "lv",
+     "mg",
+     "mi",
+     "mk",
+     "ml",
+     "mn",
+     "mr",
+     "ms",
+     "mt",
+     "my",
+     "ne",
+     "nl",
+     "no",
+     "ny",
+     "pa",
+     "pl",
+     "ps",
+     "pt",
+     "ro",
+     "ru",
+     "ru-Latn",
+     "sd",
+     "si",
+     "sk",
+     "sl",
+     "sm",
+     "sn",
+     "so",
+     "sq",
+     "sr",
+     "st",
+     "su",
+     "sv",
+     "sw",
+     "ta",
+     "te",
+     "tg",
+     "th",
+     "tr",
+     "uk",
+     "und",
+     "ur",
+     "uz",
+     "vi",
+     "xh",
+     "yi",
+     "yo",
+     "zh",
+     "zh-Latn",
+     "zu",
+ ]
+ 
+ _N_SHARDS_PER_SPLIT = {
+     "af": {"train": 64, "validation": 1},
+     "am": {"train": 16, "validation": 1},
+     "ar": {"train": 1024, "validation": 4},
+     "az": {"train": 256, "validation": 1},
+     "be": {"train": 128, "validation": 1},
+     "bg": {"train": 1024, "validation": 1},
+     "bg-Latn": {"train": 4, "validation": 1},
+     "bn": {"train": 512, "validation": 1},
+     "ca": {"train": 512, "validation": 1},
+     "ceb": {"train": 8, "validation": 1},
+     "co": {"train": 8, "validation": 1},
+     "cs": {"train": 1024, "validation": 2},
+     "cy": {"train": 256, "validation": 1},
+     "da": {"train": 1024, "validation": 1},
+     "de": {"train": 2048, "validation": 16},
+     "el": {"train": 1024, "validation": 2},
+     "el-Latn": {"train": 16, "validation": 1},
+     "en": {"train": 11264, "validation": 128},
+     "eo": {"train": 32, "validation": 1},
+     "es": {"train": 2048, "validation": 16},
+     "et": {"train": 256, "validation": 1},
+     "eu": {"train": 64, "validation": 1},
+     "fa": {"train": 1024, "validation": 2},
+     "fi": {"train": 1024, "validation": 1},
+     "fil": {"train": 64, "validation": 1},
+     "fr": {"train": 2048, "validation": 16},
+     "fy": {"train": 16, "validation": 1},
+     "ga": {"train": 16, "validation": 1},
+     "gd": {"train": 16, "validation": 1},
+     "gl": {"train": 128, "validation": 1},
+     "gu": {"train": 64, "validation": 1},
+     "ha": {"train": 8, "validation": 1},
+     "haw": {"train": 2, "validation": 1},
+     "hi": {"train": 1024, "validation": 2},
+     "hi-Latn": {"train": 16, "validation": 1},
+     "hmn": {"train": 8, "validation": 1},
+     "ht": {"train": 8, "validation": 1},
+     "hu": {"train": 1024, "validation": 2},
+     "hy": {"train": 128, "validation": 1},
+     "id": {"train": 1024, "validation": 4},
+     "ig": {"train": 4, "validation": 1},
+     "is": {"train": 128, "validation": 1},
+     "it": {"train": 1024, "validation": 8},
+     "iw": {"train": 1024, "validation": 1},
+     "ja": {"train": 1024, "validation": 8},
+     "ja-Latn": {"train": 8, "validation": 1},
+     "jv": {"train": 8, "validation": 1},
+     "ka": {"train": 256, "validation": 1},
+     "kk": {"train": 256, "validation": 1},
+     "km": {"train": 64, "validation": 1},
+     "kn": {"train": 64, "validation": 1},
+     "ko": {"train": 1024, "validation": 1},
+     "ku": {"train": 16, "validation": 1},
+     "ky": {"train": 64, "validation": 1},
+     "la": {"train": 64, "validation": 1},
+     "lb": {"train": 32, "validation": 1},
+     "lo": {"train": 8, "validation": 1},
+     "lt": {"train": 512, "validation": 1},
+     "lv": {"train": 256, "validation": 1},
+     "mg": {"train": 8, "validation": 1},
+     "mi": {"train": 4, "validation": 1},
+     "mk": {"train": 128, "validation": 1},
+     "ml": {"train": 128, "validation": 1},
+     "mn": {"train": 128, "validation": 1},
+     "mr": {"train": 1024, "validation": 1},
+     "ms": {"train": 512, "validation": 1},
+     "mt": {"train": 128, "validation": 1},
+     "my": {"train": 64, "validation": 1},
+     "ne": {"train": 256, "validation": 1},
+     "nl": {"train": 1024, "validation": 4},
+     "no": {"train": 1024, "validation": 1},
+     "ny": {"train": 4, "validation": 1},
+     "pa": {"train": 32, "validation": 1},
+     "pl": {"train": 1024, "validation": 4},
+     "ps": {"train": 16, "validation": 1},
+     "pt": {"train": 1024, "validation": 4},
+     "ro": {"train": 1024, "validation": 2},
+     "ru": {"train": 4096, "validation": 32},
+     "ru-Latn": {"train": 32, "validation": 1},
+     "sd": {"train": 64, "validation": 1},
+     "si": {"train": 64, "validation": 1},
+     "sk": {"train": 512, "validation": 1},
+     "sl": {"train": 256, "validation": 1},
+     "sm": {"train": 4, "validation": 1},
+     "sn": {"train": 8, "validation": 1},
+     "so": {"train": 64, "validation": 1},
+     "sq": {"train": 128, "validation": 1},
+     "sr": {"train": 256, "validation": 1},
+     "st": {"train": 2, "validation": 1},
+     "su": {"train": 4, "validation": 1},
+     "sv": {"train": 1024, "validation": 2},
+     "sw": {"train": 32, "validation": 1},
+     "ta": {"train": 256, "validation": 1},
+     "te": {"train": 128, "validation": 1},
+     "tg": {"train": 64, "validation": 1},
+     "th": {"train": 1024, "validation": 1},
+     "tr": {"train": 1024, "validation": 4},
+     "uk": {"train": 1024, "validation": 2},
+     "und": {"train": 3072, "validation": 32},
+     "ur": {"train": 128, "validation": 1},
+     "uz": {"train": 32, "validation": 1},
+     "vi": {"train": 1024, "validation": 4},
+     "xh": {"train": 2, "validation": 1},
+     "yi": {"train": 16, "validation": 1},
+     "yo": {"train": 2, "validation": 1},
+     "zh": {"train": 1024, "validation": 2},
+     "zh-Latn": {"train": 8, "validation": 1},
+     "zu": {"train": 8, "validation": 1},
+ }
+ 
+ 
+ class Mc4Config(datasets.BuilderConfig):
+     """BuilderConfig for mC4."""
+ 
+     def __init__(self, *args, languages, **kwargs):
+         """BuilderConfig for mC4.
+         Args:
+             languages (:obj:`List[str]`): list of languages to load
+             **kwargs: keyword arguments forwarded to super.
+         """
+         super().__init__(
+             *args,
+             name="+".join(languages),
+             **kwargs,
+         )
+         self.languages = languages
+ 
+ 
+ class Mc4(datasets.GeneratorBasedBuilder):
+     """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
+ 
+     BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
+     BUILDER_CONFIG_CLASS = Mc4Config
+ 
+     def __init__(self, *args, writer_batch_size=None, **kwargs):
+         self.data_files = kwargs.pop("data_files", {})
+         self.sampling_method = kwargs.pop("sampling_method", None)
+         self.perplexity_model = kwargs.pop("perplexity_model", None)
+         self.sampling_factor = kwargs.pop("sampling_factor", None)
+         self.boundaries = kwargs.pop("boundaries", None)
+         self.seed = kwargs.pop("seed", None)
+         if self.sampling_method:
+             if self.seed is not None:
+                 self.rng = default_rng(self.seed)
+             else:
+                 self.rng = default_rng()
+             if self.sampling_method == "random":
+                 self.should_keep_doc = self._should_keep_doc_random
+             else:
+                 # Load the 5-gram KenLM model used for perplexity scoring,
+                 # e.g. http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
+                 logger.info("loading model = %s", self.perplexity_model)
+                 self.pp_model = kenlm.Model(self.perplexity_model)
+                 if self.sampling_method == "gaussian":
+                     self.should_keep_doc = self._should_keep_doc_gaussian
+                 else:
+                     self.should_keep_doc = self._should_keep_doc_step
+         super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
+ 
+     def get_perplexity(self, doc):
+         # Document perplexity: 10 ** (-sum of log10 scores / token count)
+         doc_log_score, doc_length = 0, 0
+         for line in doc.split("\n"):
+             log_score = self.pp_model.score(line)
+             length = len(line.split()) + 1
+             doc_log_score += log_score
+             doc_length += length
+         return 10.0 ** (-doc_log_score / doc_length)
+ 
+     def _should_keep_doc_step(self, doc, factor=None, boundaries=None):
+         perplexity = self.get_perplexity(doc)
+         if factor is None:
+             factor = 1.5e5
+         if boundaries is None:
+             boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
+         # Piecewise-constant keep probability per perplexity quartile
+         if perplexity <= boundaries[0]:
+             quartile_range = boundaries[0]
+         elif perplexity < boundaries[1]:
+             quartile_range = boundaries[1] - boundaries[0]
+         elif perplexity < boundaries[2]:
+             quartile_range = boundaries[2] - boundaries[1]
+         else:
+             quartile_range = 10 * boundaries[2]
+         probability = factor / quartile_range
+         return self.rng.uniform() < probability
+ 
+     def _should_keep_doc_gaussian(self, doc, factor=None, boundaries=None):
+         perplexity = self.get_perplexity(doc)
+         if factor is None:
+             factor = 0.78
+         if boundaries is not None:
+             m = boundaries[1]
+         else:
+             m = 662247.50212365
+         # Keep probability peaks at `factor` when perplexity equals the midpoint m
+         exponential = np.exp(-9 / 2 * ((perplexity - m) / m) ** 2)
+         weighted_perplexity = factor * exponential
+         return self.rng.uniform() < weighted_perplexity
+ 
+     def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
+         if factor is None:
+             factor = 0.5
+         return self.rng.uniform() <= factor
+ 
+     def _info(self):
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=datasets.Features(
+                 {
+                     "text": datasets.Value("string"),
+                     "timestamp": datasets.Value("string"),
+                     "url": datasets.Value("string"),
+                 }
+             ),
+             supervised_keys=None,
+             homepage=_URL,
+             citation=_CITATION,
+         )
+ 
+     def _split_generators(self, dl_manager):
+         data_urls = {}
+         for split in ["train", "validation"]:
+             data_urls[split] = [
+                 _DATA_URL.format(
+                     language=lang,  # one URL per language and shard
+                     split_suffix="-validation" if split == "validation" else "",
+                     index=index,
+                     n_shards=_N_SHARDS_PER_SPLIT[lang][split],
+                 )
+                 for lang in self.config.languages
+                 for index in range(_N_SHARDS_PER_SPLIT[lang][split])
+             ]
+         if "train" in self.data_files:
+             train_downloaded_files = self.data_files["train"]
+             if not isinstance(train_downloaded_files, (tuple, list)):
+                 train_downloaded_files = [train_downloaded_files]
+         else:
+             train_downloaded_files = dl_manager.download(data_urls["train"])
+         if "validation" in self.data_files:
+             validation_downloaded_files = self.data_files["validation"]
+             if not isinstance(validation_downloaded_files, (tuple, list)):
+                 validation_downloaded_files = [validation_downloaded_files]
+         else:
+             validation_downloaded_files = dl_manager.download(data_urls["validation"])
+         return [
+             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
+             datasets.SplitGenerator(
+                 name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
+             ),
+         ]
+ 
+     def _generate_examples(self, filepaths):
+         """This function returns the examples in the raw (text) form by iterating on all the files."""
+         id_ = 0
+         for filepath in filepaths:
+             logger.info("generating examples from = %s", filepath)
+             if filepath.endswith("jsonl"):
+                 # Plain JSONL files (e.g. locally preprocessed data) are read as-is
+                 with open(filepath, "r", encoding="utf-8") as f:
+                     for line in f:
+                         if line:
+                             example = json.loads(line)
+                             yield id_, example
+                             id_ += 1
+             else:
+                 with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+                     if self.sampling_method:
+                         logger.info("sampling method = %s", self.sampling_method)
+                         for line in f:
+                             if line:
+                                 example = json.loads(line)
+                                 if self.should_keep_doc(
+                                     example["text"],
+                                     factor=self.sampling_factor,
+                                     boundaries=self.boundaries,
+                                 ):
+                                     yield id_, example
+                                     id_ += 1
+                     else:
+                         for line in f:
+                             if line:
+                                 example = json.loads(line)
+                                 yield id_, example
+                                 id_ += 1
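For reference, `get_perplexity` computes a document-level perplexity `PP(d) = 10 ** (-sum(log10_scores) / length)`, and `_should_keep_doc_gaussian` keeps a document with probability `factor * exp(-9/2 * ((PP - m) / m) ** 2)`, which peaks at `factor` when PP equals the midpoint boundary `m`. A self-contained sketch of that weighting with made-up perplexity values (no KenLM required; the real loader scores documents with a 5-gram model):

```python
import numpy as np

def gaussian_keep_probability(perplexity, factor=0.78, midpoint=662247.50212365):
    # Same weighting as Mc4._should_keep_doc_gaussian: documents whose
    # perplexity is near the midpoint are kept with probability ~factor,
    # and the keep probability decays smoothly for outliers.
    return factor * np.exp(-9 / 2 * ((perplexity - midpoint) / midpoint) ** 2)

for pp in [300_000.0, 662_247.5, 1_200_000.0]:
    print(f"perplexity={pp:>11,.1f} -> keep probability {gaussian_keep_probability(pp):.3f}")
```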
mc4/mc4.py.lock ADDED
File without changes
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
outputs/checkpoints/checkpoint-170001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-170001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-170001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82acddc9d2695672d6083ebf0bcde93b488f7aba953505236ac3f977e6f70284
+ size 249750019
outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f44ea5ea6a253566d5b813a6ddda12274ce966aa2da9b84c2886126f906f50ac
+ size 499500278
outputs/checkpoints/checkpoint-170001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-170001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 170001}
outputs/checkpoints/checkpoint-180001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-180001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-180001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4cdfd93674fdb84b689fe6ce896b29a0afecb90e270f95cc718df2fca4b59b8
+ size 249750019
outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2576e99225ff5d89b059b8cb255049597f29bfd7c29a8a19ce83be9439aad468
+ size 499500278
outputs/checkpoints/checkpoint-180001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-180001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 180001}
outputs/checkpoints/checkpoint-190001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-190001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-190001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:101266d4390596691c16eca919169708aee4fa432bd7973bf51885daf6ea5b75
+ size 249750019
outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe2b0c85e7f2988ae9d5ea7856586fe2486a5297592019dd384b01e2ba59e95b
+ size 499500278
outputs/checkpoints/checkpoint-190001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-190001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 190001}
outputs/checkpoints/checkpoint-200001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-200001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-200001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e78ae26ac4d3f4891d12ce9c6856b25907020e2aa4a3a833b95a37746d25c6
+ size 249750019
outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f5c4c1ff0bae24abf15f6eeb6bedadc4cad96c37845d7f1d92f56959c671002
+ size 499500278
outputs/checkpoints/checkpoint-200001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-200001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 200001}
outputs/checkpoints/checkpoint-210001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-210001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-210001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
+ size 499500278
outputs/checkpoints/checkpoint-210001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-210001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 210001}
outputs/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcf653e7864d18167096d734dbd860d15fbba06384015f694de1099fc39f95de
+ size 40
outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eede710c8b36371055e5299d3f6797af265dfcbd6b5034c4aff6d2ee6402d900
+ size 32408059
outputs/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
outputs/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
+ size 499500278