versae
committed on
Commit
•
d6c5011
0
Parent(s):
Model at 210k steps, mlm acc 0.6509
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +19 -0
- .gitignore +4 -0
- config.json +25 -0
- configs/base/config.json +25 -0
- configs/base/tokenizer.json +0 -0
- configs/large/config.json +25 -0
- configs/large/tokenizer.json +0 -0
- convert.py +29 -0
- flax_model.msgpack +3 -0
- mc4/README.md +525 -0
- mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
- mc4/mc4.py +426 -0
- mc4/mc4.py.lock +0 -0
- merges.txt +0 -0
- outputs/checkpoints/checkpoint-170001/config.json +25 -0
- outputs/checkpoints/checkpoint-170001/data_collator.joblib +3 -0
- outputs/checkpoints/checkpoint-170001/flax_model.msgpack +3 -0
- outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack +3 -0
- outputs/checkpoints/checkpoint-170001/training_args.joblib +3 -0
- outputs/checkpoints/checkpoint-170001/training_state.json +1 -0
- outputs/checkpoints/checkpoint-180001/config.json +25 -0
- outputs/checkpoints/checkpoint-180001/data_collator.joblib +3 -0
- outputs/checkpoints/checkpoint-180001/flax_model.msgpack +3 -0
- outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack +3 -0
- outputs/checkpoints/checkpoint-180001/training_args.joblib +3 -0
- outputs/checkpoints/checkpoint-180001/training_state.json +1 -0
- outputs/checkpoints/checkpoint-190001/config.json +25 -0
- outputs/checkpoints/checkpoint-190001/data_collator.joblib +3 -0
- outputs/checkpoints/checkpoint-190001/flax_model.msgpack +3 -0
- outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack +3 -0
- outputs/checkpoints/checkpoint-190001/training_args.joblib +3 -0
- outputs/checkpoints/checkpoint-190001/training_state.json +1 -0
- outputs/checkpoints/checkpoint-200001/config.json +25 -0
- outputs/checkpoints/checkpoint-200001/data_collator.joblib +3 -0
- outputs/checkpoints/checkpoint-200001/flax_model.msgpack +3 -0
- outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack +3 -0
- outputs/checkpoints/checkpoint-200001/training_args.joblib +3 -0
- outputs/checkpoints/checkpoint-200001/training_state.json +1 -0
- outputs/checkpoints/checkpoint-210001/config.json +25 -0
- outputs/checkpoints/checkpoint-210001/data_collator.joblib +3 -0
- outputs/checkpoints/checkpoint-210001/flax_model.msgpack +3 -0
- outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack +3 -0
- outputs/checkpoints/checkpoint-210001/training_args.joblib +3 -0
- outputs/checkpoints/checkpoint-210001/training_state.json +1 -0
- outputs/config.json +25 -0
- outputs/data_collator.joblib +3 -0
- outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 +3 -0
- outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 +3 -0
- outputs/flax_model.msgpack +3 -0
- outputs/optimizer_state.msgpack +3 -0
.gitattributes
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.log filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.wandb filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
run*.log
|
2 |
+
debug*.log
|
3 |
+
run*.wandb
|
4 |
+
wandb/
|
config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
configs/base/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
configs/base/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
configs/large/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 1024,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 4096,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 16,
|
18 |
+
"num_hidden_layers": 24,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
configs/large/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
convert.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
import tempfile
|
3 |
+
|
4 |
+
import jax
|
5 |
+
from jax import numpy as jnp
|
6 |
+
from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
|
7 |
+
|
8 |
+
|
9 |
+
def to_f32(t):
|
10 |
+
return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
|
11 |
+
|
12 |
+
|
13 |
+
def main():
|
14 |
+
# Saving extra files from config.json and tokenizer.json files
|
15 |
+
tokenizer = AutoTokenizer.from_pretrained("./")
|
16 |
+
tokenizer.save_pretrained("./")
|
17 |
+
|
18 |
+
# Temporary saving bfloat16 Flax model into float32
|
19 |
+
tmp = tempfile.mkdtemp()
|
20 |
+
flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
|
21 |
+
flax_model.params = to_f32(flax_model.params)
|
22 |
+
flax_model.save_pretrained(tmp)
|
23 |
+
# Converting float32 Flax to PyTorch
|
24 |
+
model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)
|
25 |
+
model.save_pretrained("./", save_config=False)
|
26 |
+
|
27 |
+
|
28 |
+
if __name__ == "__main__":
|
29 |
+
main()
|
flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
|
3 |
+
size 249750019
|
mc4/README.md
ADDED
@@ -0,0 +1,525 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
pretty_name: mC4
|
3 |
+
annotations_creators:
|
4 |
+
- no-annotation
|
5 |
+
language_creators:
|
6 |
+
- found
|
7 |
+
languages:
|
8 |
+
- af
|
9 |
+
- am
|
10 |
+
- ar
|
11 |
+
- az
|
12 |
+
- be
|
13 |
+
- bg
|
14 |
+
- bg-Latn
|
15 |
+
- bn
|
16 |
+
- ca
|
17 |
+
- ceb
|
18 |
+
- co
|
19 |
+
- cs
|
20 |
+
- cy
|
21 |
+
- da
|
22 |
+
- de
|
23 |
+
- el
|
24 |
+
- el-Latn
|
25 |
+
- en
|
26 |
+
- eo
|
27 |
+
- es
|
28 |
+
- et
|
29 |
+
- eu
|
30 |
+
- fa
|
31 |
+
- fi
|
32 |
+
- fil
|
33 |
+
- fr
|
34 |
+
- fy
|
35 |
+
- ga
|
36 |
+
- gd
|
37 |
+
- gl
|
38 |
+
- gu
|
39 |
+
- ha
|
40 |
+
- haw
|
41 |
+
- hi
|
42 |
+
- hi-Latn
|
43 |
+
- hmn
|
44 |
+
- ht
|
45 |
+
- hu
|
46 |
+
- hy
|
47 |
+
- id
|
48 |
+
- ig
|
49 |
+
- is
|
50 |
+
- it
|
51 |
+
- iw
|
52 |
+
- ja
|
53 |
+
- ja-Latn
|
54 |
+
- jv
|
55 |
+
- ka
|
56 |
+
- kk
|
57 |
+
- km
|
58 |
+
- kn
|
59 |
+
- ko
|
60 |
+
- ku
|
61 |
+
- ky
|
62 |
+
- la
|
63 |
+
- lb
|
64 |
+
- lo
|
65 |
+
- lt
|
66 |
+
- lv
|
67 |
+
- mg
|
68 |
+
- mi
|
69 |
+
- mk
|
70 |
+
- ml
|
71 |
+
- mn
|
72 |
+
- mr
|
73 |
+
- ms
|
74 |
+
- mt
|
75 |
+
- my
|
76 |
+
- ne
|
77 |
+
- nl
|
78 |
+
- "no"
|
79 |
+
- ny
|
80 |
+
- pa
|
81 |
+
- pl
|
82 |
+
- ps
|
83 |
+
- pt
|
84 |
+
- ro
|
85 |
+
- ru
|
86 |
+
- ru-Latn
|
87 |
+
- sd
|
88 |
+
- si
|
89 |
+
- sk
|
90 |
+
- sl
|
91 |
+
- sm
|
92 |
+
- sn
|
93 |
+
- so
|
94 |
+
- sq
|
95 |
+
- sr
|
96 |
+
- st
|
97 |
+
- su
|
98 |
+
- sv
|
99 |
+
- sw
|
100 |
+
- ta
|
101 |
+
- te
|
102 |
+
- tg
|
103 |
+
- th
|
104 |
+
- tr
|
105 |
+
- uk
|
106 |
+
- und
|
107 |
+
- ur
|
108 |
+
- uz
|
109 |
+
- vi
|
110 |
+
- xh
|
111 |
+
- yi
|
112 |
+
- yo
|
113 |
+
- zh
|
114 |
+
- zh-Latn
|
115 |
+
- zu
|
116 |
+
licenses:
|
117 |
+
- odc-by-1.0
|
118 |
+
multilinguality:
|
119 |
+
- multilingual
|
120 |
+
size_categories:
|
121 |
+
- n<1K
|
122 |
+
- 1K<n<10K
|
123 |
+
- 10K<n<100K
|
124 |
+
- 100K<n<1M
|
125 |
+
- 1M<n<10M
|
126 |
+
- 10M<n<100M
|
127 |
+
- 100M<n<1B
|
128 |
+
- 1B<n<10B
|
129 |
+
source_datasets:
|
130 |
+
- original
|
131 |
+
task_categories:
|
132 |
+
- sequence-modeling
|
133 |
+
task_ids:
|
134 |
+
- language-modeling
|
135 |
+
paperswithcode_id: mc4
|
136 |
+
---
|
137 |
+
|
138 |
+
# Dataset Card for mC4
|
139 |
+
|
140 |
+
## Table of Contents
|
141 |
+
|
142 |
+
- [Dataset Card for mC4](#dataset-card-for-mc4)
|
143 |
+
- [Table of Contents](#table-of-contents)
|
144 |
+
- [Dataset Description](#dataset-description)
|
145 |
+
- [Dataset Summary](#dataset-summary)
|
146 |
+
- [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
|
147 |
+
- [Languages](#languages)
|
148 |
+
- [Dataset Structure](#dataset-structure)
|
149 |
+
- [Data Instances](#data-instances)
|
150 |
+
- [Data Fields](#data-fields)
|
151 |
+
- [Data Splits](#data-splits)
|
152 |
+
- [Dataset Creation](#dataset-creation)
|
153 |
+
- [Curation Rationale](#curation-rationale)
|
154 |
+
- [Source Data](#source-data)
|
155 |
+
- [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
|
156 |
+
- [Who are the source language producers?](#who-are-the-source-language-producers)
|
157 |
+
- [Annotations](#annotations)
|
158 |
+
- [Annotation process](#annotation-process)
|
159 |
+
- [Who are the annotators?](#who-are-the-annotators)
|
160 |
+
- [Personal and Sensitive Information](#personal-and-sensitive-information)
|
161 |
+
- [Considerations for Using the Data](#considerations-for-using-the-data)
|
162 |
+
- [Social Impact of Dataset](#social-impact-of-dataset)
|
163 |
+
- [Discussion of Biases](#discussion-of-biases)
|
164 |
+
- [Other Known Limitations](#other-known-limitations)
|
165 |
+
- [Additional Information](#additional-information)
|
166 |
+
- [Dataset Curators](#dataset-curators)
|
167 |
+
- [Licensing Information](#licensing-information)
|
168 |
+
- [Citation Information](#citation-information)
|
169 |
+
- [Contributions](#contributions)
|
170 |
+
|
171 |
+
## Dataset Description
|
172 |
+
|
173 |
+
- **Homepage:** https://huggingface.co/datasets/allenai/c4
|
174 |
+
- **Paper:** https://arxiv.org/abs/1910.10683
|
175 |
+
|
176 |
+
### Dataset Summary
|
177 |
+
|
178 |
+
A colossal, cleaned, multilingual version of Common Crawl's web crawl corpus. Based on the Common Crawl dataset: https://commoncrawl.org.
|
179 |
+
|
180 |
+
This is the version prepared by AllenAI, hosted at this address: https://huggingface.co/datasets/allenai/c4
|
181 |
+
|
182 |
+
108 languages are available and are reported in the table below.
|
183 |
+
|
184 |
+
Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
|
185 |
+
|
186 |
+
| language code | language name |
|
187 |
+
|:----------------|:---------------------|
|
188 |
+
| af | Afrikaans |
|
189 |
+
| am | Amharic |
|
190 |
+
| ar | Arabic |
|
191 |
+
| az | Azerbaijani |
|
192 |
+
| be | Belarusian |
|
193 |
+
| bg | Bulgarian |
|
194 |
+
| bg-Latn | Bulgarian (Latin) |
|
195 |
+
| bn | Bangla |
|
196 |
+
| ca | Catalan |
|
197 |
+
| ceb | Cebuano |
|
198 |
+
| co | Corsican |
|
199 |
+
| cs | Czech |
|
200 |
+
| cy | Welsh |
|
201 |
+
| da | Danish |
|
202 |
+
| de | German |
|
203 |
+
| el | Greek |
|
204 |
+
| el-Latn | Greek (Latin) |
|
205 |
+
| en | English |
|
206 |
+
| eo | Esperanto |
|
207 |
+
| es | Spanish |
|
208 |
+
| et | Estonian |
|
209 |
+
| eu | Basque |
|
210 |
+
| fa | Persian |
|
211 |
+
| fi | Finnish |
|
212 |
+
| fil | Filipino |
|
213 |
+
| fr | French |
|
214 |
+
| fy | Western Frisian |
|
215 |
+
| ga | Irish |
|
216 |
+
| gd | Scottish Gaelic |
|
217 |
+
| gl | Galician |
|
218 |
+
| gu | Gujarati |
|
219 |
+
| ha | Hausa |
|
220 |
+
| haw | Hawaiian |
|
221 |
+
| hi | Hindi |
|
222 |
+
| hi-Latn | Hindi (Latin script) |
|
223 |
+
| hmn | Hmong, Mong |
|
224 |
+
| ht | Haitian |
|
225 |
+
| hu | Hungarian |
|
226 |
+
| hy | Armenian |
|
227 |
+
| id | Indonesian |
|
228 |
+
| ig | Igbo |
|
229 |
+
| is | Icelandic |
|
230 |
+
| it | Italian |
|
231 |
+
| iw | Hebrew (former code for `he`) |
|
232 |
+
| ja | Japanese |
|
233 |
+
| ja-Latn | Japanese (Latin) |
|
234 |
+
| jv | Javanese |
|
235 |
+
| ka | Georgian |
|
236 |
+
| kk | Kazakh |
|
237 |
+
| km | Khmer |
|
238 |
+
| kn | Kannada |
|
239 |
+
| ko | Korean |
|
240 |
+
| ku | Kurdish |
|
241 |
+
| ky | Kyrgyz |
|
242 |
+
| la | Latin |
|
243 |
+
| lb | Luxembourgish |
|
244 |
+
| lo | Lao |
|
245 |
+
| lt | Lithuanian |
|
246 |
+
| lv | Latvian |
|
247 |
+
| mg | Malagasy |
|
248 |
+
| mi | Maori |
|
249 |
+
| mk | Macedonian |
|
250 |
+
| ml | Malayalam |
|
251 |
+
| mn | Mongolian |
|
252 |
+
| mr | Marathi |
|
253 |
+
| ms | Malay |
|
254 |
+
| mt | Maltese |
|
255 |
+
| my | Burmese |
|
256 |
+
| ne | Nepali |
|
257 |
+
| nl | Dutch |
|
258 |
+
| no | Norwegian |
|
259 |
+
| ny | Nyanja |
|
260 |
+
| pa | Punjabi |
|
261 |
+
| pl | Polish |
|
262 |
+
| ps | Pashto |
|
263 |
+
| pt | Portuguese |
|
264 |
+
| ro | Romanian |
|
265 |
+
| ru | Russian |
|
266 |
+
| ru-Latn | Russian (Latin) |
|
267 |
+
| sd | Sindhi |
|
268 |
+
| si | Sinhala |
|
269 |
+
| sk | Slovak |
|
270 |
+
| sl | Slovenian |
|
271 |
+
| sm | Samoan |
|
272 |
+
| sn | Shona |
|
273 |
+
| so | Somali |
|
274 |
+
| sq | Albanian |
|
275 |
+
| sr | Serbian |
|
276 |
+
| st | Southern Sotho |
|
277 |
+
| su | Sundanese |
|
278 |
+
| sv | Swedish |
|
279 |
+
| sw | Swahili |
|
280 |
+
| ta | Tamil |
|
281 |
+
| te | Telugu |
|
282 |
+
| tg | Tajik |
|
283 |
+
| th | Thai |
|
284 |
+
| tr | Turkish |
|
285 |
+
| uk | Ukrainian |
|
286 |
+
| und | Unknown language |
|
287 |
+
| ur | Urdu |
|
288 |
+
| uz | Uzbek |
|
289 |
+
| vi | Vietnamese |
|
290 |
+
| xh | Xhosa |
|
291 |
+
| yi | Yiddish |
|
292 |
+
| yo | Yoruba |
|
293 |
+
| zh | Chinese |
|
294 |
+
| zh-Latn | Chinese (Latin) |
|
295 |
+
| zu | Zulu |
|
296 |
+
|
297 |
+
You can load the mC4 subset of any language like this:
|
298 |
+
|
299 |
+
```python
|
300 |
+
from datasets import load_dataset
|
301 |
+
|
302 |
+
en_mc4 = load_dataset("mc4", "en")
|
303 |
+
```
|
304 |
+
|
305 |
+
You can even specify a list of languages:
|
306 |
+
|
307 |
+
```python
|
308 |
+
from datasets import load_dataset
|
309 |
+
|
310 |
+
mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
|
311 |
+
```
|
312 |
+
|
313 |
+
### Supported Tasks and Leaderboards
|
314 |
+
|
315 |
+
mC4 is mainly intended to pretrain language models and word representations.
|
316 |
+
|
317 |
+
### Languages
|
318 |
+
|
319 |
+
The dataset supports 108 languages.
|
320 |
+
|
321 |
+
## Dataset Structure
|
322 |
+
|
323 |
+
### Data Instances
|
324 |
+
|
325 |
+
An example from the `en` config is:
|
326 |
+
|
327 |
+
```
|
328 |
+
{'timestamp': '2018-06-24T01:32:39Z',
|
329 |
+
'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
|
330 |
+
'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
|
331 |
+
```
|
332 |
+
|
333 |
+
### Data Fields
|
334 |
+
|
335 |
+
The data have several fields:
|
336 |
+
|
337 |
+
- `url`: url of the source as a string
|
338 |
+
- `text`: text content as a string
|
339 |
+
- `timestamp`: timestamp as a string
|
340 |
+
|
341 |
+
### Data Splits
|
342 |
+
|
343 |
+
To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
|
344 |
+
|
345 |
+
| config | train | validation |
|
346 |
+
|:---------|:--------|:-------------|
|
347 |
+
| af | ? | ? |
|
348 |
+
| am | ? | ? |
|
349 |
+
| ar | ? | ? |
|
350 |
+
| az | ? | ? |
|
351 |
+
| be | ? | ? |
|
352 |
+
| bg | ? | ? |
|
353 |
+
| bg-Latn | ? | ? |
|
354 |
+
| bn | ? | ? |
|
355 |
+
| ca | ? | ? |
|
356 |
+
| ceb | ? | ? |
|
357 |
+
| co | ? | ? |
|
358 |
+
| cs | ? | ? |
|
359 |
+
| cy | ? | ? |
|
360 |
+
| da | ? | ? |
|
361 |
+
| de | ? | ? |
|
362 |
+
| el | ? | ? |
|
363 |
+
| el-Latn | ? | ? |
|
364 |
+
| en | ? | ? |
|
365 |
+
| eo | ? | ? |
|
366 |
+
| es | ? | ? |
|
367 |
+
| et | ? | ? |
|
368 |
+
| eu | ? | ? |
|
369 |
+
| fa | ? | ? |
|
370 |
+
| fi | ? | ? |
|
371 |
+
| fil | ? | ? |
|
372 |
+
| fr | ? | ? |
|
373 |
+
| fy | ? | ? |
|
374 |
+
| ga | ? | ? |
|
375 |
+
| gd | ? | ? |
|
376 |
+
| gl | ? | ? |
|
377 |
+
| gu | ? | ? |
|
378 |
+
| ha | ? | ? |
|
379 |
+
| haw | ? | ? |
|
380 |
+
| hi | ? | ? |
|
381 |
+
| hi-Latn | ? | ? |
|
382 |
+
| hmn | ? | ? |
|
383 |
+
| ht | ? | ? |
|
384 |
+
| hu | ? | ? |
|
385 |
+
| hy | ? | ? |
|
386 |
+
| id | ? | ? |
|
387 |
+
| ig | ? | ? |
|
388 |
+
| is | ? | ? |
|
389 |
+
| it | ? | ? |
|
390 |
+
| iw | ? | ? |
|
391 |
+
| ja | ? | ? |
|
392 |
+
| ja-Latn | ? | ? |
|
393 |
+
| jv | ? | ? |
|
394 |
+
| ka | ? | ? |
|
395 |
+
| kk | ? | ? |
|
396 |
+
| km | ? | ? |
|
397 |
+
| kn | ? | ? |
|
398 |
+
| ko | ? | ? |
|
399 |
+
| ku | ? | ? |
|
400 |
+
| ky | ? | ? |
|
401 |
+
| la | ? | ? |
|
402 |
+
| lb | ? | ? |
|
403 |
+
| lo | ? | ? |
|
404 |
+
| lt | ? | ? |
|
405 |
+
| lv | ? | ? |
|
406 |
+
| mg | ? | ? |
|
407 |
+
| mi | ? | ? |
|
408 |
+
| mk | ? | ? |
|
409 |
+
| ml | ? | ? |
|
410 |
+
| mn | ? | ? |
|
411 |
+
| mr | ? | ? |
|
412 |
+
| ms | ? | ? |
|
413 |
+
| mt | ? | ? |
|
414 |
+
| my | ? | ? |
|
415 |
+
| ne | ? | ? |
|
416 |
+
| nl | ? | ? |
|
417 |
+
| no | ? | ? |
|
418 |
+
| ny | ? | ? |
|
419 |
+
| pa | ? | ? |
|
420 |
+
| pl | ? | ? |
|
421 |
+
| ps | ? | ? |
|
422 |
+
| pt | ? | ? |
|
423 |
+
| ro | ? | ? |
|
424 |
+
| ru | ? | ? |
|
425 |
+
| ru-Latn | ? | ? |
|
426 |
+
| sd | ? | ? |
|
427 |
+
| si | ? | ? |
|
428 |
+
| sk | ? | ? |
|
429 |
+
| sl | ? | ? |
|
430 |
+
| sm | ? | ? |
|
431 |
+
| sn | ? | ? |
|
432 |
+
| so | ? | ? |
|
433 |
+
| sq | ? | ? |
|
434 |
+
| sr | ? | ? |
|
435 |
+
| st | ? | ? |
|
436 |
+
| su | ? | ? |
|
437 |
+
| sv | ? | ? |
|
438 |
+
| sw | ? | ? |
|
439 |
+
| ta | ? | ? |
|
440 |
+
| te | ? | ? |
|
441 |
+
| tg | ? | ? |
|
442 |
+
| th | ? | ? |
|
443 |
+
| tr | ? | ? |
|
444 |
+
| uk | ? | ? |
|
445 |
+
| und | ? | ? |
|
446 |
+
| ur | ? | ? |
|
447 |
+
| uz | ? | ? |
|
448 |
+
| vi | ? | ? |
|
449 |
+
| xh | ? | ? |
|
450 |
+
| yi | ? | ? |
|
451 |
+
| yo | ? | ? |
|
452 |
+
| zh | ? | ? |
|
453 |
+
| zh-Latn | ? | ? |
|
454 |
+
| zu | ? | ? |
|
455 |
+
|
456 |
+
## Dataset Creation
|
457 |
+
|
458 |
+
### Curation Rationale
|
459 |
+
|
460 |
+
[More Information Needed]
|
461 |
+
|
462 |
+
### Source Data
|
463 |
+
|
464 |
+
#### Initial Data Collection and Normalization
|
465 |
+
|
466 |
+
[More Information Needed]
|
467 |
+
|
468 |
+
#### Who are the source language producers?
|
469 |
+
|
470 |
+
[More Information Needed]
|
471 |
+
|
472 |
+
### Annotations
|
473 |
+
|
474 |
+
#### Annotation process
|
475 |
+
|
476 |
+
[More Information Needed]
|
477 |
+
|
478 |
+
#### Who are the annotators?
|
479 |
+
|
480 |
+
[More Information Needed]
|
481 |
+
|
482 |
+
### Personal and Sensitive Information
|
483 |
+
|
484 |
+
[More Information Needed]
|
485 |
+
|
486 |
+
## Considerations for Using the Data
|
487 |
+
|
488 |
+
### Social Impact of Dataset
|
489 |
+
|
490 |
+
[More Information Needed]
|
491 |
+
|
492 |
+
### Discussion of Biases
|
493 |
+
|
494 |
+
[More Information Needed]
|
495 |
+
|
496 |
+
### Other Known Limitations
|
497 |
+
|
498 |
+
[More Information Needed]
|
499 |
+
|
500 |
+
## Additional Information
|
501 |
+
|
502 |
+
### Dataset Curators
|
503 |
+
|
504 |
+
[More Information Needed]
|
505 |
+
|
506 |
+
### Licensing Information
|
507 |
+
|
508 |
+
AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
|
509 |
+
|
510 |
+
### Citation Information
|
511 |
+
|
512 |
+
```
|
513 |
+
@article{2019t5,
|
514 |
+
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
|
515 |
+
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
|
516 |
+
journal = {arXiv e-prints},
|
517 |
+
year = {2019},
|
518 |
+
archivePrefix = {arXiv},
|
519 |
+
eprint = {1910.10683},
|
520 |
+
}
|
521 |
+
```
|
522 |
+
|
523 |
+
### Contributions
|
524 |
+
|
525 |
+
Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
|
mc4/dummy/af/0.0.0/dummy_data.zip
ADDED
Binary file (8.54 kB). View file
|
|
mc4/mc4.py
ADDED
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""mC4 dataset based on Common Crawl."""
|
2 |
+
|
3 |
+
|
4 |
+
import gzip
|
5 |
+
import json
|
6 |
+
|
7 |
+
import datasets
|
8 |
+
import kenlm
|
9 |
+
import numpy as np
|
10 |
+
from numpy.random import default_rng
|
11 |
+
|
12 |
+
|
13 |
+
logger = datasets.logging.get_logger(__name__)
|
14 |
+
|
15 |
+
|
16 |
+
_DESCRIPTION = """\
|
17 |
+
A colossal, cleaned version of Common Crawl's web crawl corpus.
|
18 |
+
|
19 |
+
Based on Common Crawl dataset: "https://commoncrawl.org".
|
20 |
+
|
21 |
+
This is the processed version of Google's mC4 dataset by AllenAI.
|
22 |
+
"""
|
23 |
+
|
24 |
+
_CITATION = """
|
25 |
+
@article{2019t5,
|
26 |
+
author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
|
27 |
+
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
|
28 |
+
journal = {arXiv e-prints},
|
29 |
+
year = {2019},
|
30 |
+
archivePrefix = {arXiv},
|
31 |
+
eprint = {1910.10683},
|
32 |
+
}
|
33 |
+
"""
|
34 |
+
|
35 |
+
_URL = "https://github.com/allenai/allennlp/discussions/5056"
|
36 |
+
|
37 |
+
_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
|
38 |
+
|
39 |
+
_LANGUAGES = [
|
40 |
+
"af",
|
41 |
+
"am",
|
42 |
+
"ar",
|
43 |
+
"az",
|
44 |
+
"be",
|
45 |
+
"bg",
|
46 |
+
"bg-Latn",
|
47 |
+
"bn",
|
48 |
+
"ca",
|
49 |
+
"ceb",
|
50 |
+
"co",
|
51 |
+
"cs",
|
52 |
+
"cy",
|
53 |
+
"da",
|
54 |
+
"de",
|
55 |
+
"el",
|
56 |
+
"el-Latn",
|
57 |
+
"en",
|
58 |
+
"eo",
|
59 |
+
"es",
|
60 |
+
"et",
|
61 |
+
"eu",
|
62 |
+
"fa",
|
63 |
+
"fi",
|
64 |
+
"fil",
|
65 |
+
"fr",
|
66 |
+
"fy",
|
67 |
+
"ga",
|
68 |
+
"gd",
|
69 |
+
"gl",
|
70 |
+
"gu",
|
71 |
+
"ha",
|
72 |
+
"haw",
|
73 |
+
"hi",
|
74 |
+
"hi-Latn",
|
75 |
+
"hmn",
|
76 |
+
"ht",
|
77 |
+
"hu",
|
78 |
+
"hy",
|
79 |
+
"id",
|
80 |
+
"ig",
|
81 |
+
"is",
|
82 |
+
"it",
|
83 |
+
"iw",
|
84 |
+
"ja",
|
85 |
+
"ja-Latn",
|
86 |
+
"jv",
|
87 |
+
"ka",
|
88 |
+
"kk",
|
89 |
+
"km",
|
90 |
+
"kn",
|
91 |
+
"ko",
|
92 |
+
"ku",
|
93 |
+
"ky",
|
94 |
+
"la",
|
95 |
+
"lb",
|
96 |
+
"lo",
|
97 |
+
"lt",
|
98 |
+
"lv",
|
99 |
+
"mg",
|
100 |
+
"mi",
|
101 |
+
"mk",
|
102 |
+
"ml",
|
103 |
+
"mn",
|
104 |
+
"mr",
|
105 |
+
"ms",
|
106 |
+
"mt",
|
107 |
+
"my",
|
108 |
+
"ne",
|
109 |
+
"nl",
|
110 |
+
"no",
|
111 |
+
"ny",
|
112 |
+
"pa",
|
113 |
+
"pl",
|
114 |
+
"ps",
|
115 |
+
"pt",
|
116 |
+
"ro",
|
117 |
+
"ru",
|
118 |
+
"ru-Latn",
|
119 |
+
"sd",
|
120 |
+
"si",
|
121 |
+
"sk",
|
122 |
+
"sl",
|
123 |
+
"sm",
|
124 |
+
"sn",
|
125 |
+
"so",
|
126 |
+
"sq",
|
127 |
+
"sr",
|
128 |
+
"st",
|
129 |
+
"su",
|
130 |
+
"sv",
|
131 |
+
"sw",
|
132 |
+
"ta",
|
133 |
+
"te",
|
134 |
+
"tg",
|
135 |
+
"th",
|
136 |
+
"tr",
|
137 |
+
"uk",
|
138 |
+
"und",
|
139 |
+
"ur",
|
140 |
+
"uz",
|
141 |
+
"vi",
|
142 |
+
"xh",
|
143 |
+
"yi",
|
144 |
+
"yo",
|
145 |
+
"zh",
|
146 |
+
"zh-Latn",
|
147 |
+
"zu",
|
148 |
+
]
|
149 |
+
|
150 |
+
_N_SHARDS_PER_SPLIT = {
|
151 |
+
"af": {"train": 64, "validation": 1},
|
152 |
+
"am": {"train": 16, "validation": 1},
|
153 |
+
"ar": {"train": 1024, "validation": 4},
|
154 |
+
"az": {"train": 256, "validation": 1},
|
155 |
+
"be": {"train": 128, "validation": 1},
|
156 |
+
"bg": {"train": 1024, "validation": 1},
|
157 |
+
"bg-Latn": {"train": 4, "validation": 1},
|
158 |
+
"bn": {"train": 512, "validation": 1},
|
159 |
+
"ca": {"train": 512, "validation": 1},
|
160 |
+
"ceb": {"train": 8, "validation": 1},
|
161 |
+
"co": {"train": 8, "validation": 1},
|
162 |
+
"cs": {"train": 1024, "validation": 2},
|
163 |
+
"cy": {"train": 256, "validation": 1},
|
164 |
+
"da": {"train": 1024, "validation": 1},
|
165 |
+
"de": {"train": 2048, "validation": 16},
|
166 |
+
"el": {"train": 1024, "validation": 2},
|
167 |
+
"el-Latn": {"train": 16, "validation": 1},
|
168 |
+
"en": {"train": 11264, "validation": 128},
|
169 |
+
"eo": {"train": 32, "validation": 1},
|
170 |
+
"es": {"train": 2048, "validation": 16},
|
171 |
+
"et": {"train": 256, "validation": 1},
|
172 |
+
"eu": {"train": 64, "validation": 1},
|
173 |
+
"fa": {"train": 1024, "validation": 2},
|
174 |
+
"fi": {"train": 1024, "validation": 1},
|
175 |
+
"fil": {"train": 64, "validation": 1},
|
176 |
+
"fr": {"train": 2048, "validation": 16},
|
177 |
+
"fy": {"train": 16, "validation": 1},
|
178 |
+
"ga": {"train": 16, "validation": 1},
|
179 |
+
"gd": {"train": 16, "validation": 1},
|
180 |
+
"gl": {"train": 128, "validation": 1},
|
181 |
+
"gu": {"train": 64, "validation": 1},
|
182 |
+
"ha": {"train": 8, "validation": 1},
|
183 |
+
"haw": {"train": 2, "validation": 1},
|
184 |
+
"hi": {"train": 1024, "validation": 2},
|
185 |
+
"hi-Latn": {"train": 16, "validation": 1},
|
186 |
+
"hmn": {"train": 8, "validation": 1},
|
187 |
+
"ht": {"train": 8, "validation": 1},
|
188 |
+
"hu": {"train": 1024, "validation": 2},
|
189 |
+
"hy": {"train": 128, "validation": 1},
|
190 |
+
"id": {"train": 1024, "validation": 4},
|
191 |
+
"ig": {"train": 4, "validation": 1},
|
192 |
+
"is": {"train": 128, "validation": 1},
|
193 |
+
"it": {"train": 1024, "validation": 8},
|
194 |
+
"iw": {"train": 1024, "validation": 1},
|
195 |
+
"ja": {"train": 1024, "validation": 8},
|
196 |
+
"ja-Latn": {"train": 8, "validation": 1},
|
197 |
+
"jv": {"train": 8, "validation": 1},
|
198 |
+
"ka": {"train": 256, "validation": 1},
|
199 |
+
"kk": {"train": 256, "validation": 1},
|
200 |
+
"km": {"train": 64, "validation": 1},
|
201 |
+
"kn": {"train": 64, "validation": 1},
|
202 |
+
"ko": {"train": 1024, "validation": 1},
|
203 |
+
"ku": {"train": 16, "validation": 1},
|
204 |
+
"ky": {"train": 64, "validation": 1},
|
205 |
+
"la": {"train": 64, "validation": 1},
|
206 |
+
"lb": {"train": 32, "validation": 1},
|
207 |
+
"lo": {"train": 8, "validation": 1},
|
208 |
+
"lt": {"train": 512, "validation": 1},
|
209 |
+
"lv": {"train": 256, "validation": 1},
|
210 |
+
"mg": {"train": 8, "validation": 1},
|
211 |
+
"mi": {"train": 4, "validation": 1},
|
212 |
+
"mk": {"train": 128, "validation": 1},
|
213 |
+
"ml": {"train": 128, "validation": 1},
|
214 |
+
"mn": {"train": 128, "validation": 1},
|
215 |
+
"mr": {"train": 1024, "validation": 1},
|
216 |
+
"ms": {"train": 512, "validation": 1},
|
217 |
+
"mt": {"train": 128, "validation": 1},
|
218 |
+
"my": {"train": 64, "validation": 1},
|
219 |
+
"ne": {"train": 256, "validation": 1},
|
220 |
+
"nl": {"train": 1024, "validation": 4},
|
221 |
+
"no": {"train": 1024, "validation": 1},
|
222 |
+
"ny": {"train": 4, "validation": 1},
|
223 |
+
"pa": {"train": 32, "validation": 1},
|
224 |
+
"pl": {"train": 1024, "validation": 4},
|
225 |
+
"ps": {"train": 16, "validation": 1},
|
226 |
+
"pt": {"train": 1024, "validation": 4},
|
227 |
+
"ro": {"train": 1024, "validation": 2},
|
228 |
+
"ru": {"train": 4096, "validation": 32},
|
229 |
+
"ru-Latn": {"train": 32, "validation": 1},
|
230 |
+
"sd": {"train": 64, "validation": 1},
|
231 |
+
"si": {"train": 64, "validation": 1},
|
232 |
+
"sk": {"train": 512, "validation": 1},
|
233 |
+
"sl": {"train": 256, "validation": 1},
|
234 |
+
"sm": {"train": 4, "validation": 1},
|
235 |
+
"sn": {"train": 8, "validation": 1},
|
236 |
+
"so": {"train": 64, "validation": 1},
|
237 |
+
"sq": {"train": 128, "validation": 1},
|
238 |
+
"sr": {"train": 256, "validation": 1},
|
239 |
+
"st": {"train": 2, "validation": 1},
|
240 |
+
"su": {"train": 4, "validation": 1},
|
241 |
+
"sv": {"train": 1024, "validation": 2},
|
242 |
+
"sw": {"train": 32, "validation": 1},
|
243 |
+
"ta": {"train": 256, "validation": 1},
|
244 |
+
"te": {"train": 128, "validation": 1},
|
245 |
+
"tg": {"train": 64, "validation": 1},
|
246 |
+
"th": {"train": 1024, "validation": 1},
|
247 |
+
"tr": {"train": 1024, "validation": 4},
|
248 |
+
"uk": {"train": 1024, "validation": 2},
|
249 |
+
"und": {"train": 3072, "validation": 32},
|
250 |
+
"ur": {"train": 128, "validation": 1},
|
251 |
+
"uz": {"train": 32, "validation": 1},
|
252 |
+
"vi": {"train": 1024, "validation": 4},
|
253 |
+
"xh": {"train": 2, "validation": 1},
|
254 |
+
"yi": {"train": 16, "validation": 1},
|
255 |
+
"yo": {"train": 2, "validation": 1},
|
256 |
+
"zh": {"train": 1024, "validation": 2},
|
257 |
+
"zh-Latn": {"train": 8, "validation": 1},
|
258 |
+
"zu": {"train": 8, "validation": 1},
|
259 |
+
}
|
260 |
+
|
261 |
+
|
262 |
+
class Mc4Config(datasets.BuilderConfig):
    """BuilderConfig for mC4."""

    def __init__(self, *args, languages, **kwargs):
        """BuilderConfig for mC4.

        The config name is derived from the languages by joining them with
        ``"+"`` (e.g. ``["en", "es"]`` -> ``"en+es"``).

        Args:
            languages (:obj:`List[str]`): list of languages to load. A single
                language given as a bare string is also accepted and treated
                as a one-element list.
            **kwargs: keyword arguments forwarded to super.
        """
        # A bare string would otherwise be joined character by character
        # ("es" -> "e+s") and later iterated per character by the builder.
        if isinstance(languages, str):
            languages = [languages]
        super().__init__(
            *args,
            name="+".join(languages),
            **kwargs,
        )
        self.languages = languages
|
277 |
+
|
278 |
+
|
279 |
+
class Mc4(datasets.GeneratorBasedBuilder):
    """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus.

    On top of the plain builder, this variant can subsample documents while
    reading the gzipped shards. The behaviour is driven by extra keyword
    arguments consumed in ``__init__``: ``sampling_method``,
    ``perplexity_model``, ``sampling_factor``, ``boundaries`` and ``seed``.
    Local files can be supplied through ``data_files`` instead of downloading.
    """

    BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
    BUILDER_CONFIG_CLASS = Mc4Config

    def __init__(self, *args, writer_batch_size=None, **kwargs):
        """Pop the sampling options from ``kwargs`` and set up the sampler.

        Args:
            writer_batch_size: forwarded unchanged to the parent builder.
            **kwargs: may contain ``data_files`` (dict with optional "train" /
                "validation" entries), ``sampling_method`` ("random",
                "gaussian", or any other truthy value, which selects the step
                sampler), ``perplexity_model`` (path handed to
                ``kenlm.Model``), ``sampling_factor``, ``boundaries`` and
                ``seed``. Everything else is forwarded to the parent builder.
        """
        self.data_files = kwargs.pop("data_files", {})
        self.sampling_method = kwargs.pop("sampling_method", None)
        self.perplexity_model = kwargs.pop("perplexity_model", None)
        self.sampling_factor = kwargs.pop("sampling_factor", None)
        self.boundaries = kwargs.pop("boundaries", None)
        self.seed = kwargs.pop("seed", None)
        if self.sampling_method:
            if self.seed is not None:
                self.rng = default_rng(self.seed)
            else:
                self.rng = default_rng()
            if self.sampling_method == "random":
                self.should_keep_doc = self._should_keep_doc_random
            else:
                # Loading 5-gram model
                # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
                logger.info("loading model = %s", self.perplexity_model)
                self.pp_model = kenlm.Model(self.perplexity_model)
                if self.sampling_method == "gaussian":
                    self.should_keep_doc = self._should_keep_doc_gaussian
                else:
                    self.should_keep_doc = self._should_keep_doc_step
        super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)

    def get_perplexity(self, doc):
        """Return the perplexity of ``doc`` under ``self.pp_model``.

        The document is scored line by line. Each line contributes its model
        score and ``number of whitespace-separated tokens + 1`` to the length
        (the +1 presumably accounts for the end-of-sentence token -- confirm
        against the KenLM scoring convention). Scores are treated as base-10
        log values, as implied by the ``10 **`` conversion below.
        """
        doc_log_score, doc_length = 0, 0
        for line in doc.split("\n"):
            doc_log_score += self.pp_model.score(line)
            doc_length += len(line.split()) + 1
        # doc_length is always >= 1: even an empty document yields one line.
        return 10.0 ** (-doc_log_score / doc_length)

    def _should_keep_doc_step(self, doc, factor=1.5e5, boundaries=None):
        """Step sampler: keep ``doc`` with probability ``factor / range``.

        ``range`` is the width of the perplexity band (delimited by the three
        ``boundaries``) the document falls into; the band above the last
        boundary uses ``10 * boundaries[2]``. A resulting probability >= 1
        means the document is always kept.

        Args:
            doc: document text.
            factor: numerator of the keep probability; ``None`` selects 1.5e5.
            boundaries: three increasing perplexity thresholds; ``None``
                selects the built-in values (presumably quartiles measured on
                a reference corpus -- confirm).
        """
        # Callers pass factor=self.sampling_factor, which is None unless the
        # user set it; an explicit None must fall back to the default.
        if factor is None:
            factor = 1.5e5
        if boundaries is None:
            boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
        perplexity = self.get_perplexity(doc)
        # The bands are contiguous: a perplexity exactly equal to
        # boundaries[1] belongs to the second band (it previously matched no
        # branch), while boundaries[2] itself stays in the top band.
        if perplexity <= boundaries[0]:
            quartile_range = boundaries[0]
        elif perplexity <= boundaries[1]:
            quartile_range = boundaries[1] - boundaries[0]
        elif perplexity < boundaries[2]:
            quartile_range = boundaries[2] - boundaries[1]
        else:
            quartile_range = 10 * boundaries[2]
        probability = factor / quartile_range
        return self.rng.uniform() < probability

    def _should_keep_doc_gaussian(self, doc, factor=0.78, boundaries=None):
        """Gaussian sampler: keep probability peaks at the middle boundary.

        The keep probability is ``factor * exp(-9/2 * ((p - m) / m) ** 2)``
        where ``p`` is the document perplexity and ``m`` is ``boundaries[1]``
        (or the built-in value when ``boundaries`` is ``None``).

        Args:
            doc: document text.
            factor: peak keep probability; ``None`` selects 0.78.
            boundaries: optional thresholds; only index 1 is used.
        """
        # Same None-handling as the other samplers (see caller).
        if factor is None:
            factor = 0.78
        perplexity = self.get_perplexity(doc)
        if boundaries is not None:
            m = boundaries[1]
        else:
            m = 662247.50212365
        exponential = np.exp(-9/2 * ((perplexity - m) / m) ** 2)
        weighted_perplexity = factor * exponential
        return self.rng.uniform() < weighted_perplexity

    def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
        """Uniform sampler: keep ``doc`` with probability ``factor``.

        ``doc`` and ``boundaries`` are ignored; they exist so all samplers
        share one call signature. ``factor`` defaults to 0.5.
        """
        if factor is None:
            factor = 0.5
        return self.rng.uniform() <= factor

    def _info(self):
        """Describe the dataset: three string features (text, timestamp, url)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "timestamp": datasets.Value("string"),
                    "url": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage=_URL,
            citation=_CITATION,
        )

    def _files_for_split(self, split, data_urls, dl_manager):
        """Return user-supplied files for ``split`` or download its shards."""
        if split in self.data_files:
            files = self.data_files[split]
            # Accept a single path as well as a list/tuple of paths.
            if not isinstance(files, (tuple, list)):
                files = [files]
            return files
        return dl_manager.download(data_urls[split])

    def _split_generators(self, dl_manager):
        """Build the train/validation split generators.

        One URL is generated per (language, shard index). Entries found in
        ``self.data_files`` take precedence over downloading.
        """
        data_urls = {}
        for split in ["train", "validation"]:
            data_urls[split] = [
                _DATA_URL.format(
                    # Use the individual language, not the config name: for a
                    # multi-language config the name is "en+es", which does
                    # not correspond to any shard path.
                    language=lang,
                    split_suffix="-validation" if split == "validation" else "",
                    index=index,
                    n_shards=_N_SHARDS_PER_SPLIT[lang][split],
                )
                for lang in self.config.languages
                for index in range(_N_SHARDS_PER_SPLIT[lang][split])
            ]
        train_downloaded_files = self._files_for_split("train", data_urls, dl_manager)
        validation_downloaded_files = self._files_for_split("validation", data_urls, dl_manager)
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
            ),
        ]

    def _generate_examples(self, filepaths):
        """This function returns the examples in the raw (text) form by iterating on all the files.

        Files whose name ends in "jsonl" are read as plain JSON Lines and are
        never subsampled. Every other file is read as gzip-compressed JSON
        Lines and, when a sampling method is configured, filtered through
        ``self.should_keep_doc``. Blank lines are skipped. Yields
        ``(id_, example)`` with a running id across all files.
        """
        id_ = 0
        for filepath in filepaths:
            logger.info("generating examples from = %s", filepath)
            if filepath.endswith("jsonl"):
                with open(filepath, "r", encoding="utf-8") as f:
                    for line in f:
                        # Lines keep their trailing newline, so test the
                        # stripped text to actually skip blank lines.
                        if line.strip():
                            example = json.loads(line)
                            yield id_, example
                            id_ += 1
            else:
                # Keep the raw handle in its own context manager: closing the
                # gzip wrapper does not close a file object passed to it.
                with open(filepath, "rb") as raw, gzip.open(raw, "rt", encoding="utf-8") as f:
                    if self.sampling_method:
                        logger.info("sampling method = %s", self.sampling_method)
                    for line in f:
                        if not line.strip():
                            continue
                        example = json.loads(line)
                        if self.sampling_method and not self.should_keep_doc(
                            example["text"],
                            factor=self.sampling_factor,
                            boundaries=self.boundaries,
                        ):
                            continue
                        yield id_, example
                        id_ += 1
|
mc4/mc4.py.lock
ADDED
File without changes
|
merges.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
outputs/checkpoints/checkpoint-170001/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/checkpoints/checkpoint-170001/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/checkpoints/checkpoint-170001/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82acddc9d2695672d6083ebf0bcde93b488f7aba953505236ac3f977e6f70284
|
3 |
+
size 249750019
|
outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f44ea5ea6a253566d5b813a6ddda12274ce966aa2da9b84c2886126f906f50ac
|
3 |
+
size 499500278
|
outputs/checkpoints/checkpoint-170001/training_args.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
|
3 |
+
size 1873
|
outputs/checkpoints/checkpoint-170001/training_state.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 170001}
|
outputs/checkpoints/checkpoint-180001/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/checkpoints/checkpoint-180001/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/checkpoints/checkpoint-180001/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d4cdfd93674fdb84b689fe6ce896b29a0afecb90e270f95cc718df2fca4b59b8
|
3 |
+
size 249750019
|
outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2576e99225ff5d89b059b8cb255049597f29bfd7c29a8a19ce83be9439aad468
|
3 |
+
size 499500278
|
outputs/checkpoints/checkpoint-180001/training_args.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
|
3 |
+
size 1873
|
outputs/checkpoints/checkpoint-180001/training_state.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 180001}
|
outputs/checkpoints/checkpoint-190001/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/checkpoints/checkpoint-190001/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/checkpoints/checkpoint-190001/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:101266d4390596691c16eca919169708aee4fa432bd7973bf51885daf6ea5b75
|
3 |
+
size 249750019
|
outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fe2b0c85e7f2988ae9d5ea7856586fe2486a5297592019dd384b01e2ba59e95b
|
3 |
+
size 499500278
|
outputs/checkpoints/checkpoint-190001/training_args.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
|
3 |
+
size 1873
|
outputs/checkpoints/checkpoint-190001/training_state.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 190001}
|
outputs/checkpoints/checkpoint-200001/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/checkpoints/checkpoint-200001/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/checkpoints/checkpoint-200001/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:88e78ae26ac4d3f4891d12ce9c6856b25907020e2aa4a3a833b95a37746d25c6
|
3 |
+
size 249750019
|
outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0f5c4c1ff0bae24abf15f6eeb6bedadc4cad96c37845d7f1d92f56959c671002
|
3 |
+
size 499500278
|
outputs/checkpoints/checkpoint-200001/training_args.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
|
3 |
+
size 1873
|
outputs/checkpoints/checkpoint-200001/training_state.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 200001}
|
outputs/checkpoints/checkpoint-210001/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/checkpoints/checkpoint-210001/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/checkpoints/checkpoint-210001/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
|
3 |
+
size 249750019
|
outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
|
3 |
+
size 499500278
|
outputs/checkpoints/checkpoint-210001/training_args.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
|
3 |
+
size 1873
|
outputs/checkpoints/checkpoint-210001/training_state.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"step": 210001}
|
outputs/config.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"RobertaForMaskedLM"
|
4 |
+
],
|
5 |
+
"attention_probs_dropout_prob": 0.1,
|
6 |
+
"bos_token_id": 0,
|
7 |
+
"eos_token_id": 2,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "roberta",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"transformers_version": "4.9.0.dev0",
|
22 |
+
"type_vocab_size": 1,
|
23 |
+
"use_cache": true,
|
24 |
+
"vocab_size": 50265
|
25 |
+
}
|
outputs/data_collator.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
|
3 |
+
size 1471394
|
outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dcf653e7864d18167096d734dbd860d15fbba06384015f694de1099fc39f95de
|
3 |
+
size 40
|
outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eede710c8b36371055e5299d3f6797af265dfcbd6b5034c4aff6d2ee6402d900
|
3 |
+
size 32408059
|
outputs/flax_model.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
|
3 |
+
size 249750019
|
outputs/optimizer_state.msgpack
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
|
3 |
+
size 499500278
|