Gnider commited on
Commit
404f3ce
1 Parent(s): 861f814

Upload 9 files

Browse files
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - ru
4
+ tags:
5
+ - PyTorch
6
+ - Transformers
7
+ thumbnail: "https://github.com/sberbank-ai/ru-gpts"
8
+ ---
9
+
10
+ # rugpt3small\_based\_on\_gpt2
11
+ The model architecture design, pretraining, and evaluation are documented in our preprint: [**A Family of Pretrained Transformer Language Models for Russian**](https://arxiv.org/abs/2309.10931).
12
+
13
+ The model was pretrained with sequence length 1024 using transformers by the [SberDevices](https://sberdevices.ru/) team on 80B tokens around 3 epochs. After that, the model was finetuned with the context size of 2048.
14
+
15
+ Total training time took around one week on 32 GPUs.
16
+
17
+ # Authors
18
+ + NLP core team RnD [Telegram channel](https://t.me/nlpcoreteam):
19
+ + Dmitry Zmitrovich
20
+
21
+
22
+ # Cite us
23
+ ```
24
+ @misc{zmitrovich2023family,
25
+ title={A Family of Pretrained Transformer Language Models for Russian},
26
+ author={Dmitry Zmitrovich and Alexander Abramov and Andrey Kalmykov and Maria Tikhonova and Ekaterina Taktasheva and Danil Astafurov and Mark Baushenko and Artem Snegirev and Tatiana Shavrina and Sergey Markov and Vladislav Mikhailov and Alena Fenogenova},
27
+ year={2023},
28
+ eprint={2309.10931},
29
+ archivePrefix={arXiv},
30
+ primaryClass={cs.CL}
31
+ }
32
+ ```
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 1,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "gradient_checkpointing": false,
11
+ "id2label": {
12
+ "0": "LABEL_0"
13
+ },
14
+ "initializer_range": 0.02,
15
+ "label2id": {
16
+ "LABEL_0": 0
17
+ },
18
+ "layer_norm_epsilon": 1e-05,
19
+ "model_type": "gpt2",
20
+ "n_ctx": 2048,
21
+ "n_embd": 768,
22
+ "n_head": 12,
23
+ "n_inner": null,
24
+ "n_layer": 12,
25
+ "n_positions": 2048,
26
+ "pad_token_id": 0,
27
+ "resid_pdrop": 0.1,
28
+ "summary_activation": null,
29
+ "summary_first_dropout": 0.1,
30
+ "summary_proj_to_labels": true,
31
+ "summary_type": "cls_index",
32
+ "summary_use_proj": true,
33
+ "use_cache": true,
34
+ "vocab_size": 50264
35
+ }
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:322182be41c143f2f8ecd8b9e76018d09ae5ca4c752b8858a257811bb4c5f5da
3
+ size 500931352
gitattributes ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d2a4a387b0430654bb1f6813436bd31f8e10a06e18ea0e5c41b33120b78458e
3
+ size 551290714
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": "<mask>",
17
+ "pad_token": {
18
+ "content": "<pad>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "unk_token": {
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<pad>",
7
+ "lstrip": false,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ }
45
+ },
46
+ "bos_token": "<s>",
47
+ "clean_up_tokenization_spaces": true,
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
+ "model_max_length": 2048,
52
+ "pad_token": "<pad>",
53
+ "padding_side": "left",
54
+ "tokenizer_class": "GPT2Tokenizer",
55
+ "truncation_side": "left",
56
+ "trust_remote_code": false,
57
+ "unk_token": "<unk>"
58
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff