fgaim committed
Commit b4e7ad6
1 Parent(s): 9082501

Update config and tokenizer

Files changed (7)
  1. README.md +25 -4
  2. config.json +2 -1
  3. merges.txt +0 -0
  4. special_tokens_map.json +1 -1
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +1 -1
  7. vocab.json +0 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
  language: ti
  widget:
- - text: "ዓቕሚ ደቀንስትዮ [MASK] ብግብሪ ተራእዩ"
+ - text: "ዓቕሚ መንእሰይ ኤርትራ <mask> ተራእዩ"
  ---
 
- # RoBERTa Pretrained for Tigrinya Language
+ # TiRoBERTa: RoBERTa Pretrained for the Tigrinya Language
 
  We pretrain a RoBERTa base model for Tigrinya on a dataset of 40 million tokens trained for 40 epochs.
 
- Contained in this repo are the original pretrained Flax model that was trained on a TPU v3.8 and it's correponding PyTorch version.
+ Contained in this repo is the original pretrained Flax model that was trained on a TPU v3-8 and its corresponding PyTorch version.
 
 
  ## Hyperparameters
@@ -17,6 +17,27 @@ The hyperparameters corresponding to model sizes mentioned above are as follows:
 
  | Model Size | L  | AH | HS  | FFN  | P    | Seq |
  |------------|----|----|-----|------|------|-----|
- | BASE       | 12 | 12 | 768 | 3072 | 125M | 128 |
+ | BASE       | 12 | 12 | 768 | 3072 | 125M | 512 |
 
  (L = number of layers; AH = number of attention heads; HS = hidden size; FFN = feedforward network dimension; P = number of parameters; Seq = maximum sequence length.)
+
+ ### Framework versions
+
+ - Transformers 4.12.0.dev0
+ - Pytorch 1.9.0+cu111
+ - Datasets 1.13.3
+ - Tokenizers 0.10.3
+
+
+ ## Citation
+
+ If you use this model in your product or research, please cite as follows:
+
+ ```
+ @article{Fitsum2021TiPLMs,
+   author={Fitsum Gaim and Wonsuk Yang and Jong C. Park},
+   title={Monolingual Pre-trained Language Models for Tigrinya},
+   year=2021,
+   publisher={WiNLP 2021 at EMNLP 2021}
+ }
+ ```
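The widget example in the updated README now uses RoBERTa's `<mask>` token instead of BERT-style `[MASK]`, which is what the fill-mask pipeline expects. Below is a minimal usage sketch; the model identifier is a placeholder, not the repository's confirmed name.

```python
# Minimal fill-mask sketch for the updated widget example.
# "fgaim/tiroberta-base" is a hypothetical identifier; substitute the actual
# Hub repository name or the path to a local clone of this repo.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="fgaim/tiroberta-base")

# RoBERTa tokenizers expect "<mask>" (not "[MASK]"), matching the new widget text.
predictions = fill_mask("ዓቕሚ መንእሰይ ኤርትራ <mask> ተራእዩ")
for p in predictions:
    print(p["token_str"], round(p["score"], 3))
```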
config.json CHANGED
@@ -5,6 +5,7 @@
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
+ "classifier_dropout": null,
   "eos_token_id": 2,
   "gradient_checkpointing": false,
   "hidden_act": "gelu",
@@ -20,7 +21,7 @@
   "pad_token_id": 1,
   "position_embedding_type": "absolute",
   "torch_dtype": "float32",
- "transformers_version": "4.9.0.dev0",
+ "transformers_version": "4.12.0.dev0",
   "type_vocab_size": 1,
   "use_cache": true,
   "vocab_size": 50265
merges.txt CHANGED
The diff for this file is too large to render. See raw diff
special_tokens_map.json CHANGED
@@ -1 +1 @@
- {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
tokenizer_config.json CHANGED
@@ -1 +1 @@
- {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "special_tokens_map_file": null, "name_or_path": "roberta-base-ti", "tokenizer_class": "RobertaTokenizer"}
+ {"errors": "replace", "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "RobertaTokenizer"}
vocab.json CHANGED
The diff for this file is too large to render. See raw diff