matejulcar committed
Commit 121120f
1 Parent(s): 6e2c345

first release

Files changed (5)
  1. README.md +20 -1
  2. config.json +25 -0
  3. pytorch_model.bin +3 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +24 -0
README.md CHANGED
@@ -1,3 +1,22 @@
  ---
- license: cc-by-sa-4.0
+ language:
+ - sl
+ - en
+ license: cc-by-sa-4.0
  ---
+
+ # SlEng-bert
+
+ SlEng-bert is a bilingual Slovene-English masked language model.
+
+ SlEng-bert was trained from scratch on conversational, non-standard, and slang language in Slovene and English.
+ The model has 12 transformer layers and is roughly equal in size to the BERT and RoBERTa base models. The only pre-training task was masked language modeling; no auxiliary tasks (such as next-sentence prediction) were used.
+
+ The tokenizer and the corpora used to train SlEng-bert were also used to train the [SloBERTa-SlEng](https://huggingface.co/cjvt/sloberta-sleng) model.
+ The difference between the two: SlEng-bert was trained from scratch for 40 epochs, while SloBERTa-SlEng is SloBERTa further pre-trained for 2 epochs on the new corpora.
+
+ ## Training corpora
+
+ The model was trained on English and Slovene tweets, the Slovene corpora [MaCoCu](http://hdl.handle.net/11356/1517) and [Frenk](http://hdl.handle.net/11356/1201),
+ and a small subset of the English [Oscar](https://huggingface.co/datasets/oscar) corpus. We kept the sizes of the English and Slovene corpora as close to equal as possible.
+ In total, the training corpora contained about 2.7 billion words.
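Since SlEng-bert is a plain masked language model, it can be exercised end to end with the `fill-mask` pipeline once the commit is pushed. A minimal sketch, assuming the hypothetical repository id `cjvt/sleng-bert` (substitute the actual id this commit lives under):

```python
from transformers import pipeline

# "cjvt/sleng-bert" is an assumed repository id; replace it with the real one.
fill_mask = pipeline("fill-mask", model="cjvt/sleng-bert")

# The tokenizer's mask token is <mask> (see tokenizer_config.json below).
for prediction in fill_mask("Ljubljana je glavno <mask> Slovenije."):
    print(prediction["token_str"], round(prediction["score"], 3))
```

Because the model is bilingual, the same call works unchanged for English prompts.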
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "CamembertForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "camembert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.6.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 40005
+ }
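The config declares `CamembertForMaskedLM`, so the checkpoint loads through the Camembert (RoBERTa-style) model classes. A minimal sketch of rebuilding the bare architecture from this file alone, useful for sanity-checking its size (the local file path is illustrative):

```python
from transformers import CamembertConfig, CamembertForMaskedLM

# Read the hyperparameters committed above: 12 layers, hidden size 768,
# 12 attention heads, and a 40005-token vocabulary.
config = CamembertConfig.from_json_file("config.json")

# Randomly initialised model with the same shape as pytorch_model.bin.
model = CamembertForMaskedLM(config)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```

The resulting parameter count lands in the same ballpark as BERT/RoBERTa base, consistent with the README's size claim.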
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a7b1b3e94569c1da0d4409daa6b38daf048bc1feb149845b97eead1a8fc777e
+ size 467443364
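The commit itself stores only this Git LFS pointer; the roughly 467 MB of weights live in LFS storage. A small sketch for checking that a downloaded `pytorch_model.bin` matches the pointer's `oid` and `size` (the file path is illustrative):

```python
import hashlib
from pathlib import Path

def lfs_sha256(path: Path) -> str:
    """Stream the file through SHA-256 so large weights never sit in memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

path = Path("pytorch_model.bin")
print(path.stat().st_size)  # should be 467443364
print(lfs_sha256(path))     # should match the oid sha256 in the pointer
```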
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "additional_special_tokens": [
+     "<s>NOTUSED",
+     "</s>NOTUSED"
+   ],
+   "bos_token": "<s>",
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "name_or_path": "./slen-from-sloberta",
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": null,
+   "tokenizer_class": "CamembertTokenizer",
+   "unk_token": "<unk>"
+ }
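The tokenizer follows the Camembert convention: RoBERTa-style special tokens plus a `<mask>` token declared with `lstrip: true`, so the mask absorbs the space before it. A minimal sketch of loading the fast tokenizer from a local checkout of this commit and inspecting that behaviour (the local path is illustrative):

```python
from transformers import AutoTokenizer

# Picks up tokenizer.json and tokenizer_config.json from the current directory.
tokenizer = AutoTokenizer.from_pretrained(".")

# lstrip=True on <mask> means "zelo <mask>" tokenizes without a stray
# whitespace token left in front of the mask.
text = f"To je zelo {tokenizer.mask_token} model."
encoding = tokenizer(text)
print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
```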