ccasimiro committed on
Commit 7ab93ed
1 Parent(s): 9635fcc

upload model

Files changed (9)
  1. args.json +17 -0
  2. config.json +25 -0
  3. dict.txt +0 -0
  4. merges.txt +0 -0
  5. process.log +1 -0
  6. pytorch_model.bin +3 -0
  7. special_tokens_map.json +1 -0
  8. tokenizer_config.json +1 -0
  9. vocab.json +0 -0
args.json ADDED
@@ -0,0 +1,17 @@
+ {
+ "output_root": "/gpfs/projects/bsc88/corpus-utils-lm/23-12-2020-72f8c7e/output/model-ready_output/2020-12-23-1900-daf4-ab38",
+ "files": "/gpfs/projects/bsc88/corpus-utils-lm/23-12-2020-72f8c7e/output/model-ready_output/2020-12-23-1900-daf4-ab38/train_valid_test_split_output/2020-12-23-1905-daf4-a0e0/train.txt",
+ "vocab_name": "roberta-ca",
+ "clean_text": true,
+ "handle_chinese_chars": true,
+ "strip_accents": false,
+ "lowercase": false,
+ "vocab_size": 52000,
+ "limit_alphabet": 1000,
+ "show_progress": true,
+ "min_frequency": 2,
+ "extra_tokens": [],
+ "reserve_tokens": 0,
+ "tokenizer": "bbpe",
+ "commit_hash": "daf4d660ec8a4b28d2bc29b3063779100ab85796\n"
+ }
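These arguments record how the byte-level BPE ("bbpe") tokenizer behind this model was trained. The training script itself is not part of this commit, so the following is only a minimal sketch, assuming the Hugging Face tokenizers library, of how the settings could map onto such a run; the special-token list is an assumption based on special_tokens_map.json below, and options such as "clean_text", "handle_chinese_chars", "strip_accents" and "limit_alphabet" are WordPiece-style normalization knobs that have no direct byte-level BPE equivalent.

# Minimal sketch, not the actual training script from this repository.
from tokenizers import ByteLevelBPETokenizer

# "lowercase": false in args.json, so the text keeps its original casing.
tokenizer = ByteLevelBPETokenizer(lowercase=False)

tokenizer.train(
    files=["/gpfs/projects/bsc88/corpus-utils-lm/23-12-2020-72f8c7e/output/model-ready_output/2020-12-23-1900-daf4-ab38/train_valid_test_split_output/2020-12-23-1905-daf4-a0e0/train.txt"],
    vocab_size=52000,      # "vocab_size"
    min_frequency=2,       # "min_frequency"
    show_progress=True,    # "show_progress"
    # Assumed special tokens, matching the RoBERTa tokenizer files below.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Writes the vocabulary and merges files (vocab.json and merges.txt in this commit).
tokenizer.save_model(".", "roberta-ca")  # "vocab_name": "roberta-ca"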
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+ "architectures": [
+ "RobertaForMaskedLM"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "bos_token_id": 0,
+ "eos_token_id": 2,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 514,
+ "model_type": "roberta",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 1,
+ "position_embedding_type": "absolute",
+ "transformers_version": "4.4.0",
+ "type_vocab_size": 1,
+ "use_cache": true,
+ "vocab_size": 52000
+ }
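config.json describes a standard RoBERTa-base layout (12 hidden layers, 12 attention heads, hidden size 768, intermediate size 3072) with the 52,000-entry vocabulary produced by the tokenizer above. A minimal sketch, assuming transformers >= 4.4.0 and that this file is available locally, of turning the config into a model object:

# Minimal sketch; builds the architecture only, with randomly initialised weights.
from transformers import RobertaConfig, RobertaForMaskedLM

config = RobertaConfig.from_json_file("config.json")
model = RobertaForMaskedLM(config)
print(model.num_parameters())  # roughly 126 million parameters for this layout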
dict.txt ADDED
The diff for this file is too large to render. See raw diff
merges.txt ADDED
The diff for this file is too large to render. See raw diff
process.log ADDED
@@ -0,0 +1 @@
+ INFO:root:Function "train_tokenizer" took 306.3926444167737 seconds to complete.
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d720b1dddaef37080df8761bea199e3a307cd86cdd261fe3430a674579118f21
+ size 504420627
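pytorch_model.bin is stored through Git LFS, so the diff only shows the pointer file: the actual object is identified by its SHA-256 hash and is about 504 MB, consistent with roughly 126 million float32 parameters for the config above. A minimal sketch, assuming the LFS object has been fetched into a local clone of this repository, of loading the pretrained weights:

# Minimal sketch; "." is a local clone containing config.json and pytorch_model.bin.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM.from_pretrained(".")
model.eval()  # switch to inference mode (disables dropout)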
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/gpfs/projects/bsc88/corpus-utils-lm/23-12-2020-72f8c7e/output/model-ready_output/2020-12-23-1900-daf4-ab38/train_tokenizer_output/2020-12-23-1913-daf4-ed9c"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff