ccasimiro committed on
Commit
cc67d3e
1 Parent(s): 0c26026

upload model

Browse files
Files changed (10) hide show
  1. README.md +0 -0
  2. args.json +24 -0
  3. config.json +25 -0
  4. dict.txt +0 -0
  5. merges.txt +0 -0
  6. process.log +8 -0
  7. pytorch_model.bin +3 -0
  8. special_tokens_map.json +1 -0
  9. tokenizer_config.json +1 -0
  10. vocab.json +0 -0
README.md ADDED
File without changes
args.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "custom_vocab_files": [
3
+ "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/corpora/bio/biomedical-clinical.txt"
4
+ ],
5
+ "vocab_name": "bio-biomedical-clinical-vocab-52k",
6
+ "tokenizer": "bbpe",
7
+ "lowercase": false,
8
+ "vocab_size": 52000,
9
+ "min_frequency": 10,
10
+ "extra_tokens": [],
11
+ "limit_alphabet": 1000,
12
+ "no_show_progress": false,
13
+ "strip_accents": false,
14
+ "no_handle_chinese_chars": false,
15
+ "no_clean_text": false,
16
+ "reserve_tokens": 0,
17
+ "use_tokenizers": false,
18
+ "no_fairseq": false,
19
+ "files": [
20
+ "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/corpora/bio/biomedical-clinical.txt"
21
+ ],
22
+ "output_root_path": "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f",
23
+ "commit_hash": "3a7116cf776527c411869becbe6fad8b9e3f5e56"
24
+ }
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.4.0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 52000
25
+ }
dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
process.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Executing train_tokenizer.py
2
+ ------------------------------
3
+ training bbpe tokenizer
4
+ Initialize an empty tokenizer
5
+ training
6
+ saving model tokenizer to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca
7
+ saving pretrained to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca
8
+ saving config to /home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0c8d7bb348e40327b7c38eb6995c74d5c64345bc4ab9b3deff58e7359f15f7
3
+ size 504420627
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true}}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": {"content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": true, "errors": "replace", "sep_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "cls_token": {"content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "max_len": 512, "special_tokens_map_file": null, "name_or_path": "/home/usuaris/veu/casimiro.pio.carrino/projects/corpus-utils-lm/output/model-ready_output/bio-biomedical-clinical-vocab-52k-2021-04-26-0955-3a71-240f/train_tokenizer_output/train-tokenizer-2021-04-26-1009-3a71-e9ca"}
vocab.json ADDED
The diff for this file is too large to render. See raw diff