Robert Gale committed on
Commit
7a67698
1 Parent(s): 76ba67c

Initial commit

README.md ADDED
@@ -0,0 +1,34 @@
+ # BORT
+
+ BORT is a pretrained LLM that is designed to accept a mixture of English phonemes (in IPA) and orthography, made with clinical language evaluation tasks in mind. From the paper:
+
+ > Robert Gale, Alexandra C. Salem, Gerasimos Fergadiotis, and Steven Bedrick. 2023. **Mixed Orthographic/Phonemic Language Modeling: Beyond Orthographically Restricted Transformers (BORT).** In Proceedings of the 8th Workshop on Representation Learning for NLP (RepL4NLP-2023), pages TBD, Online. Association for Computational Linguistics.
+
+ ## Limitations
+
+ The models presented here were trained with the basic inventory of English phonemes found in CMUDict. However, a more fine-grained phonetic analysis would require a pronunciation dictionary with more narrowly defined entries. Additionally, while this paper focused on models trained with English-only resources (pre-trained BART-BASE, English Wikipedia text, CMUDict, and the English AphasiaBank), the techniques should be applicable to non-English language models as well. Finally, from a clinical standpoint, the model we describe in this paper assumes the existence of transcribed input (from either a manual or automated source, discussed in detail in §2.1 of the paper); in its current form, this represents a limitation to its clinical implementation, though not to its use in research settings with archival or newly-transcribed datasets.
+
+ ## Ethics Statement
+
+ Our use of the AphasiaBank data was governed by the TalkBank consortium's data use agreement, and the underlying recordings were collected and shared with approval of the contributing sites' institutional review boards.
+ Limitations exist regarding accents and dialect, which in turn would affect the scenarios in which a system based on our model could (and should) be used.
+ It should also be noted that these models and any derived technology are not meant to be tools to diagnose medical conditions, a task best left to qualified clinicians.
+
+ ## Pre-trained Model Variants
+
+ - **BORT-PR** (upload ETA ≤ ACL 2023)
+ - **BORT-SP** (upload ETA ≤ ACL 2023)
+ - **BORT-PR-SP** (upload ETA ≤ ACL 2023)
+ - **BORT-PR-NOISY** (upload ETA ≤ ACL 2023)
+ - **BORT-SP-NOISY** (upload ETA ≤ ACL 2023)
+ - **BORT-PR-SP-NOISY** (upload ETA ≤ ACL 2023)
+
+ ## Wikipedia Dataset Used in Pre-Training
+
+ The BPE-tokenized version of the dataset, including metadata used in word transforms.
+
+ - **Dataset** (upload ETA ≤ ACL 2023)
+
+ ## Acknowledgements
+
+ This work was supported by the National Institute on Deafness and Other Communication Disorders of the National Institutes of Health under award 5R01DC015999 (Principal Investigators: Bedrick & Fergadiotis). The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health.
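A minimal usage sketch, assuming the Hub repo id `palat/bort` recorded in `config.json` below and the `BartForConditionalGeneration` / `BartTokenizer` classes it names; the mixed phonemic/orthographic input format itself is defined in the paper, so this only demonstrates ordinary orthographic mask infilling:

```python
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the default checkpoint; "palat/bort" is the path recorded in config.json.
tokenizer = BartTokenizer.from_pretrained("palat/bort")
model = BartForConditionalGeneration.from_pretrained("palat/bort")

# Plain orthographic mask infilling, as with any BART-style checkpoint.
# Interleaving IPA phonemes with orthography follows the scheme in the paper.
inputs = tokenizer("The patient pointed to the <mask> on the table.", return_tensors="pt")
output_ids = model.generate(**inputs, num_beams=4, max_length=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```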
config.json ADDED
@@ -0,0 +1,47 @@
+ {
+   "_name_or_path": "palat/bort",
+   "activation_dropout": 0.0,
+   "activation_function": "gelu",
+   "architectures": [
+     "BartForConditionalGeneration"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": 0.0,
+   "d_model": 768,
+   "decoder_attention_heads": 12,
+   "decoder_ffn_dim": 3072,
+   "decoder_layerdrop": 0,
+   "decoder_layers": 6,
+   "decoder_start_token_id": 2,
+   "dropout": 0.1,
+   "embedding_dim": 768,
+   "encoder_attention_heads": 12,
+   "encoder_ffn_dim": 3072,
+   "encoder_layerdrop": 0,
+   "encoder_layers": 6,
+   "eos_token_id": 2,
+   "forced_bos_token_id": 0,
+   "forced_eos_token_id": 2,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "init_std": 0.02,
+   "is_encoder_decoder": true,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "max_position_embeddings": 1024,
+   "model_type": "bart",
+   "num_hidden_layers": 6,
+   "pad_token_id": 1,
+   "scale_embedding": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.30.2",
+   "use_cache": true,
+   "vocab_size": 51201
+ }
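The hyperparameters above follow the BART-base layout (6 encoder and 6 decoder layers, `d_model` 768, 12 attention heads), with a vocabulary of 51,201 entries versus the 50,265 of stock `facebook/bart-base`, presumably to accommodate the added phoneme tokens. A quick way to confirm these values without downloading the weights (a sketch, assuming the `palat/bort` repo id):

```python
from transformers import BartConfig

# Fetch only the published config.json, not the weights.
config = BartConfig.from_pretrained("palat/bort")

print(config.encoder_layers, config.decoder_layers)    # 6 6
print(config.d_model, config.encoder_attention_heads)  # 768 12
print(config.vocab_size)                               # 51201
```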
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "decoder_start_token_id": 2,
+   "eos_token_id": 2,
+   "forced_bos_token_id": 0,
+   "forced_eos_token_id": 2,
+   "pad_token_id": 1,
+   "transformers_version": "4.30.2"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37cb75c11b8fb7d52124dfefe8ee3ec966ef18a79e0630a01abc80ecaa97b753
+ size 560644462
pytorch_model.bort-pr-sp-noisy.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:37cb75c11b8fb7d52124dfefe8ee3ec966ef18a79e0630a01abc80ecaa97b753
+ size 560644462
pytorch_model.bort-pr-sp.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16b039a4206a21be7d158757a775719f4ed5c3503c4caac2ac1e339ec824874d
+ size 560642896
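The three weight files above are Git LFS pointer stubs; the actual binaries are fetched by the Hub client. Their names follow the `pytorch_model.<variant>.bin` pattern, and the default `pytorch_model.bin` shares its checksum and size with `pytorch_model.bort-pr-sp-noisy.bin`, so the unqualified load appears to return that variant. Assuming the repo uses the `transformers` `variant` weight-naming convention (an assumption, not documented here), a specific checkpoint such as BORT-PR-SP could be selected like this:

```python
from transformers import BartForConditionalGeneration

# Assumption: pytorch_model.bort-pr-sp.bin follows the `variant`
# naming convention pytorch_model.<variant>.bin used by transformers.
model = BartForConditionalGeneration.from_pretrained(
    "palat/bort", variant="bort-pr-sp"
)
```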
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b49c5661e2a0c18ea2b355de4b640b0e23d7457ea6cf8566613e74910dbd1a02
+ size 561051436
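A TensorFlow export (`tf_model.h5`) ships alongside the PyTorch weights, so the same checkpoint should also load through the TF class (a sketch; assumes a TensorFlow installation):

```python
from transformers import TFBartForConditionalGeneration

# Reads tf_model.h5 from the repo instead of the PyTorch .bin files.
tf_model = TFBartForConditionalGeneration.from_pretrained("palat/bort")
```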
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": {
+     "__type": "AddedToken",
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "mask_token": {
+     "__type": "AddedToken",
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "model_max_length": 1024,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "__type": "AddedToken",
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "BartTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff