Spaces:
Runtime error
Runtime error
dkoshman
committed on
Commit
•
41c9661
1
Parent(s):
e33424f
two line change
Browse files
- data_preprocessing.py +2 -2
data_preprocessing.py
CHANGED
@@ -88,7 +88,7 @@ class RandomTransformImage(object):
|
|
88 |
return image
|
89 |
|
90 |
|
91 |
-
def generate_tex_tokenizer(dataset):
|
92 |
"""Returns a tokeniser trained on tex strings from dataset"""
|
93 |
|
94 |
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="[UNK]"))
|
@@ -97,7 +97,7 @@ def generate_tex_tokenizer(dataset):
|
|
97 |
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
|
98 |
)
|
99 |
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
|
100 |
-
tokenizer.train_from_iterator(
|
101 |
tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
|
102 |
single="$A [SEP]",
|
103 |
special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))]
|
|
|
88 |
return image
|
89 |
|
90 |
|
91 |
+
def generate_tex_tokenizer(texs):
|
92 |
"""Returns a tokeniser trained on tex strings from dataset"""
|
93 |
|
94 |
tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="[UNK]"))
|
|
|
97 |
special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
|
98 |
)
|
99 |
tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
|
100 |
+
tokenizer.train_from_iterator(texs, trainer=tokenizer_trainer)
|
101 |
tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
|
102 |
single="$A [SEP]",
|
103 |
special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))]
|