dkoshman committed on
Commit: 41c9661
Parent(s): e33424f

two line change

Files changed (1)
  1. data_preprocessing.py +2 -2
data_preprocessing.py CHANGED
@@ -88,7 +88,7 @@ class RandomTransformImage(object):
         return image
 
 
-def generate_tex_tokenizer(dataset):
+def generate_tex_tokenizer(texs):
     """Returns a tokeniser trained on tex strings from dataset"""
 
     tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE(unk_token="[UNK]"))
@@ -97,7 +97,7 @@ def generate_tex_tokenizer(dataset):
         special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
     )
     tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
-    tokenizer.train_from_iterator((item['tex'] for item in dataset), trainer=tokenizer_trainer)
+    tokenizer.train_from_iterator(texs, trainer=tokenizer_trainer)
     tokenizer.post_processor = tokenizers.processors.TemplateProcessing(
         single="$A [SEP]",
         special_tokens=[("[SEP]", tokenizer.token_to_id("[SEP]"))]