bert-tiny-char-ctc-bak-denoise / char_tokenizer.py
cointegrated's picture
Create char_tokenizer.py
b9a4929
"""
Mostly copypasted from
https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/char_tokenizer.py
with Apache 2.0 license
"""
import os
from typing import Optional, Tuple, List
from collections import OrderedDict
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer, AutoTokenizer
def load_vocab(vocab_file):
vocab = OrderedDict()
with open(vocab_file, "r", encoding="utf-8") as reader:
tokens = reader.readlines()
for index, token in enumerate(tokens):
token = token.rstrip("\n")
vocab[token] = index
return vocab
class CharTokenizer(PreTrainedTokenizer):
vocab_files_names = {"vocab_file": "vocab.txt"}
def __init__(
self,
vocab_file=None,
pad_token="[pad]",
unk_token="[unk]",
bos_token="[bos]",
eos_token="[eos]",
cls_token="[cls]",
sep_token="[sep]",
mask_token="[mask]",
space_token="▁",
do_lower_case=False,
*args,
**kwargs
):
super().__init__(
pad_token=pad_token,
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
cls_token=cls_token,
mask_token=mask_token,
do_lower_case=do_lower_case,
**kwargs
)
self.do_lower_case = do_lower_case
self.space_token = space_token
if not vocab_file or not os.path.isfile(vocab_file):
self.vocab = OrderedDict()
self.ids_to_tokens = OrderedDict()
else:
self.vocab = load_vocab(vocab_file)
self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
def train(self, file_path):
vocab = set()
with open(file_path) as r:
for line in r:
word = line.strip()
if self.do_lower_case:
word = word.lower()
vocab |= set(word)
vocab = list(vocab)
vocab.sort()
special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token]
vocab = special_tokens + vocab
for i, ch in enumerate(vocab):
self.vocab[ch] = i
self.ids_to_tokens = vocab
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return self.vocab
def _convert_token_to_id(self, token):
if self.do_lower_case:
token = token.lower()
return self.vocab.get(token, self.vocab[self.unk_token])
def _convert_id_to_token(self, index):
return self.ids_to_tokens[index]
def prepare_for_tokenization(
self, text, is_split_into_words: bool = False, spaces=0, **kwargs
):
if spaces:
pad = self.space_token * spaces
text = pad + pad.join(text) + pad
return (text, kwargs)
def _tokenize(self, text, spaces=0):
if self.do_lower_case:
text = text.lower()
return list(text)
def convert_tokens_to_string(self, tokens):
return "".join(tokens)
def build_inputs_with_special_tokens(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None
) -> List[int]:
bos = [self.bos_token_id]
eos = [self.eos_token_id]
return bos + token_ids_0 + eos
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None
) -> List[int]:
return [1] + ([0] * len(token_ids_0)) + [1]
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None
) -> List[int]:
return (len(token_ids_0) + 2) * [0]
def save_vocabulary(
self,
save_directory: str,
filename_prefix: Optional[str] = None
) -> Tuple[str]:
assert os.path.isdir(save_directory)
vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
self.vocab_files_names["vocab_file"]
)
index = 0
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
assert index == token_index
writer.write(token + "\n")
index += 1
return (vocab_file,)
def clean_up_tokenization(self, text, space='▁'):
res = []
prev = space
for c in text:
if c != prev and c != space:
res.append(c)
prev = c
return ''.join(res)
AutoTokenizer.register("char_tokenizer", CharTokenizer)