Upload syllabletokenizer.py
syllabletokenizer.py  (ADDED, +88 -0)
@@ -0,0 +1,88 @@
from transformers import PreTrainedTokenizer
import json
import logging
import os

logger = logging.getLogger(__name__)


class SyllableTokenizer(PreTrainedTokenizer):
    def __init__(
        self,
        vocab_file,
        do_lower_case=False,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        eos_token="[EOS]",
        bos_token="[BOS]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        **kwargs
    ):
        # Load the vocabulary: a JSON mapping of token -> id.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)

        # Initialize special tokens.
        self.mask_token = mask_token
        self.sep_token = sep_token
        self.cls_token = cls_token
        self.pad_token = pad_token
        self.eos_token = eos_token
        self.bos_token = bos_token
        self.unk_token = unk_token

        # Reverse mapping (id -> token) used for decoding.
        self.ids_to_tokens = {id: token for token, id in self.vocab.items()}
        super().__init__(
            pad_token=self.pad_token,
            eos_token=self.eos_token,
            bos_token=self.bos_token,
            unk_token=self.unk_token,
            mask_token=self.mask_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    def _tokenize(self, text):
        # Collapse runs of whitespace, then split into individual characters (syllables).
        return list(" ".join(text.split()))

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (str) to a single string."""
        return "".join(tokens).strip()

    def save_vocabulary(self, vocab_path, filename_prefix=None):
        """
        Save the tokenizer vocabulary to a directory or file.

        Args:
            vocab_path (str): The directory (or file path) in which to save the vocabulary.
            filename_prefix (str, optional): A prefix to add to the saved vocabulary filename.

        Returns:
            Tuple[str]: Paths to the files saved.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_filename = "vocab.txt" if filename_prefix is None else f"{filename_prefix}_vocab.txt"
            vocab_file = os.path.join(vocab_path, vocab_filename)
        else:
            vocab_file = vocab_path

        # Write tokens one per line, ordered by their ids.
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive. "
                        "Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1

        return (vocab_file,)
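For reference, a minimal usage sketch. It assumes a hypothetical vocab.json next to the script containing a token-to-id mapping that already includes the special tokens; the sample Korean sentence is only illustrative, since the per-character split corresponds to syllable blocks in Hangul text.

# Minimal usage sketch; "vocab.json" is a hypothetical token -> id mapping
# that must include the special tokens ([PAD], [UNK], [BOS], [EOS], [MASK], ...).
import os

from syllabletokenizer import SyllableTokenizer

tokenizer = SyllableTokenizer(vocab_file="vocab.json")

# _tokenize splits on individual characters, so every character (Hangul
# syllable block, space, punctuation) becomes one token.
tokens = tokenizer.tokenize("안녕하세요 세계")
print(tokens)  # expected: ['안', '녕', '하', '세', '요', ' ', '세', '계']
print(tokenizer.convert_tokens_to_ids(tokens))

# Decoding simply concatenates tokens back into a string.
print(tokenizer.convert_tokens_to_string(tokens))

# Persist the vocabulary (written one token per line, ordered by id).
os.makedirs("saved_tokenizer", exist_ok=True)
tokenizer.save_vocabulary("saved_tokenizer")

Note that the vocabulary is loaded from JSON but saved as a plain one-token-per-line vocab.txt, so a round trip through save_vocabulary changes the on-disk format.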