File size: 5,148 Bytes
ab26e54 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
#!/usr/bin/env python3
import json
from typing import Iterator, List, Union
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
from tokenizers.implementations.base_tokenizer import BaseTokenizer
from tokenizers.models import Unigram
from tokenizers.processors import TemplateProcessing
class SentencePieceUnigramTokenizer(BaseTokenizer):
"""
This class is a copy of `DeDLOC's tokenizer implementation <https://github.com/yandex-research/DeDLOC/blob/main/sahajbert/tokenizer/tokenizer_model.py>`__ .
Custom SentencePiece Unigram Tokenizer with NMT, NKFC, spaces and lower-casing characters normalization
Represents the Unigram algorithm, with the pretokenization used by SentencePiece
"""
def __init__(
self,
replacement: str = "▁",
add_prefix_space: bool = True,
unk_token: Union[str, AddedToken] = "<unk>",
eos_token: Union[str, AddedToken] = "</s>",
pad_token: Union[str, AddedToken] = "<pad>",
):
self.special_tokens = {
"pad": {"id": 0, "token": pad_token},
"eos": {"id": 1, "token": eos_token},
"unk": {"id": 2, "token": unk_token},
}
self.special_tokens_list = [None] * len(self.special_tokens)
for token_dict in self.special_tokens.values():
self.special_tokens_list[token_dict["id"]] = token_dict["token"]
tokenizer = Tokenizer(Unigram())
# the following regexes are taken directly from https://github.com/aub-mind/arabert/blob/f92f06a29804f74878e2d1e39ea57fba8dcb0eac/preprocess.py
url = " [رابط] "
email = " [بريد] "
usr = " [مستخدم] "
url_regexes = [
r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
r"www[a-zA-Z0-9_\-?=%&/.~]+",
r"[a-zA-Z]+\.com",
r"(?=http)[^\s]+",
r"(?=www)[^\s]+",
r"://",
]
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
user_mention_regex = r"@[\w\d]+"
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Nmt(),
normalizers.NFKC(),
# remove links, emails, user mentions ans hashtags
*[normalizers.Replace(Regex(r), url) for r in url_regexes],
*[normalizers.Replace(Regex(r), email) for r in email_regexes],
normalizers.Replace(Regex(user_mention_regex), usr),
# remove html
normalizers.Replace(Regex("<br />"), " "),
normalizers.Replace(Regex("</?[^>]+>"), " "),
# remove extra white space
normalizers.Replace(Regex(" {2,}"), " "),
normalizers.Lowercase(),
]
)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
pre_tokenizers.Digits(individual_digits=True),
pre_tokenizers.Punctuation(),
]
)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.post_processor = TemplateProcessing(
single=f"$A {self.special_tokens['eos']['token']}",
special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
)
parameters = {
"model": "SentencePieceUnigram",
"replacement": replacement,
"add_prefix_space": add_prefix_space,
}
super().__init__(tokenizer, parameters)
def train(
self,
files: Union[str, List[str]],
vocab_size: int = 8000,
show_progress: bool = True,
):
"""Train the model using the given files"""
trainer = trainers.UnigramTrainer(
vocab_size=vocab_size,
special_tokens=self.special_tokens_list,
show_progress=show_progress,
)
if isinstance(files, str):
files = [files]
self._tokenizer.train(files, trainer=trainer)
self.add_unk_id()
def train_from_iterator(
self,
iterator: Union[Iterator[str], Iterator[Iterator[str]]],
vocab_size: int = 8000,
show_progress: bool = True,
):
"""Train the model using the given iterator"""
trainer = trainers.UnigramTrainer(
vocab_size=vocab_size,
special_tokens=self.special_tokens_list,
show_progress=show_progress,
)
self._tokenizer.train_from_iterator(iterator, trainer=trainer)
self.add_unk_id()
def add_unk_id(self):
tokenizer_json = json.loads(self._tokenizer.to_str())
tokenizer_json["model"]["unk_id"] = self.special_tokens["unk"]["id"]
self._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
|