import collections
import unicodedata
import six
def convert_to_unicode(text):
  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text.decode("utf-8", "ignore")
    elif isinstance(text, unicode):
      return text
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python 2 or Python 3?")
def load_vocab(vocab_file):
  """Loads a vocabulary file into an OrderedDict mapping token -> id."""
  vocab = collections.OrderedDict()
  index = 0
  with open(vocab_file, "r", encoding="utf-8") as reader:
    while True:
      token = reader.readline()
      if token.split():
        # Keep only the first column to support SentencePiece vocab files.
        token = token.split()[0]
      token = convert_to_unicode(token)
      if not token:  # readline() returns an empty string at EOF
        break
      token = token.strip()
      vocab[token] = index
      index += 1
  return vocab
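

# --- Illustration only (not part of the original module) ---
# load_vocab accepts both BERT-style vocab files (one token per line) and
# SentencePiece ".vocab" files ("piece<TAB>score" per line), keeping only the
# first whitespace-separated column. The helper and file name below are
# hypothetical additions for this write-up; they are not called at import time.
def _demo_load_vocab(path="tiny.vocab"):
  with open(path, "w", encoding="utf-8") as f:
    f.write(u"[PAD]\t0\n[UNK]\t0\nสวัสดี\t-2.5\n")
  vocab = load_vocab(path)
  assert list(vocab.items())[:2] == [(u"[PAD]", 0), (u"[UNK]", 1)]
  return vocab
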
#####

from bert.bpe_helper import BPE
import sentencepiece as spm
def convert_by_vocab(vocab, items):
  """Converts a sequence of tokens or ids using the given vocab dict."""
  output = []
  for item in items:
    output.append(vocab[item])
  return output
class ThaiTokenizer(object):
  """Tokenizes Thai texts."""

  def __init__(self, vocab_file, spm_file):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    self.bpe = BPE(vocab_file)
    self.s = spm.SentencePieceProcessor()
    self.s.Load(spm_file)

  def tokenize(self, text):
    bpe_tokens = self.bpe.encode(text).split(' ')
    spm_tokens = self.s.EncodeAsPieces(text)
    # Prefer whichever segmentation yields fewer pieces.
    tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens
    split_tokens = []
    for token in tokens:
      new_token = token
      # Split off a leading word-boundary marker if the full piece is unknown.
      if token.startswith('_') and token not in self.vocab:
        split_tokens.append('_')
        new_token = token[1:]
      if new_token not in self.vocab:
        split_tokens.append('<unk>')
      else:
        split_tokens.append(new_token)
    return split_tokens
  def convert_tokens_to_ids(self, tokens):
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    return convert_by_vocab(self.inv_vocab, ids)