Spaces:

SuperBigtoo
/

ThaiNewsClassify

Sleeping

App Files Files Community

ThaiNewsClassify / thai_tokenization.py

SuperBigtoo

commit

833997d about 1 year ago

raw

history blame

2.32 kB

	import collections
	import unicodedata
	import six

	def convert_to_unicode(text):
	    """Return `text` as a Unicode string, decoding byte input as UTF-8.

	    On Python 3 a `str` is returned unchanged and `bytes` are decoded.
	    On Python 2 a `str` (byte string) is decoded and `unicode` is returned
	    unchanged. Undecodable bytes are dropped (the "ignore" error handler).

	    Raises:
	        ValueError: if `text` is none of the accepted types for the running
	            interpreter, or if `six` reports neither Python 2 nor Python 3.
	    """
	    if six.PY3:
	        if isinstance(text, str):
	            return text
	        if isinstance(text, bytes):
	            return text.decode("utf-8", "ignore")
	    elif six.PY2:
	        if isinstance(text, str):
	            return text.decode("utf-8", "ignore")
	        # `unicode` only exists on Python 2; this branch is never reached on 3.
	        if isinstance(text, unicode):  # noqa: F821
	            return text
	    else:
	        raise ValueError("Not running on Python2 or Python 3?")
	    # Both interpreter branches fall through to here for unsupported types.
	    raise ValueError("Unsupported string type: %s" % (type(text)))

	def load_vocab(vocab_file):
	    """Load a vocabulary file into an ordered token -> index mapping.

	    The file is read as UTF-8 text, one entry per line. Only the first
	    whitespace-separated field of each line is used as the token, so
	    SentencePiece-style vocab lines (token followed by further columns)
	    are accepted. A line with no fields is stored under the empty string.
	    The index is the zero-based line number; if a token repeats, the later
	    line's index replaces the earlier one.

	    Args:
	        vocab_file: path of the vocabulary file to read.

	    Returns:
	        A `collections.OrderedDict` mapping each token to its line index.
	    """
	    vocab = collections.OrderedDict()
	    with open(vocab_file, "r", encoding='utf-8') as reader:
	        for index, line in enumerate(reader):
	            fields = line.split()
	            # Keep only the first column; fall back to the raw line when
	            # the line holds nothing but whitespace.
	            entry = fields[0] if fields else line
	            entry = convert_to_unicode(entry).strip()
	            vocab[entry] = index
	    return vocab

	#####

	from bert.bpe_helper import BPE
	import sentencepiece as spm

	def convert_by_vocab(vocab, items):
	    """Translate every element of `items` through the `vocab` mapping.

	    Args:
	        vocab: mapping used for the lookup (token -> id or id -> token).
	        items: iterable of keys to look up, in order.

	    Returns:
	        A list with `vocab[item]` for each item, in the same order.

	    Raises:
	        KeyError: if an item is not a key of `vocab`.
	    """
	    return [vocab[entry] for entry in items]

	class ThaiTokenizer(object):
	    """Tokenizer for Thai text backed by a BPE encoder and SentencePiece.

	    Attributes are set in `__init__`:
	      vocab      token -> id mapping loaded from the vocabulary file
	      inv_vocab  id -> token mapping, the inverse of `vocab`
	      bpe        `BPE` encoder built from the same vocabulary file
	      s          `SentencePieceProcessor` loaded from the model file
	    """

	    def __init__(self, vocab_file, spm_file):
	        """Load the vocabulary and both segmenters.

	        Args:
	            vocab_file: path of the vocabulary file (also given to `BPE`).
	            spm_file: path of the SentencePiece model file.
	        """
	        self.vocab = load_vocab(vocab_file)
	        self.inv_vocab = dict(
	            (index, token) for token, index in self.vocab.items()
	        )

	        self.bpe = BPE(vocab_file)
	        processor = spm.SentencePieceProcessor()
	        self.s = processor
	        processor.Load(spm_file)

	    def tokenize(self, text):
	        """Split `text` into tokens that are checked against the vocabulary.

	        The text is segmented twice, once by the BPE encoder and once by
	        SentencePiece; the segmentation with strictly fewer pieces is used,
	        and SentencePiece wins ties. A piece that starts with '_' and is not
	        in the vocabulary is emitted as a separate '_' followed by the
	        remainder. Anything still not in the vocabulary becomes '<unk>'.

	        Returns:
	            A list of token strings.
	        """
	        bpe_pieces = self.bpe.encode(text).split(' ')
	        spm_pieces = self.s.EncodeAsPieces(text)

	        if len(bpe_pieces) < len(spm_pieces):
	            pieces = bpe_pieces
	        else:
	            pieces = spm_pieces

	        result = []
	        for piece in pieces:
	            candidate = piece

	            # Unknown piece with a leading '_': emit the marker on its own
	            # and retry the lookup with the rest of the piece.
	            if piece.startswith('_') and piece not in self.vocab:
	                result.append('_')
	                candidate = piece[1:]

	            result.append(candidate if candidate in self.vocab else '<unk>')

	        return result

	    def convert_tokens_to_ids(self, tokens):
	        """Map each token to its id using `self.vocab`."""
	        return convert_by_vocab(self.vocab, tokens)

	    def convert_ids_to_tokens(self, ids):
	        """Map each id back to its token using `self.inv_vocab`."""
	        return convert_by_vocab(self.inv_vocab, ids)