File size: 2,324 Bytes
833997d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import collections
import unicodedata
import six

def convert_to_unicode(text):
  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
  if six.PY3:
    if isinstance(text, str):
      return text
    elif isinstance(text, bytes):
      return text.decode("utf-8", "ignore")
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  elif six.PY2:
    if isinstance(text, str):
      return text.decode("utf-8", "ignore")
    elif isinstance(text, unicode):
      return text
    else:
      raise ValueError("Unsupported string type: %s" % (type(text)))
  else:
    raise ValueError("Not running on Python2 or Python 3?")

def load_vocab(vocab_file):
  vocab = collections.OrderedDict()
  index = 0
  with open(vocab_file, "r", encoding='utf-8') as reader:
    while True:
      token = reader.readline()
      if token.split(): token = token.split()[0] # to support SentencePiece vocab file
      token = convert_to_unicode(token)
      if not token:
        break
      token = token.strip()
      vocab[token] = index
      index += 1
  return vocab

#####

from bert.bpe_helper import BPE
import sentencepiece as spm

def convert_by_vocab(vocab, items):
  output = []
  for item in items:
    output.append(vocab[item])
  return output

class ThaiTokenizer(object):
  """Tokenizes Thai texts."""

  def __init__(self, vocab_file, spm_file):
    self.vocab = load_vocab(vocab_file)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}

    self.bpe = BPE(vocab_file)
    self.s = spm.SentencePieceProcessor()
    self.s.Load(spm_file)

  def tokenize(self, text):
    bpe_tokens = self.bpe.encode(text).split(' ')
    spm_tokens = self.s.EncodeAsPieces(text)

    tokens = bpe_tokens if len(bpe_tokens) < len(spm_tokens) else spm_tokens

    split_tokens = []

    for token in tokens:
      new_token = token

      if token.startswith('_') and not token in self.vocab:
        split_tokens.append('_')
        new_token = token[1:]

      if not new_token in self.vocab:
        split_tokens.append('<unk>')
      else:
        split_tokens.append(new_token)

    return split_tokens

  def convert_tokens_to_ids(self, tokens):
    return convert_by_vocab(self.vocab, tokens)

  def convert_ids_to_tokens(self, ids):
    return convert_by_vocab(self.inv_vocab, ids)