raghunc0nano-gpt-shakespeare-demo / tokenizer_utils.py
raghunc0's picture
Upload folder using huggingface_hub
a2dad08
raw
history blame
687 Bytes
import torch
class IntCharTokenizer:
def __init__(self, text):
self.chars, self.vocab_size = self._get_uniq_chars(text)
self.int_to_char = {i: c for i, c in enumerate(self.chars)}
self.char_to_int = {c: i for i, c in enumerate(self.chars)}
def _get_uniq_chars(self, text):
chars = sorted(list(set(text)))
return chars, len(chars)
def encode(self, text):
#enc = lambda s: [self.char_to_int[c] for c in s]
return [self.char_to_int[c] for c in text]
def decode(self, tokens):
#dec = lambda s: ''.join(self.int_to_char[i] for i in s)
return ''.join(self.int_to_char[i] for i in tokens)