import torch


class CharacterLevelTokenizer:
    """Maps each unique character in the data to an integer id and back."""

    def __init__(self, data):
        self.data = data
        # Vocabulary is the sorted set of unique characters in the data.
        self.vocab = sorted(set(self.data))
        self.VOCAB_SIZE = len(self.vocab)
        # Index-to-character and character-to-index lookup tables.
        self.i_s = {i: s for i, s in enumerate(self.vocab)}
        self.s_i = {s: i for i, s in self.i_s.items()}

    def encode(self, s):
        # Convert a string to a 1-D tensor of character ids.
        return torch.tensor([self.s_i[c] for c in s], dtype=torch.long)

    def decode(self, ids):
        # Convert a tensor of character ids back to a string.
        return ''.join(self.i_s[i.item()] for i in ids)
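

# Usage sketch (not part of the original file; the sample string is an assumption):
# the tokenizer builds its vocabulary from the text it is given, so encode() only
# accepts characters that appeared in that text, and decode(encode(s)) returns s.
if __name__ == "__main__":
    sample = "hello world"
    tokenizer = CharacterLevelTokenizer(sample)
    ids = tokenizer.encode("hello")   # 1-D long tensor of character ids
    text = tokenizer.decode(ids)      # "hello"
    print(tokenizer.VOCAB_SIZE, ids, text)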