Spaces:
Running
on
Zero
Running
on
Zero
from abc import ABC, abstractmethod, abstractproperty
from typing import Dict, List, Tuple, Union
# A single token is either text or raw bytes.
T = Union[str, bytes]


class BaseTokenizer(ABC):
    """Base class for tokenizers that map between text, tokens and ids.

    ``tokenize`` and ``detokenize`` are implemented here by chaining the
    conversion hooks below.  Every hook raises ``NotImplementedError``
    in this class; subclasses override the ones they need.

    NOTE: the hooks are not decorated with ``@abstractmethod`` in this
    class, so instantiation is not blocked and a missing override only
    fails when the hook is actually called.
    """

    def tokenize(self, line: str) -> Tuple[List[T], List[int]]:
        """Convert ``line`` to tokens and ids.

        Args:
            line: The text to tokenize.

        Returns:
            ``(tokens, ids)`` where ``tokens`` is the result of
            ``text2tokens(line)`` and ``ids`` is the result of
            ``tokens2ids(tokens)``.
        """
        tokens = self.text2tokens(line)
        ids = self.tokens2ids(tokens)
        return tokens, ids

    def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]:
        """Convert ``ids`` back to text and tokens.

        Args:
            ids: The token ids to convert.

        Returns:
            ``(text, tokens)`` where ``tokens`` is the result of
            ``ids2tokens(ids)`` and ``text`` is the result of
            ``tokens2text(tokens)``.  Note the order is the reverse of
            what ``tokenize`` returns: text first, tokens second.
        """
        tokens = self.ids2tokens(ids)
        text = self.tokens2text(tokens)
        return text, tokens

    def text2tokens(self, line: str) -> List[T]:
        """Split ``line`` into tokens.  Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")

    def tokens2text(self, tokens: List[T]) -> str:
        """Join ``tokens`` back into text.  Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")

    def tokens2ids(self, tokens: List[T]) -> List[int]:
        """Map ``tokens`` to integer ids.  Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")

    def ids2tokens(self, ids: List[int]) -> List[T]:
        """Map integer ``ids`` to tokens.  Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")

    def vocab_size(self) -> int:
        """Return the vocabulary size.  Must be overridden.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")

    def symbol_table(self) -> Dict[T, int]:
        """Return the token-to-int mapping.  Must be overridden.

        Presumably the ints are the same ids used by ``tokens2ids``;
        verify against the concrete subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("abstract method")