GraphGen / graphgen /models /llm /tokenizer.py
chenzihong-gavin
init
acd7cf4
from dataclasses import dataclass
from typing import List
import tiktoken
try:
from transformers import AutoTokenizer
TRANSFORMERS_AVAILABLE = True
except ImportError:
AutoTokenizer = None
TRANSFORMERS_AVAILABLE = False
def get_tokenizer(tokenizer_name: str = "cl100k_base"):
    """
    Resolve a tokenizer object from its name.

    Names reported by ``tiktoken.list_encoding_names()`` are loaded through
    tiktoken. Every other name is passed to Hugging Face
    ``AutoTokenizer.from_pretrained``.

    :param tokenizer_name: tiktoken encoding name or Hugging Face model name
    :return: tokenizer instance
    :raises ValueError: if the name is not a tiktoken encoding and either
        Transformers is not installed or the Hugging Face load fails
    """
    # tiktoken encodings take precedence over Hugging Face lookups.
    if tokenizer_name in tiktoken.list_encoding_names():
        return tiktoken.get_encoding(tokenizer_name)

    # Without Transformers there is no fallback loader left to try.
    if not TRANSFORMERS_AVAILABLE:
        raise ValueError("Hugging Face Transformers is not available, please install it first.")

    try:
        hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    except Exception as err:
        # Any loader failure is surfaced as ValueError, with the cause chained.
        raise ValueError(f"Failed to load tokenizer from Hugging Face: {err}") from err
    return hf_tokenizer
@dataclass
class Tokenizer:
    """
    Thin wrapper around the tokenizer object returned by ``get_tokenizer``.

    Offers encode/decode helpers plus splitting of text into overlapping
    token windows.
    """

    # Name handed to get_tokenizer() when the instance is initialised.
    model_name: str = "cl100k_base"

    def __post_init__(self):
        # Assigned here instead of being declared as a dataclass field, so it
        # is not part of the generated __init__/__repr__/__eq__.
        self.tokenizer = get_tokenizer(self.model_name)

    def encode_string(self, text: str) -> List[int]:
        """
        Encode text to tokens.

        :param text: text to encode
        :return: tokens produced by the wrapped tokenizer's ``encode``
        """
        return self.tokenizer.encode(text)

    def decode_tokens(self, tokens: List[int]) -> str:
        """
        Decode tokens to text.

        :param tokens: tokens to decode
        :return: text produced by the wrapped tokenizer's ``decode``
        """
        return self.tokenizer.decode(tokens)

    def chunk_by_token_size(
        self,
        content: str,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """
        Split text into windows of at most ``max_token_size`` tokens.

        Consecutive windows start ``max_token_size - overlap_token_size``
        tokens apart, so neighbouring chunks share ``overlap_token_size``
        tokens.

        :param content: text to split
        :param overlap_token_size: number of tokens shared by adjacent chunks
        :param max_token_size: maximum number of tokens per chunk
        :return: list of dicts with keys ``tokens`` (token count of the
            chunk), ``content`` (decoded, stripped chunk text) and
            ``chunk_order_index`` (0-based position of the chunk)
        :raises ValueError: if ``overlap_token_size`` is negative or not
            smaller than ``max_token_size``
        """
        # The stride must be in 1..max_token_size. A zero stride makes range()
        # fail with an unrelated message, a negative one silently yields no
        # chunks, and a negative overlap would skip tokens between windows.
        if not 0 <= overlap_token_size < max_token_size:
            raise ValueError(
                "overlap_token_size must satisfy "
                "0 <= overlap_token_size < max_token_size, got "
                f"overlap_token_size={overlap_token_size}, "
                f"max_token_size={max_token_size}"
            )

        tokens = self.encode_string(content)
        stride = max_token_size - overlap_token_size

        results = []
        for index, start in enumerate(range(0, len(tokens), stride)):
            window = tokens[start : start + max_token_size]
            results.append(
                {
                    # The final window may be shorter than max_token_size.
                    "tokens": len(window),
                    "content": self.decode_tokens(window).strip(),
                    "chunk_order_index": index,
                }
            )
        return results