import tiktoken
from collections import Counter
from transformers import AutoTokenizer

# ... existing code ...
def analyze_tokens_detailed(text, model):
    """
    For a given text and model, returns a list of dicts with details for each token:
      - token string
      - token id
      - decoded value
      - token length
      - NSL value (token length / max token length in the sequence)
      - subword fertility (number of tokens in the token's originating word)
    Also returns the decoded output for the entire sequence.
    """
    # Tokenize: tiktoken for OpenAI GPT models, a Hugging Face tokenizer otherwise
    if 'gpt' in model:
        tokenizer = tiktoken.encoding_for_model(model)
        token_ids = tokenizer.encode(text)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model)
        token_ids = tokenizer.encode(text, add_special_tokens=False)
    # Per-token strings and the decoded output for the entire sequence
    # (both tokenizer APIs expose the same decode interface)
    tokens = [tokenizer.decode([tid]) for tid in token_ids]
    decoded_output = tokenizer.decode(token_ids)
    # Token lengths and NSL: each token's length normalized by the longest token
    token_lengths = [len(t) for t in tokens]
    max_token_length = max(token_lengths) if token_lengths else 1
    nsl_values = [l / max_token_length for l in token_lengths]
    # Subword fertility: number of tokens per word, e.g. if "playing" splits
    # into ["play", "ing"], both tokens get fertility 2.
    # Map each token to its originating word (approximate: words come from a
    # whitespace split, and decoded tokens are assumed to concatenate back to
    # roughly the original text).
    words = text.split()
    if words:
        # Character span of each word in the original text
        word_spans = []
        search_start = 0
        for word in words:
            start = text.find(word, search_start)
            word_spans.append((start, start + len(word)))
            search_start = start + len(word)
        # Greedy left-to-right scan: assign each token to the word containing
        # its first non-whitespace character; -1 marks unmapped tokens
        token_word_map = []
        char_pos = 0
        word_idx = 0
        for token in tokens:
            anchor = char_pos + (len(token) - len(token.lstrip()))
            while word_idx < len(words) and anchor >= word_spans[word_idx][1]:
                word_idx += 1
            token_word_map.append(word_idx if word_idx < len(words) else -1)
            char_pos += len(token)
        # Count tokens per word, then give each token its word's fertility
        fertility_counter = Counter(token_word_map)
        token_fertility = [fertility_counter[idx] if idx >= 0 else 0 for idx in token_word_map]
    else:
        token_fertility = [1 for _ in tokens]
    # Build the per-token table (the token string and its decoded value
    # coincide here, since each token was recovered by decoding its id)
    table = []
    for token, tid, length, nsl, fert in zip(tokens, token_ids, token_lengths, nsl_values, token_fertility):
        table.append({
            'token': token,
            'token_id': tid,
            'decoded': token,
            'token_length': length,
            'nsl': nsl,
            'subword_fertility': fert,
        })
    return {
        'model': model,
        'decoded_output': decoded_output,
        'tokens': table,
    }
# ... existing code ...
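
# Minimal usage sketch (assumptions: "bert-base-uncased" is only an
# illustrative checkpoint, not part of the code above; any model id without
# "gpt" in its name routes through AutoTokenizer and needs the checkpoint
# available locally or via network):
if __name__ == "__main__":
    result = analyze_tokens_detailed("Tokenization is fun!", "bert-base-uncased")
    print(result['decoded_output'])
    for row in result['tokens']:
        print(row['token'], row['token_id'], row['token_length'],
              round(row['nsl'], 2), row['subword_fertility'])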