import gradio as gr
from transformers import AutoTokenizer
import collections

# Map of display names to HF model IDs
MODEL_MAP = {
    "Nomic Embed v1.5": "nomic-ai/nomic-embed-text-v1.5",
    "MixedBread XSmall v1": "mixedbread-ai/mxbai-embed-xsmall-v1",
    "Google EmbeddingGemma 300m": "google/embeddinggemma-300m",
    "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    "BGE-M3": "BAAI/bge-m3",
    "BERT Base (Baseline WordPiece)": "bert-base-uncased",
    "RoBERTa Base (Byte-Level BPE)": "roberta-base",
    "E5 Mistral 7B (Llama Tokenizer)": "intfloat/e5-mistral-7b-instruct",
}

# Global cache for tokenizers
tokenizer_cache = {}


def get_tokenizer(model_name):
    """Lazy load and cache tokenizers."""
    model_id = MODEL_MAP[model_name]
    if model_id not in tokenizer_cache:
        print(f"Loading tokenizer: {model_id}...")
        try:
            tokenizer_cache[model_id] = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        except Exception as e:
            return None, f"Error loading tokenizer: {str(e)}"
    return tokenizer_cache[model_id], None
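
# Illustrative usage (not executed here): the first call downloads and caches the
# tokenizer; subsequent calls for the same model reuse the cached instance.
#   tok, err = get_tokenizer("BERT Base (Baseline WordPiece)")
#   assert err is None and tok is not None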


def format_byte_token(text):
    """
    Attempts to identify if a token is a RoBERTa/GPT-2 style byte mapping
    (e.g., 'â' representing 0xE2) and converts it to <0xXX> for clarity.
    """
    # Heuristic: byte-level BPE maps raw bytes onto single printable characters,
    # so a lone non-ASCII character is likely a byte artifact rather than real text.
    if len(text) == 1 and ord(text) > 127:
        return f"<{hex(ord(text))}>"
    return text
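
# Illustrative usage (not executed here): format_byte_token("â") returns "<0xe2>",
# while plain ASCII text such as "cat" is returned unchanged.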


def analyze_tokenization(text, model_name=next(iter(MODEL_MAP))):
    tokenizer, error = get_tokenizer(model_name)
    if error:
        return [], error

    try:
        # Tokenize with offsets
        encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    except Exception as e:
        return [], f"Tokenization failed: {str(e)}"

    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"])
    ids = encoding["input_ids"]
    offsets = encoding["offset_mapping"]

    # Map character indices to the list of tokens that cover them
    char_coverage = collections.defaultdict(list)
    for i, (start, end) in enumerate(offsets):
        for char_idx in range(start, end):
            char_coverage[char_idx].append(i)
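
    # At this point, a single character (e.g. 'λ') that was split into several
    # byte-level tokens has multiple token indices recorded under its position.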
    output_spans = []
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        label = None
        display_text = token

        # --- Visual cleanup for RoBERTa/GPT-2 byte-level markers ---
        # Replace 'Ġ', which represents a leading space
        display_text = display_text.replace('Ġ', ' ')
        # Replace 'Ċ', which represents a newline
        display_text = display_text.replace('Ċ', '\n')
        # Replace 'ĉ', which represents a tab
        display_text = display_text.replace('ĉ', '\t')

        # Check 1: Explicit UNK (the "hard failure")
        if token_id == tokenizer.unk_token_id:
            label = "UNK (Data Loss)"

        # Check 2: Byte fallback / fragmentation
        start, end = offsets[i]
        is_fragment = False
        # If a single character in the input generated multiple tokens, it's a fragmentation/byte-split
        if (end - start) == 1:
            tokens_covering_this_char = char_coverage[start]
            if len(tokens_covering_this_char) > 1:
                is_fragment = True
        # Check for Llama/Mistral style byte tokens (<0xE2>)
        if token.startswith("<0x") and token.endswith(">"):
            is_fragment = True

        if is_fragment and label is None:
            label = "Byte/Fragment"
            # If it's a RoBERTa-style mapped byte (like 'â'), wrap it in brackets so it
            # reads less like random noise. We don't have the reverse byte map easily
            # accessible, but we can mark it clearly.
            if len(display_text) == 1 and ord(display_text) > 127:
                display_text = f"<{display_text}>"

        # Check 3: Subwords (blue)
        if label is None:
            # WordPiece continuation marker '##'
            if token.startswith("##"):
                label = "Subword"
            # SentencePiece/RoBERTa often treat tokens without a leading-space marker as subwords
            elif i > 0 and not token.startswith("Ġ") and not token.startswith("▁") and not token.startswith(" "):
                # Heuristic: the previous token ended at the same spot this one starts
                prev_end = offsets[i - 1][1]
                if start == prev_end:
                    label = "Subword"

        output_spans.append((display_text, label))

    return output_spans, f"Total Tokens: {len(tokens)}"
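
# Illustrative return shape (not a captured output): a list of (display_text, label)
# pairs such as ("token", None), ("##piece", "Subword"), ("<0xE2>", "Byte/Fragment"),
# plus a stats string like "Total Tokens: 42". The exact split depends on the model.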

# Scientific text example
scientific_text = "Acidity (pKa)2.97 (25 °C)[5] 13.82 (20 °C)[3] UV-vis (λmax)210 nm (χ)−72.23·10−6 cm3/mol"
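# The example mixes degree signs, Greek letters, superscripts, and citation brackets,
# which is the kind of input that tends to expose UNK and byte-fallback behaviour.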

with gr.Blocks(title="Embedding Model Tokenizer Detective") as demo:
    gr.Markdown(
        """
# 🕵️‍♀️ Embedding Model Tokenizer Detective
Different embedding models handle unknown characters (OOV) differently.

* **Red (UNK):** The model **deleted** information. It saw a symbol it didn't know and replaced it with a generic placeholder.
* **Orange (Byte/Fragment):** The model **struggled** and split a single character (like a Greek letter or math symbol) into multiple raw bytes.
* **Blue:** Standard subword splitting.
"""
    )

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text",
                lines=5,
                placeholder="Enter scientific or multilingual text here...",
                value=scientific_text
            )
            model_selector = gr.Dropdown(
                label="Select Embedding Model / Tokenizer",
                choices=list(MODEL_MAP.keys()),
                value="Nomic Embed v1.5"
            )
            analyze_btn = gr.Button("Diagnose Tokenization", variant="primary")
        with gr.Column():
            output_display = gr.HighlightedText(
                label="Tokenized Analysis",
                combine_adjacent=False,
                show_legend=True,
                color_map={"UNK (Data Loss)": "red", "Byte/Fragment": "orange", "Subword": "blue"}
            )
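            # Note: the color_map keys must match the label strings produced by
            # analyze_tokenization ("UNK (Data Loss)", "Byte/Fragment", "Subword").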
            stats_output = gr.Label(label="Statistics")

    analyze_btn.click(
        fn=analyze_tokenization,
        inputs=[input_text, model_selector],
        outputs=[output_display, stats_output]
    )

    gr.Examples(
        examples=[
            ["The quick brown fox jumps over the lazy dog."],
            [scientific_text],
            ["susceptibility (Ⅹ) = −72.23·10−6 cm3/mol"],
            ["汉字漢字カタカナひらがな"],
            ["⅕ of a pizza is 2 slices."],
            ["😊 😂 🥺"],
        ],
        inputs=[input_text],
        outputs=[output_display, stats_output],
        fn=analyze_tokenization,
        run_on_click=True
    )

if __name__ == "__main__":
    demo.launch()