"""Streamlit app that tokenizes text and shows every token as a colored span."""

import colorsys
import html
import zlib

import streamlit as st
from transformers import AutoTokenizer

st.set_page_config(layout="wide", page_title="Text Tokenizer")

# Character produced by decoding when the byte sequence is not (yet) valid text.
_REPLACEMENT_CHAR = "\ufffd"


def get_random_color(token_id):
    """Return a CSS ``hsla(...)`` color derived deterministically from *token_id*.

    The built-in ``hash()`` of a string is salted per interpreter process, so
    it would give the same token a different color after every restart.
    CRC32 is stable across runs, which keeps the token -> color mapping fixed.
    """
    digest = zlib.crc32(str(token_id).encode("utf-8"))
    hue = (digest % 1000) / 1000.0
    return f"hsla({int(hue * 360)}, 70%, 30%, 70%)"


def load_tokenizer(model_name="Qwen/Qwen2.5-Coder-7B-Instruct"):
    """Return the tokenizer for *model_name*, cached in the Streamlit session.

    The cached tokenizer is reused only when it was loaded for the same
    *model_name*; asking for a different model reloads instead of silently
    returning the previous one. Exceptions raised by
    ``AutoTokenizer.from_pretrained`` propagate, and nothing is cached then.
    """
    state = st.session_state
    is_cached = (
        "tokenizer" in state
        and state.get("tokenizer_model_name") == model_name
    )
    if not is_cached:
        # NOTE(review): trust_remote_code=True lets the model repository run
        # its own Python code while loading. Kept as in the original; confirm
        # it is really required before pointing this at arbitrary models.
        state.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
        state.tokenizer_model_name = model_name
    return state.tokenizer


def _iter_token_texts(tokenizer, tokens):
    """Yield ``(token_id, text)`` pairs, *text* being what the token adds.

    Each prefix of *tokens* is decoded and compared with the text emitted so
    far, so the pieces concatenate to the decoding of the full sequence.
    When a prefix decodes to text ending in U+FFFD (presumably a character
    split across tokens), the token yields an empty string and its text is
    attributed to the token that completes the character. Without this the
    placeholder was displayed and the real character was dropped.
    """
    seen = []
    emitted = ""
    last_index = len(tokens) - 1
    for index, token in enumerate(tokens):
        seen.append(token)
        decoded = tokenizer.decode(seen)
        # Never defer the final token, so trailing text is always flushed.
        if decoded.endswith(_REPLACEMENT_CHAR) and index != last_index:
            yield token, ""
            continue
        yield token, decoded[len(emitted):]
        emitted = decoded


def _to_display_html(text):
    """Escape *text* for HTML and make whitespace visible.

    Escaping happens first so the entities and tags inserted afterwards are
    not escaped themselves.
    """
    return (
        html.escape(text)
        .replace("\n", "↵<br>")
        .replace(" ", "&nbsp;")
        .replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
    )


def _render_tokens_html(tokenizer, tokens):
    """Build the HTML fragment with one colored ``<span>`` per visible token."""
    spans = [
        f'<span style="background-color: {get_random_color(token)};">'
        f"{_to_display_html(text)}</span>"
        for token, text in _iter_token_texts(tokenizer, tokens)
        if text  # tokens whose text was deferred produce no span of their own
    ]
    return "".join(spans)


st.title("Text Tokenizer")

selected_model = "Qwen/Qwen2.5-Coder-7B-Instruct"

# Load tokenizer based on selection; without one the page cannot do anything.
try:
    tokenizer = load_tokenizer(selected_model)
except Exception as e:  # top-level UI boundary: report the failure and stop
    st.error(f"Failed to load tokenizer: {e}")
    st.stop()
else:
    st.success(f"Loaded tokenizer: {selected_model}")

# Input text area
input_text = st.text_area("Enter text to tokenize", height=200)

# Tokenize button
if st.button("Tokenize") and input_text:
    tokens = tokenizer.encode(input_text)
    st.write(f"Total tokens: {len(tokens)}")

    # Colored text visualization
    result = _render_tokens_html(tokenizer, tokens)
    st.html(
        f'<div style="font-family: monospace; line-height: 1.6;">{result}</div>'
    )

    # Show raw tokens (optional)
    with st.expander("View raw tokens"):
        for i, token_id in enumerate(tokens):
            token_str = tokenizer.decode([token_id])
            st.write(f"{i}: Token ID {token_id} → '{token_str}'")