import streamlit as st
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from typing import Dict, List, Tuple
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.graph_objects as go
# Set Streamlit page configuration
st.set_page_config(
    page_title="Token & Embedding Visualizer",
    layout="wide"
)

# Define colors for different token types
COLORS = {
    'Special': '#FFB6C1',
    'Subword': '#98FB98',
    'Word': '#87CEFA',
    'Punctuation': '#DDA0DD'
}
@st.cache_resource  # cache across Streamlit reruns so models load only once
def load_models_and_tokenizers() -> Tuple[Dict, Dict]:
    """Load tokenizers and models with error handling"""
    model_names = {
        "BERT": "bert-base-uncased",
        "RoBERTa": "roberta-base",
        "DistilBERT": "distilbert-base-uncased",
        "MPNet": "microsoft/mpnet-base",
        "DeBERTa": "microsoft/deberta-base",
    }
    tokenizers = {}
    models = {}
    for name, model_name in model_names.items():
        try:
            tokenizers[name] = AutoTokenizer.from_pretrained(model_name)
            models[name] = AutoModel.from_pretrained(model_name)
            st.success(f"✓ Loaded {name}")
        except Exception as e:
            st.warning(f"× Failed to load {name}: {str(e)}")
    return tokenizers, models
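
# Note on loading cost: from_pretrained() downloads weights into the local
# Hugging Face cache on first use and loads from disk afterwards, while
# st.cache_resource (added above) keeps the constructed objects alive across
# Streamlit reruns instead of reloading all five models on every interaction.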
def classify_token(token: str) -> str:
    """Classify token type based on its characteristics"""
    # Check special tokens first so markers like '<s>' are never misclassified
    if token in ['[CLS]', '[SEP]', '<s>', '</s>', '<pad>', '[PAD]', '[MASK]', '<mask>']:
        return 'Special'
    # '##' marks WordPiece continuations (BERT); 'Ġ' and '▁' are the
    # word-boundary markers used by byte-level BPE (RoBERTa) and SentencePiece
    elif token.startswith(('##', 'Ġ', '▁', '_')):
        return 'Subword'
    elif token in [',', '.', '!', '?', ';', ':', '"', "'", '(', ')', '[', ']', '{', '}']:
        return 'Punctuation'
    else:
        return 'Word'
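
# A few illustrative classifications (BERT / RoBERTa vocabularies assumed):
#   classify_token('[CLS]')   -> 'Special'      # BERT sequence-start token
#   classify_token('##ing')   -> 'Subword'      # WordPiece continuation piece
#   classify_token('Ġworld')  -> 'Subword'      # RoBERTa byte-level BPE marker
#   classify_token('.')       -> 'Punctuation'
#   classify_token('hello')   -> 'Word'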
def get_embeddings(text: str, model, tokenizer) -> Tuple[torch.Tensor, List[str]]:
    """Get embeddings and tokens from the model and tokenizer"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # inference only; also allows .numpy() on the outputs
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[0]  # first (and only) batch item
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    return embeddings, tokens
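
# Shape note: for a base-size model, `embeddings` is (sequence_length, 768),
# one contextual vector per token, including specials such as [CLS] and [SEP].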
def visualize_embeddings(embeddings: torch.Tensor, tokens: List[str], method: str = 'PCA') -> go.Figure:
    """Visualize embeddings using PCA or t-SNE"""
    embed_array = embeddings.numpy()
    if method == 'PCA':
        reducer = PCA(n_components=3)
        reduced_embeddings = reducer.fit_transform(embed_array)
        variance_explained = reducer.explained_variance_ratio_
        method_info = f"Total variance explained: {sum(variance_explained):.2%}"
    else:  # t-SNE
        reducer = TSNE(n_components=3, random_state=42, perplexity=min(30, len(tokens) - 1))
        reduced_embeddings = reducer.fit_transform(embed_array)
        method_info = f"t-SNE embedding (perplexity: {reducer.perplexity})"
    df = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'z': reduced_embeddings[:, 2],
        'token': tokens,
        'type': [classify_token(t) for t in tokens]
    })
    fig = go.Figure()
    for token_type in df['type'].unique():
        mask = df['type'] == token_type
        fig.add_trace(go.Scatter3d(
            x=df[mask]['x'],
            y=df[mask]['y'],
            z=df[mask]['z'],
            mode='markers+text',
            name=token_type,
            text=df[mask]['token'],
            hovertemplate="Token: %{text}<br>Type: " + token_type + "<extra></extra>",
            marker=dict(
                size=8,
                color=COLORS[token_type],
                opacity=0.8
            )
        ))
    fig.update_layout(
        title=f"{method} Visualization of Token Embeddings<br><sup>{method_info}</sup>",
        scene=dict(
            xaxis_title=f"{method}_1",
            yaxis_title=f"{method}_2",
            zaxis_title=f"{method}_3"
        ),
        width=800,
        height=800
    )
    return fig
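
# Why two reducers: PCA is a linear projection that preserves global variance
# structure, while t-SNE is nonlinear and favors local neighborhoods, so nearby
# points are meaningful but distances between clusters are not. t-SNE also
# requires perplexity < n_samples, hence the min(30, len(tokens) - 1) guard.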
def compute_token_similarities(embeddings: torch.Tensor, tokens: List[str]) -> pd.DataFrame:
    """Compute cosine similarities between token embeddings"""
    normalized_embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
    similarities = torch.mm(normalized_embeddings, normalized_embeddings.t())
    sim_df = pd.DataFrame(similarities.numpy(), columns=tokens, index=tokens)
    return sim_df
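
# cos(a, b) = (a · b) / (||a|| ||b||); normalizing each row to unit length
# first means the single matrix product above yields all pairwise cosines.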
# Streamlit app title
st.title("🤗 Token & Embedding Visualizer")

# Load models and tokenizers
tokenizers, models = load_models_and_tokenizers()

# Create tabs for different visualizations
token_tab, embedding_tab, similarity_tab = st.tabs([
    "Token Visualization",
    "Embedding Visualization",
    "Token Similarities"
])

# Default text for analysis
default_text = "Hello world! Let's analyze how neural networks process language. The transformer architecture revolutionized NLP."
text_input = st.text_area("Enter text to analyze:", value=default_text, height=100)
with token_tab:
    st.markdown("""
    Token colors represent:
    - 🟦 Blue: Complete words
    - 🟩 Green: Subwords
    - 🟨 Pink: Special tokens
    - 🟪 Purple: Punctuation
    """)
    selected_models = st.multiselect(
        "Select models to compare tokens",
        options=list(tokenizers.keys()),
        default=["BERT", "RoBERTa"],
        max_selections=4
    )
    if text_input and selected_models:
        cols = st.columns(len(selected_models))
        for idx, model_name in enumerate(selected_models):
            with cols[idx]:
                st.subheader(model_name)
                tokenizer = tokenizers[model_name]
                tokens = tokenizer.tokenize(text_input)
                token_ids = tokenizer.encode(text_input)
                # encode() adds special tokens; recover them so counts line up
                if len(tokens) != len(token_ids):
                    tokens = tokenizer.convert_ids_to_tokens(token_ids)
                st.metric("Tokens", len(tokens))
                html_tokens = []
                for token in tokens:
                    color = COLORS[classify_token(token)]
                    # Escape angle brackets so tokens like '<s>' render as text
                    token_text = token.replace('<', '&lt;').replace('>', '&gt;')
                    html_tokens.append(
                        f'<span style="background-color: {color}; padding: 2px 4px; '
                        f'margin: 2px; border-radius: 3px; font-family: monospace;">'
                        f'{token_text}</span>'
                    )
                st.markdown(
                    '<div style="background-color: white; padding: 10px; '
                    'border-radius: 5px; border: 1px solid #ddd;">'
                    f'{"".join(html_tokens)}</div>',
                    unsafe_allow_html=True
                )
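                # Each token renders as a colored inline chip, e.g. (styling
                # trimmed for brevity):
                #   <span style="background-color: #87CEFA; ...">hello</span>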
with embedding_tab:
    st.markdown("""
    This tab shows how tokens are embedded in the model's vector space.
    - Compare different dimensionality reduction techniques
    - Observe clustering of similar tokens
    - Explore the relationship between different token types
    """)
    col1, col2 = st.columns([2, 1])
    with col1:
        selected_model = st.selectbox(
            "Select model for embedding visualization",
            options=list(models.keys())
        )
    with col2:
        viz_method = st.radio(
            "Select visualization method",
            options=['PCA', 't-SNE'],
            horizontal=True
        )
    if text_input and selected_model:
        with st.spinner(f"Generating embeddings with {selected_model}..."):
            embeddings, tokens = get_embeddings(
                text_input,
                models[selected_model],
                tokenizers[selected_model]
            )
            fig = visualize_embeddings(embeddings, tokens, viz_method)
            st.plotly_chart(fig, use_container_width=True)
            with st.expander("Embedding Statistics"):
                embed_stats = pd.DataFrame({
                    'Token': tokens,
                    'Type': [classify_token(t) for t in tokens],
                    'Mean': embeddings.mean(dim=1).numpy(),
                    'Std': embeddings.std(dim=1).numpy(),
                    'Norm': torch.norm(embeddings, dim=1).numpy()
                })
                st.dataframe(embed_stats, use_container_width=True)
with similarity_tab:
    st.markdown("""
    Explore token similarities based on their embedding representations.
    - Darker colors indicate higher similarity
    - Hover over cells to see exact similarity scores
    """)
    # Reuses `embeddings` and `tokens` computed above in the embedding tab
    if text_input and selected_model:
        with st.spinner("Computing token similarities..."):
            # Ensure unique token names by appending their index
            unique_tokens = [f"{token}_{i}" for i, token in enumerate(tokens)]
            sim_df = compute_token_similarities(embeddings, tokens)
            sim_df.columns = unique_tokens  # Update column names
            sim_df.index = unique_tokens  # Update row names
            fig = px.imshow(
                sim_df,
                labels=dict(color="Cosine Similarity"),
                color_continuous_scale="RdYlBu",
                aspect="auto"
            )
            fig.update_layout(
                title="Token Similarity Matrix",
                width=800,
                height=800
            )
            st.plotly_chart(fig, use_container_width=True)
            st.subheader("Most Similar Token Pairs")
            sim_matrix = sim_df.values
            np.fill_diagonal(sim_matrix, 0)  # Exclude self-similarities
            top_k = min(10, len(tokens))
            pairs = []
            for i in range(len(tokens)):
                for j in range(i + 1, len(tokens)):
                    pairs.append((tokens[i], tokens[j], sim_matrix[i, j]))
            top_pairs = sorted(pairs, key=lambda x: x[2], reverse=True)[:top_k]
            for token1, token2, sim in top_pairs:
                st.write(f"'{token1}' ↔ '{token2}': {sim:.3f}")
st.markdown("---")
st.markdown("""
💡 **Tips:**
- Try comparing how different models tokenize and embed the same text
- Use PCA for global structure and t-SNE for local relationships
- Check the similarity matrix for interesting token relationships
- Experiment with different text types (technical, casual, mixed)
""")