| import streamlit as st |
| import torch |
| import numpy as np |
| import pandas as pd |
| from transformers import AutoTokenizer, AutoModel, pipeline |
| from typing import Optional, Tuple, Dict, Any, List |
| import json |
|
|
| try: |
| from sklearn.decomposition import PCA |
| except ImportError: |
| PCA = None |
| try: |
| import plotly.express as px |
| except ImportError: |
| px = None |
|
|
# Global Streamlit page setup — must be the first st.* call in the script.
st.set_page_config(page_title="BERT – Tokenizer & Embeddings Demo", layout="wide")


# Main page title listing every section of the demo.
st.title("BERT – Architecture, Tokenizer, ID↔Token, Fill-Mask, Embeddings, PCA Map")
|
|
| |
| |
| |
|
|
| def _device() -> torch.device: |
| return torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
|
|
|
def count_params(model: torch.nn.Module) -> Tuple[int, int]:
    """Count the parameters of *model*.

    Returns:
        (total, trainable): total number of parameter elements, and the
        number of elements belonging to parameters with requires_grad=True.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable
|
|
|
|
def safe_json(obj: Any) -> str:
    """Render *obj* as pretty-printed JSON, falling back to str() on failure.

    Non-serializable values are stringified via ``default=str`` so the dump
    rarely fails; the except branch is a last-resort safety net.
    """
    try:
        rendered = json.dumps(obj, ensure_ascii=False, indent=2, default=str)
    except Exception:
        rendered = str(obj)
    return rendered
| |
| |
| |
| |
|
|
# ---- Sidebar controls (module-level: the section blocks below read these) ----
st.sidebar.header("⚙️ Settings")
# Hugging Face model id, used for the tokenizer, the encoder and the fill-mask pipeline.
model_name = st.sidebar.text_input("Hugging Face model name", value="google-bert/bert-base-uncased")
# Forwarded to AutoModel.from_pretrained(output_hidden_states=...).
use_hidden_states = st.sidebar.checkbox("output_hidden_states", value=False)
# Page size for the vocabulary viewer section.
max_vocab_rows = st.sidebar.slider("Rows per page (vocab viewer)", 50, 2000, 500, step=50)


# Resolve the compute device once per rerun and show it in the sidebar.
device = _device()
st.sidebar.write("Device:", str(device))
|
|
|
|
@st.cache_resource(show_spinner=False)
def load_tokenizer_and_model(model_name: str, output_hidden_states: bool):
    """Load and cache the (tokenizer, model) pair for *model_name*.

    Cached per (model_name, output_hidden_states) by st.cache_resource.
    The encoder is returned in eval mode.
    """
    encoder = AutoModel.from_pretrained(model_name, output_hidden_states=output_hidden_states)
    encoder.eval()
    fast_tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    return fast_tok, encoder
|
|
|
|
@st.cache_resource(show_spinner=False)
def load_fill_mask(model_name: str):
    """Build and cache a Hugging Face fill-mask pipeline for *model_name*."""
    return pipeline(task="fill-mask", model=model_name)
|
|
|
|
# Load the (cached) tokenizer + encoder, then move the model to the target device.
# NOTE(review): .to(device) mutates the object cached by st.cache_resource;
# harmless here because the device cannot change within a session.
with st.spinner("Loading tokenizer + model…"):
    tokenizer, model = load_tokenizer_and_model(model_name, use_hidden_states)
    model = model.to(device)
|
|
| |
| |
| |
| |
# Section labels. A horizontal radio is used instead of st.tabs so the active
# section can be persisted in session_state across reruns.
TAB_LABELS = [
    "Architecture",
    "Tokenizer vocab",
    "ID ↔ Token",
    "Fill-mask",
    "Embeddings output",
    "Embeddings map",
]


# Default to the first section on the very first load.
if "active_tab" not in st.session_state:
    st.session_state["active_tab"] = TAB_LABELS[0]


# `index` restores the previously active section after a rerun.
active_tab = st.radio(
    "Section",
    TAB_LABELS,
    index=TAB_LABELS.index(st.session_state["active_tab"]),
    horizontal=True,
    key="main_tab_selector",
)
# Persist the current selection for the next rerun.
st.session_state["active_tab"] = active_tab
|
|
| |
| |
| |
|
|
# ---------------------------------------------------------------------------
# Section: Architecture — model metadata, config dump and input-embedding shape.
# ---------------------------------------------------------------------------
if active_tab == "Architecture":
    col1, col2 = st.columns([1, 1])


    with col1:
        st.subheader("Infos générales")
        total, trainable = count_params(model)
        # Identity/size summary; dtype and device are read from the first parameter.
        st.write(
            {
                "model_id": model_name,
                "model_class": model.__class__.__name__,
                "total_params": total,
                "trainable_params": trainable,
                "dtype": str(next(model.parameters()).dtype),
                "device": str(next(model.parameters()).device),
            }
        )


        st.subheader("model.eval()")
        st.write("✅ Le modèle est en mode évaluation (`eval()`).")


        st.subheader("config (model.config)")
        # to_dict() is the supported config API; vars() is a fallback for
        # config objects that do not implement it.
        try:
            cfg = model.config.to_dict()
        except Exception:
            cfg = vars(model.config)
        st.code(safe_json(cfg), language="json")


    with col2:
        st.subheader("Architecture (str(model))")
        # Cap the module repr so very large models do not freeze the page.
        model_str = str(model)
        if len(model_str) > 12000:
            model_str = model_str[:12000] + "\n...\n[tronqué]"
        st.code(model_str)


        st.subheader("Couche d’input embeddings")
        try:
            emb_layer = model.get_input_embeddings()
            w = emb_layer.weight
            # The embedding weight matrix is (vocab_size, hidden_dim).
            st.write(
                {
                    "embedding_weight_shape": list(w.shape),
                    "vocab_size (weight)": int(w.shape[0]),
                    "hidden_dim": int(w.shape[1]),
                }
            )
        except Exception as e:
            st.warning(f"Impossible d’accéder à get_input_embeddings(): {e}")
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Section: Tokenizer vocab — paginated viewer over the raw vocabulary.
# ---------------------------------------------------------------------------
if active_tab == "Tokenizer vocab":
    st.subheader("Tokenizer vocabulary")
    st.write({"len(tokenizer)": len(tokenizer), "model": model_name})


    total = len(tokenizer)
    if total == 0:
        st.warning("Tokenizer vocabulary appears empty.")
    else:
        # Paginate: the slider selects the first ID of the displayed page.
        max_start = max(total - max_vocab_rows, 0)
        if max_start == 0:
            # Whole vocab fits on one page; st.slider requires min < max,
            # so skip the widget entirely.
            start = 0
        else:
            start = st.slider("Start ID", 0, max_start, min(1000, max_start), step=max_vocab_rows)
        end = min(start + max_vocab_rows, total)


        ids = list(range(start, end))
        # Use convert_ids_to_tokens (not decode) so subword continuation
        # markers such as "##ing" appear exactly as stored in the vocab;
        # decode() would detokenize and strip them.
        tokens = tokenizer.convert_ids_to_tokens(ids)


        df = pd.DataFrame({"ID": ids, "token": tokens})
        st.dataframe(df, use_container_width=True, height=520)


    with st.expander("Special tokens"):
        st.write("special_tokens_map:", tokenizer.special_tokens_map)
        st.write("all_special_tokens:", getattr(tokenizer, "all_special_tokens", []))
        st.write("all_special_ids:", getattr(tokenizer, "all_special_ids", []))
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Section: ID ↔ Token — round-trip text through ids/tokens and back.
# ---------------------------------------------------------------------------
if active_tab == "ID ↔ Token":
    st.subheader("Convert text → ids/tokens and ids → text")


    text = st.text_area(
        "Text to tokenize",
        value="Sustainable thermal insulation biocomposites from rice husk",
        height=100,
    )


    # Full encoding round-trip: ids, vocab tokens, and decoded text with and
    # without special tokens ([CLS]/[SEP] for BERT).
    enc = tokenizer(text, return_tensors="pt")
    ids = enc["input_ids"][0].tolist()
    toks = tokenizer.convert_ids_to_tokens(ids)
    decoded_with_specials = tokenizer.decode(ids, skip_special_tokens=False)
    decoded_clean = tokenizer.decode(ids, skip_special_tokens=True)


    c1, c2 = st.columns(2)
    with c1:
        st.markdown("**input_ids**")
        st.code(ids)
        st.markdown("**tokens**")
        st.code(toks)
    with c2:
        st.markdown("**decode(ids) (keep specials)**")
        st.code(decoded_with_specials)
        st.markdown("**decode(ids) (skip specials)**")
        st.code(decoded_clean)


    st.divider()
    st.subheader("Single conversions")
    cc1, cc2 = st.columns(2)


    with cc1:
        st.markdown("**ID → token**")
        id_in = st.number_input("ID", min_value=0, max_value=max(len(tokenizer) - 1, 0), value=min(101, max(len(tokenizer) - 1, 0)))
        # convert_ids_to_tokens returns the raw vocab entry (keeps "##"
        # continuation markers); decode() is shown too for comparison since
        # it strips those markers during detokenization.
        st.write(
            {
                "id": int(id_in),
                "token": tokenizer.convert_ids_to_tokens(int(id_in)),
                "decoded": tokenizer.decode([int(id_in)]),
            }
        )


    with cc2:
        st.markdown("**token → ID**")
        tok_in = st.text_input("Token (as in vocab, e.g. 'insulation' or '##ing')", value="insulation")
        if tok_in:
            # NOTE: unknown tokens silently map to the UNK id.
            st.write({"token": tok_in, "id": int(tokenizer.convert_tokens_to_ids(tok_in))})
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Section: Embeddings output — show last_hidden_state as a token × dim table.
# ---------------------------------------------------------------------------
if active_tab == "Embeddings output":
    st.subheader("Model forward → last_hidden_state")


    emb_text = st.text_area(
        "Text for embeddings",
        value="Sustainable thermal insulation biocomposites from rice husk",
        height=90,
    )


    # Single forward pass, no gradients needed for inspection.
    encoded = tokenizer(emb_text, return_tensors="pt").to(device)
    with torch.no_grad():
        forward_out = model(**encoded)


    hidden = getattr(forward_out, "last_hidden_state", None)
    if hidden is None:
        st.warning("This model output has no last_hidden_state (unexpected for AutoModel). Try another model.")
    else:
        token_labels = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0].tolist())
        matrix = hidden[0].detach().cpu().numpy()


        # One row per token (indexed "pos token"), one column per hidden dim.
        emb_df = pd.DataFrame(
            matrix,
            index=[f"{pos} {tok}" for pos, tok in enumerate(token_labels)],
            columns=[f"d{dim}" for dim in range(matrix.shape[1])],
        )
        st.dataframe(emb_df, use_container_width=True, height=520)
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Section: Embeddings map — embed several sentences and project to 2D (PCA).
# Left as-is: the masking/pooling logic below is order-sensitive.
# ---------------------------------------------------------------------------
if active_tab == "Embeddings map":
    st.subheader("Multi-sentence embeddings → PCA map")
    st.write("Enter several sentences (one per line). Embeddings are computed and projected to 2D for visualization.")


    default_sentences = "Sustainable thermal insulation biocomposites.\nRice husk and natural fibers.\nEnergy-efficient building materials.\nRecycled plastic composites.\nWood fiber insulation."
    sentences_text = st.text_area("Sentences (one per line)", value=default_sentences, height=120, key="embed_map_sentences")


    # Token level plots one point per (non-special) token; sentence level
    # plots one mean-pooled point per sentence.
    level = st.radio("Embedding level", ["Token level", "Sentence level"], horizontal=True, key="embed_map_level")


    if st.button("Compute embeddings and plot", type="primary", key="embed_map_btn"):
        # One sentence per non-empty line.
        lines = [s.strip() for s in sentences_text.strip().split("\n") if s.strip()]
        if not lines:
            st.warning("Enter at least one sentence.")
        elif PCA is None:
            st.error("scikit-learn is required for PCA. Install it with `pip install scikit-learn`.")
        elif px is None:
            st.error("plotly is required. Install it with `pip install plotly`.")
        else:
            with st.spinner("Computing embeddings…"):
                all_embeddings: List[np.ndarray] = []
                all_labels: List[str] = []


                for sent in lines:
                    # One forward pass per sentence (no batching).
                    inputs = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
                    with torch.no_grad():
                        out = model(**inputs)


                    last_hidden = out.last_hidden_state[0].detach().cpu().numpy()
                    ids = inputs["input_ids"][0].tolist()
                    tokens = tokenizer.convert_ids_to_tokens(ids)


                    if level == "Token level":
                        # Skip special tokens so the map only shows content
                        # tokens; the bracket heuristic also drops any other
                        # "[...]"-style marker token.
                        special = {"[CLS]", "[SEP]", "[PAD]", "<s>", "</s>", "<pad>", "<unk>"}
                        for i, tok in enumerate(tokens):
                            if tok in special or (tok.startswith("[") and tok.endswith("]")):
                                continue
                            all_embeddings.append(last_hidden[i])
                            # Label: "token|sentence prefix" (sentence truncated to 20 chars).
                            all_labels.append(f"{tok}|{sent[:20]}…" if len(sent) > 20 else f"{tok}|{sent}")
                    else:
                        # Sentence level: mean-pool over attended positions,
                        # excluding position 0 and the last attended position
                        # (presumably [CLS] and [SEP] for BERT — confirm for
                        # other model families).
                        mask = (inputs["attention_mask"][0].cpu().numpy() == 1)
                        mask[0] = False
                        idx = np.where(mask)[0]
                        if len(idx) >= 2:
                            mask[idx[-1]] = False
                        # Fallback to position 0 if nothing remains to pool.
                        pooled = last_hidden[mask].mean(axis=0) if mask.any() else last_hidden[0]
                        all_embeddings.append(pooled)
                        all_labels.append(sent[:80] + "…" if len(sent) > 80 else sent)


            # PCA needs at least two samples for a 2-component projection.
            if len(all_embeddings) < 2:
                st.warning("Not enough points to plot (need at least 2). Try more sentences or token-level mode.")
            else:
                X = np.array(all_embeddings)
                pca = PCA(n_components=2)
                reduced = pca.fit_transform(X)


                fig = px.scatter(
                    x=reduced[:, 0],
                    y=reduced[:, 1],
                    text=all_labels,
                    title="BERT embeddings (PCA 2D)",
                )
                fig.update_traces(textposition="top center", mode="markers+text", textfont_size=9)
                fig.update_layout(
                    xaxis_title="PC1",
                    yaxis_title="PC2",
                    height=600,
                    showlegend=False,
                )
                st.plotly_chart(fig, use_container_width=True)
                st.caption(f"Points: {len(all_labels)} | Variance explained: {pca.explained_variance_ratio_.sum():.1%}")
|
|
| |
| |
| |
# ---------------------------------------------------------------------------
# Section: Fill-mask — masked-LM predictions via the HF fill-mask pipeline.
# ---------------------------------------------------------------------------
if active_tab == "Fill-mask":
    st.subheader("Masked language modeling (pipeline: fill-mask)")
    st.caption("For English BERT, use [MASK]. For RoBERTa-like models, mask token differs (e.g. <mask>).")


    with st.spinner("Loading fill-mask pipeline…"):
        fill_mask = load_fill_mask(model_name)


    # The mask token depends on the model family ([MASK] for BERT, <mask> for RoBERTa).
    mask_token = getattr(fill_mask.tokenizer, "mask_token", "[MASK]")
    st.write({"mask_token": mask_token})


    default_prompt = f"Peintre officiel de la marine et fondateur de la société {mask_token} des artistes français"
    prompt = st.text_area("Prompt with a mask token", value=default_prompt, height=90)


    top_k = st.slider("top_k", 1, 20, 5)


    if st.button("Run fill-mask"):
        try:
            results = fill_mask(prompt, top_k=top_k)
            # With a single mask the pipeline returns List[dict]; with several
            # mask tokens in the prompt it returns List[List[dict]]. Flatten so
            # both shapes render instead of crashing on the nested case.
            if results and isinstance(results[0], list):
                rows = [r for group in results for r in group]
            else:
                rows = results
            out_df = pd.DataFrame(
                [
                    {
                        "sequence": r.get("sequence"),
                        "score": float(r.get("score", 0.0)),
                        "token_str": r.get("token_str"),
                    }
                    for r in rows
                ]
            )
            st.dataframe(out_df, use_container_width=True, height=300)
        except Exception as e:
            st.error(f"fill-mask failed: {e}")
            st.info("Tip: make sure your prompt uses the right mask token for the selected model.")