"""Gradio demo: tokenize text with BERT and show truncated token embeddings."""

import re

import gradio as gr
import numpy as np
import torch
from transformers import BertModel, BertTokenizer

# Load BERT tokenizer and model once at import time so every request reuses them.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Matches runs of characters outside the 7-bit ASCII range.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')

# Number of leading embedding dimensions shown per token in the UI.
_DISPLAY_DIMS = 5


# Text preprocessing: remove non-ASCII chars and lowercase
def preprocess_text(text):
    """Return *text* with non-ASCII characters removed, converted to lowercase.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    return _NON_ASCII_RE.sub('', text).lower()


# Function to tokenize and generate embeddings
def generate_embeddings(input_text):
    """Tokenize *input_text* with BERT and format each token's embedding.

    The text is cleaned with ``preprocess_text``, tokenized, and run through
    the model without gradient tracking. For every token (including the
    special tokens the tokenizer adds) the first ``_DISPLAY_DIMS`` values of
    its last-hidden-state vector are rendered.

    Args:
        input_text: Text entered by the user. ``None`` is treated as empty.

    Returns:
        A newline-joined string with one ``token: [v1, v2, ...]`` line per
        token.
    """
    # Defensive: treat a missing value as empty text instead of failing in re.
    cleaned_text = preprocess_text(input_text or "")

    # Truncate to the size of the model's position-embedding table; longer
    # sequences would make the forward pass fail.
    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # Batch size is always 1 here, so index 0 selects the only sequence.
    token_ids = inputs["input_ids"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    embeddings = outputs.last_hidden_state[0]

    # Truncate to the first few dims for display.
    leading_values = embeddings[:, :_DISPLAY_DIMS].tolist()

    token_embedding_pairs = []
    for token, vector in zip(tokens, leading_values):
        vector_str = ', '.join(f"{v:.4f}" for v in vector)
        token_embedding_pairs.append(f"{token}: [{vector_str}...]")
    return "\n".join(token_embedding_pairs)


# Gradio UI
iface = gr.Interface(
    fn=generate_embeddings,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs="text",
    title="BERT Token Embeddings Viewer",
    description=(
        "This app removes non-ASCII chars, lowercases text, tokenizes using "
        "BERT, and shows tokens with embeddings (truncated)."
    ),
)

if __name__ == "__main__":
    iface.launch()