Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import BertTokenizer, BertModel | |
| import re | |
| import numpy as np | |
# Load the BERT tokenizer and encoder once at import time so every request
# reuses the same objects. Both come from the same checkpoint name.
PRETRAINED_CHECKPOINT = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)
# Text preprocessing: strip non-ASCII characters, then lowercase
def preprocess_text(text):
    """Return *text* with every non-ASCII character removed, lowercased.

    Characters whose code point is above 0x7F are dropped outright (nothing
    is substituted for them), so their neighbours become adjacent.
    """
    ascii_only = "".join(ch for ch in text if ord(ch) <= 0x7F)
    return ascii_only.lower()
# Function to tokenize and generate embeddings
def generate_embeddings(input_text):
    """Return a printable listing of BERT tokens and their embeddings.

    The text is cleaned with ``preprocess_text``, tokenized, and run through
    the model without gradient tracking. The result has one line per token
    (including the special tokens the tokenizer adds), formatted as
    ``token: [v1, v2, v3, v4, v5...]`` where the values are the first five
    components of that token's last-hidden-state vector.

    Input longer than the model's position-embedding table is truncated
    rather than passed through, because the forward pass fails on sequences
    that exceed it.
    """
    cleaned_text = preprocess_text(input_text)

    # Cap the sequence at the number of positions the model was built with;
    # text pasted into the textbox can easily exceed it.
    inputs = tokenizer(
        cleaned_text,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # The batch holds a single sequence, so take row 0 of ids and squeeze
    # the batch axis off the hidden states.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    embeddings = outputs.last_hidden_state.squeeze(0)

    lines = []
    for token, embedding in zip(tokens, embeddings):
        # Truncate to the first 5 dims for display only.
        vector_str = ", ".join(f"{v:.4f}" for v in embedding[:5].numpy())
        lines.append(f"{token}: [{vector_str}...]")
    return "\n".join(lines)
# Gradio UI
# One multi-line textbox in, plain text out; each submission is handed to
# generate_embeddings and its returned string is displayed as-is.
iface = gr.Interface(
    fn=generate_embeddings,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs="text",
    title="BERT Token Embeddings Viewer",
    # The preprocessing step strips *non*-ASCII characters; the description
    # previously claimed the opposite.
    description=(
        "This app removes non-ASCII chars, lowercases text, tokenizes using "
        "BERT, and shows tokens with embeddings (truncated)."
    ),
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()