# Hugging Face Spaces page residue (status header): "Spaces — Sleeping".
# Kept as a comment so the file remains valid Python.
import re

import torch
import gradio as gr
from transformers import BertTokenizer, BertModel

# Load tokenizer and model once at import time; both are module-level
# globals read by process_text below. Downloads weights on first run.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def process_text(text):
    """Tokenize *text* with BERT and return per-token embedding previews.

    The input is stripped of non-ASCII characters and lowercased, then
    tokenized and run through BERT without gradient tracking.

    Args:
        text: Raw input string (e.g. from the Gradio textbox).

    Returns:
        A list of ``[token, truncated_embedding_str]`` rows, one per
        (sub)token including the special [CLS]/[SEP] tokens; only the
        first 5 embedding dimensions are shown for readability.
    """
    # Remove non-ASCII characters and lowercase.
    # (The regex keeps \x00-\x7F, i.e. it strips everything *outside* ASCII.)
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text).lower()
    # Tokenize. truncation=True caps the sequence at the model's maximum
    # length (512 for bert-base); without it, long inputs overflow the
    # position-embedding table and crash the forward pass.
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    # Get BERT embeddings (inference only — no autograd graph).
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_size)
    # Pair each token with its embedding (truncated for display).
    token_embeddings = []
    for token, emb in zip(tokens, embeddings):
        token_embeddings.append([token, str(emb[:5].tolist()) + '...'])
    return token_embeddings
# Gradio interface: one textbox in, a token/embedding table out.
gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs=gr.Dataframe(headers=["Token", "Embedding (truncated)"]),
    title="BERT Tokenizer & Embeddings Viewer",
    # Fixed description: the cleaning regex removes NON-ASCII characters
    # (the original text inverted this and misled users).
    description="Removes non-ASCII characters, lowercases text, tokenizes using BERT, and shows token embeddings.",
).launch()