BERT_Example / app.py
MegGup's picture
Create app.py
7f4db09 verified
Raw
History Blame Contribute Delete
1.56 kB
import gradio as gr
import torch
from transformers import BertTokenizer, BertModel
import re
import numpy as np
# Load the BERT tokenizer and encoder once at import time so every request
# reuses the same objects; both come from the same checkpoint name.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)
# Text preprocessing: remove non-ASCII chars and lowercase
# Matches any run of characters outside the 7-bit ASCII range.
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7F]+')


def preprocess_text(text):
    """Return *text* with non-ASCII characters removed and lowercased.

    Args:
        text: The string to clean.

    Returns:
        The ASCII-only, lowercased version of ``text``.
    """
    ascii_only = _NON_ASCII_RUN.sub('', text)
    lowered = ascii_only.lower()
    return lowered
# Function to tokenize and generate embeddings
def generate_embeddings(input_text):
    """Tokenize text with BERT and list each token with a truncated embedding.

    The text is cleaned with ``preprocess_text``, tokenized with the
    module-level ``tokenizer``, and run through the module-level ``model``
    with gradient tracking disabled.

    Args:
        input_text: Raw text from the UI. ``None`` is treated as empty text.

    Returns:
        A newline-separated string with one ``token: [v1, v2, ...]`` line per
        token returned by the tokenizer. Each line shows the first five
        values of that token's ``last_hidden_state`` vector, formatted to
        four decimal places.
    """
    # A cleared textbox may arrive as None; re.sub would raise on it.
    cleaned_text = preprocess_text(input_text or "")

    # truncation=True caps the sequence at the model's maximum length.
    # Without it, over-long input fails inside the forward pass instead of
    # producing output for the leading tokens.
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True)

    with torch.no_grad():
        outputs = model(**inputs)

    # Only one text is encoded, so take row 0 of the ids and drop the
    # matching leading axis of the hidden states.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    embeddings = outputs.last_hidden_state.squeeze(0)

    token_embedding_pairs = []
    for token, embedding in zip(tokens, embeddings):
        # Show only the first 5 dimensions to keep the output readable.
        vector_str = ', '.join(f"{v:.4f}" for v in embedding[:5].tolist())
        token_embedding_pairs.append(f"{token}: [{vector_str}...]")
    return "\n".join(token_embedding_pairs)
# Gradio UI
iface = gr.Interface(
    fn=generate_embeddings,
    inputs=gr.Textbox(lines=4, placeholder="Enter text here..."),
    outputs="text",
    title="BERT Token Embeddings Viewer",
    # preprocess_text strips *non*-ASCII characters; the description shown to
    # users must say so rather than claiming ASCII characters are removed.
    description="This app removes non-ASCII chars, lowercases text, tokenizes using BERT, and shows tokens with embeddings (truncated).",
)
def main():
    """Start the Gradio server for the embeddings viewer."""
    iface.launch()


# Only launch when executed as a script, not when imported.
if __name__ == "__main__":
    main()