urdu-ner / app.py
JalalHxmi's picture
Update app.py
342c209 verified
import gradio as gr
import torch
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
# Maps the numeric suffix of a predicted "LABEL_<id>" tag to its Urdu display
# name. Each name's position in the list below is its label ID.
LABEL_MAP = dict(
    enumerate(
        [
            "تاریخ",  # 0: date
            "عہدہ",  # 1: designation / job title
            "مقام",  # 2: location
            "نمبر",  # 3: number
            "تنظیم",  # 4: organization
            "غیر متعلقہ",  # 5: irrelevant (rendered as plain text downstream)
            "نام",  # 6: name
            "وقت",  # 7: time
        ]
    )
)
# Hugging Face Hub checkpoint served by this app; swap in your own model ID.
MODEL_NAME = "blaikhole/distilbert-urdu-ner"

# Device index handed to pipeline(): 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Load the tokenizer and the token-classification model from the same checkpoint,
# then move the model onto the device selected above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
model = model.to("cpu" if device != 0 else "cuda")

# Token-level NER pipeline used by analyze_text.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
def _decode_label(tag, label_map, plain_label):
    """Return the display label for a raw tag, or None when it is plain text."""
    try:
        label_id = int(str(tag).replace("LABEL_", ""))
    except ValueError:
        # Tag is not of the form "LABEL_<n>": show the token as plain text
        # instead of failing the whole request.
        return None
    label = label_map.get(label_id)  # None for IDs missing from the mapping
    return None if label == plain_label else label


def merge_subwords_and_decode(entities, label_map=None):
    """Merge sub-word tokens and adjacent same-label words into segments.

    Args:
        entities: Iterable of token predictions. Each item is a mapping with
            a ``"word"`` entry (token text; a leading ``##`` marks a
            continuation of the previous token) and an ``"entity"`` entry of
            the form ``"LABEL_<id>"``.
        label_map: Optional ``{id: label}`` mapping used to decode tag IDs.
            Defaults to the module-level ``LABEL_MAP``.

    Returns:
        A list of ``(text, label)`` tuples in input order. ``label`` is
        ``None`` for plain text, which covers tokens labelled "غیر متعلقہ",
        tokens whose ID is missing from the mapping, and tokens whose tag
        cannot be parsed. Consecutive words with the same label are joined
        with a single space into one segment.
    """
    if label_map is None:
        label_map = LABEL_MAP

    plain_label = "غیر متعلقہ"  # the "irrelevant" class is shown unhighlighted
    merged_entities = []
    current_word = ""
    current_label = None

    for entity in entities:
        word = entity["word"]
        label = _decode_label(entity["entity"], label_map, plain_label)

        is_continuation = word.startswith("##")
        if is_continuation:
            word = word[2:]

        if is_continuation and current_word:
            # Glue the piece onto the word in progress without a space; the
            # label of the segment already being built is kept.
            current_word += word
        elif current_word and label == current_label:
            # Same label as the running segment: extend it with a space.
            current_word += " " + word
        else:
            # Label changed (or nothing is in progress, including a stray
            # leading continuation piece): flush and start a new segment.
            if current_word:
                merged_entities.append((current_word, current_label))
            current_word = word
            current_label = label

    if current_word:
        merged_entities.append((current_word, current_label))
    return merged_entities
# 🔹 Function to Run NER
def analyze_text(text):
    """Run NER on *text* and return display segments.

    Args:
        text: Raw input string from the UI. May be empty or ``None``.

    Returns:
        The list produced by ``merge_subwords_and_decode`` for the pipeline's
        token predictions, or an empty list when *text* is empty, ``None`` or
        only whitespace (the model is not invoked in that case).
    """
    # Nothing to tag: skip the model call entirely.
    if not text or not text.strip():
        return []

    entities = ner_pipeline(text)  # raw per-token predictions
    return merge_subwords_and_decode(entities)  # decode labels, merge sub-words
# Sample sentence pre-filled in the input box.
EXAMPLE_SENT = "بات یہ ہے تین بجے احمد لاہور میں بطور مینیجر گوگل کمپنی کے آفس میں گیا۔"

# Gradio UI: a text box, a run button, and a highlighted-text view of the result.
with gr.Blocks() as demo:
    gr.Markdown("# 🏥 Urdu language Named Entity Recognition (NER)")
    text_input = gr.Textbox(value=EXAMPLE_SENT, lines=5, label="Enter Text")
    analyze_button = gr.Button("Run NER Model")
    output = gr.HighlightedText(combine_adjacent=True, label="NER Result")
    # On click, pass the text box contents to analyze_text and render its output.
    analyze_button.click(fn=analyze_text, inputs=text_input, outputs=output)

demo.launch()