|
import gradio as gr |
|
import torch |
|
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer |
|
|
|
|
|
# Mapping from the model's numeric class ids (emitted as "LABEL_<n>" by the
# token-classification pipeline) to human-readable Urdu entity names.
# English glosses are given per entry for maintainers who don't read Urdu.
LABEL_MAP = {

    0: "تاریخ",  # date

    1: "عہدہ",  # designation / position

    2: "مقام",  # location

    3: "نمبر",  # number

    4: "تنظیم",  # organization

    5: "غیر متعلقہ",  # irrelevant / non-entity (treated as plain text downstream)

    6: "نام",  # person name

    7: "وقت",  # time

}
|
|
|
|
|
# Hugging Face Hub id of the pre-trained Urdu NER checkpoint.
MODEL_NAME = "blaikhole/distilbert-urdu-ner"




# transformers pipeline convention: 0 = first CUDA device, -1 = CPU.
device = 0 if torch.cuda.is_available() else -1

# NOTE(review): pipeline(..., device=device) below also places the model on the
# selected device, so this explicit .to() looks redundant — confirm before removing.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME).to("cuda" if device == 0 else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)




# Token-classification pipeline used by analyze_text(); returns one dict per
# sub-token with keys including "word" and "entity" (e.g. "LABEL_3").
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, device=device)
|
|
|
def merge_subwords_and_decode(entities):
    """Collapse WordPiece sub-tokens into whole words and decode label ids.

    Each entity dict is expected to carry a ``"word"`` (possibly a ``##``
    continuation piece) and an ``"entity"`` of the form ``"LABEL_<n>"``.
    Adjacent words sharing a label are joined with spaces into one span.
    The "غیر متعلقہ" (non-entity) class is mapped to ``None`` so the UI
    renders it as plain, unhighlighted text.

    Returns a list of ``(text, label)`` tuples.
    """
    spans = []
    pending_text = ""
    pending_tag = None

    for token in entities:
        piece = token["word"]
        tag_id = int(token["entity"].replace("LABEL_", ""))
        tag = LABEL_MAP.get(tag_id, "OTHER")

        # Non-entity tokens carry no highlight label.
        if tag == "غیر متعلقہ":
            tag = None

        # WordPiece continuation: glue onto the word in progress, keeping
        # whatever tag that word already has.
        if piece.startswith("##"):
            pending_text += piece[2:]
            continue

        # Same tag as the span in progress: extend it with a space.
        if pending_text and pending_tag == tag:
            pending_text = pending_text + " " + piece
            continue

        # Tag changed (or first word): flush the finished span, start anew.
        if pending_text:
            spans.append((pending_text, pending_tag))
        pending_text = piece
        pending_tag = tag

    # Flush the final span, if any.
    if pending_text:
        spans.append((pending_text, pending_tag))
    return spans
|
|
|
|
|
|
|
|
|
def analyze_text(text):
    """Run the NER pipeline on *text* and return merged ``(span, label)``
    tuples suitable for ``gr.HighlightedText``."""
    return merge_subwords_and_decode(ner_pipeline(text))
|
|
|
|
|
EXAMPLE_SENT = "بات یہ ہے تین بجے احمد لاہور میں بطور مینیجر گوگل کمپنی کے آفس میں گیا۔" |
|
|
|
|
|
# Build and launch the Gradio UI. Component creation order fixes the layout:
# title, input box, run button, highlighted output.
with gr.Blocks() as demo:
    gr.Markdown("# 🏥 Urdu language Named Entity Recognition (NER)")

    # Input textbox, pre-filled with the example sentence.
    urdu_input = gr.Textbox(label="Enter Text", lines=5, value=EXAMPLE_SENT)
    run_button = gr.Button("Run NER Model")
    # combine_adjacent merges neighbouring spans that share the same label.
    result_view = gr.HighlightedText(label="NER Result", combine_adjacent=True)

    # Wire the button to the inference function.
    run_button.click(analyze_text, inputs=[urdu_input], outputs=[result_view])

demo.launch()
|
|