File size: 2,857 Bytes
5ced4bf
 
 
 
 
 
c3d555a
76f16fa
 
 
 
7b97805
76f16fa
 
5ced4bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b97805
5ced4bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342c209
5ced4bf
 
 
c3d555a
5ced4bf
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
import torch
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer

# 🔥 Define Label Mapping Dictionary
# Class index -> Urdu display label. The index is the <n> of the "LABEL_<n>"
# tags parsed by merge_subwords_and_decode; tuple position supplies the key.
LABEL_MAP = dict(
    enumerate(
        (
            "تاریخ",  # 0: date
            "عہدہ",  # 1: designation / position
            "مقام",  # 2: location
            "نمبر",  # 3: number
            "تنظیم",  # 4: organization
            "غیر متعلقہ",  # 5: irrelevant (rendered as plain text by the decoder)
            "نام",  # 6: name
            "وقت",  # 7: time
        )
    )
)

# 🔹 Replace with your Hugging Face Model
MODEL_NAME = "blaikhole/distilbert-urdu-ner"

# Decide the compute target once. The pipeline is given a device index
# (0 when CUDA is available, otherwise -1) while the model is moved with the
# matching device string ("cuda" / "cpu").
_cuda_ready = torch.cuda.is_available()
device = 0 if _cuda_ready else -1

# Load the token-classification weights and the tokenizer for MODEL_NAME.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
model = model.to("cuda" if _cuda_ready else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# NER pipeline assembled from the objects loaded above.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

def merge_subwords_and_decode(entities):
    """Merge '##'-prefixed subword tokens and group tokens into labelled segments.

    Args:
        entities: Iterable of dicts, each with a "word" key (token text) and
            an "entity" key of the form "LABEL_<n>", where <n> is the integer
            class index looked up in LABEL_MAP.

    Returns:
        A list of (text, label) tuples in input order. ``label`` is the
        LABEL_MAP display string, or None for text that must not be
        highlighted: the "غیر متعلقہ" class and any class index that is
        missing from LABEL_MAP.

    Raises:
        ValueError: If an "entity" value is not "LABEL_" followed by an
            integer.
    """
    merged_entities = []
    current_word = ""
    current_label = None

    for entity in entities:
        word = entity["word"]
        label_id = int(entity["entity"].replace("LABEL_", ""))

        # No fallback string here: an index missing from LABEL_MAP yields None,
        # so unknown classes are rendered as plain text instead of being
        # highlighted under a made-up "OTHER" label.
        label = LABEL_MAP.get(label_id)
        if label == "غیر متعلقہ":
            label = None

        if word.startswith("##"):
            # Continuation piece: glue it onto the running segment without a
            # space; the segment keeps the label of its first piece.
            current_word += word[2:]
        elif current_word and current_label == label:
            # Same label as the running segment: extend it, space-separated.
            current_word += " " + word
        else:
            # Label changed (or this is the first token): flush the finished
            # segment, if any, and start a new one.
            if current_word:
                merged_entities.append((current_word, current_label))
            current_word = word
            current_label = label

    # Flush the trailing segment.
    if current_word:
        merged_entities.append((current_word, current_label))
    return merged_entities



# 🔹 Function to Run NER
def analyze_text(text):
    """Run the NER pipeline on ``text`` and return highlight-ready segments.

    Args:
        text: Raw input string (the UI textbox value).

    Returns:
        The list of (segment, label) tuples produced by
        merge_subwords_and_decode, or an empty list when ``text`` is None,
        empty, or whitespace-only.
    """
    # Nothing to tag: skip model inference for blank input.
    if not text or not text.strip():
        return []

    entities = ner_pipeline(text)  # raw per-token predictions
    return merge_subwords_and_decode(entities)  # merge subwords, map label ids

# 🔹 Example Sentence
EXAMPLE_SENT = "بات یہ ہے تین بجے احمد  لاہور میں بطور مینیجر گوگل کمپنی  کے آفس میں گیا۔"

# Page layout: a heading, a textbox pre-filled with EXAMPLE_SENT, a button
# that triggers analyze_text, and a highlighted-text view for its result.
with gr.Blocks() as demo:
    gr.Markdown("# 🏥 Urdu language Named Entity Recognition (NER)")

    text_input = gr.Textbox(
        label="Enter Text",
        lines=5,
        value=EXAMPLE_SENT,
    )
    analyze_button = gr.Button("Run NER Model")
    output = gr.HighlightedText(
        label="NER Result",
        combine_adjacent=True,
    )

    # Clicking the button sends the textbox value through analyze_text and
    # writes the returned segments into the highlighted-text component.
    analyze_button.click(
        fn=analyze_text,
        inputs=[text_input],
        outputs=[output],
    )

demo.launch()