Spaces:
Running
Running
| import gradio as gr | |
| import torch | |
| import sentencepiece as spm | |
| from transformers import RobertaForTokenClassification | |
| from huggingface_hub import hf_hub_download | |
| import csv | |
| import io | |
| from collections import Counter | |
# Hugging Face Hub repository id; both the model weights and the
# SentencePiece tokenizer file below are fetched from it.
MODEL_ID = "hellosindh/sindhi-bert-ner"

# Load the token-classification model once at import time and put it in
# evaluation mode, since this app only runs inference.
print("Loading model...", flush=True)
model = RobertaForTokenClassification.from_pretrained(MODEL_ID)
model.eval()

# The tokenizer is a raw SentencePiece model file stored in the same repo.
print("Loading tokenizer...", flush=True)
sp_path = hf_hub_download(repo_id=MODEL_ID, filename="sindhi_bpe_32k.model")
sp = spm.SentencePieceProcessor()
sp.Load(sp_path)
print("✅ Ready!", flush=True)

# Maps a predicted class index to its tag string (predict_ner relies on the
# "B-<TYPE>" / "I-<TYPE>" / other naming of these tags).
ID2TAG = model.config.id2label
# Ids placed before and after the encoded words in predict_ner.
# NOTE(review): presumably the BOS/EOS ids the model was trained with —
# confirm against the tokenizer / model config.
BOS_ID = 2
EOS_ID = 3
# ── Entity config — Sindhi labels ─────────────────
def _entity_style(color, sindhi):
    """Return the display config for one entity type.

    Args:
        color: Accent colour as a ``#rrggbb`` hex string.
        sindhi: Sindhi label shown in the UI for this type.

    Returns:
        A dict with ``color``, ``bg`` (the same RGB at 0.15 alpha) and
        ``sindhi`` keys, in that order.
    """
    red, green, blue = (int(color[pos:pos + 2], 16) for pos in (1, 3, 5))
    return {
        "color": color,
        "bg": f"rgba({red},{green},{blue},0.15)",
        "sindhi": sindhi,
    }


# Per-type colours and Sindhi labels, keyed by the tag name without its
# "B-" / "I-" prefix.
ENTITY_CONFIG = {
    etype: _entity_style(color, sindhi)
    for etype, color, sindhi in (
        ("PERSON", "#c084fc", "ماڻهو"),
        ("LOCATION", "#818cf8", "جڳهه"),
        ("ORGANIZATION", "#38bdf8", "ادارو"),
        ("DATE_TIME", "#34d399", "تاريخ"),
        ("EVENT", "#fbbf24", "واقعو"),
        ("TITLE", "#fb923c", "لقب"),
    )
}

# Used for any entity type missing from ENTITY_CONFIG; unknown entities are
# deliberately shown with the same label as ORGANIZATION.
FALLBACK_CFG = _entity_style("#6b7280", "ادارو")
def predict_ner(sentence):
    """Tag a sentence and build every UI output for it.

    The sentence is split on whitespace and each word is encoded with the
    SentencePiece tokenizer. The model's prediction for a word's FIRST
    sub-token becomes that word's tag and confidence. A ``B-X`` word followed
    by consecutive ``I-X`` words is merged into one entity whose score is the
    mean of its words' confidences; every other word is rendered as plain
    text.

    All user-supplied text is HTML-escaped before being placed in the
    highlighted markup, so input such as ``<script>`` is displayed literally
    instead of being interpreted by the browser.

    NOTE(review): the input is not truncated; very long text may exceed the
    model's maximum sequence length — confirm and add a limit if needed.

    Args:
        sentence: Raw text from the input box (``None`` or blank is treated
            as "no input").

    Returns:
        A 5-tuple in the order of the wired Gradio outputs: highlighted
        HTML, summary HTML, confidence HTML, CSV file path (or ``None``),
        and a ``gr.update`` controlling the legend's value and visibility.
    """
    # Imported locally so this function carries its own dependency.
    from html import escape

    if not sentence or not sentence.strip():
        return _empty_html(), _empty_summary(), "", None, gr.update(visible=False)

    words = sentence.split()

    # word_map[k] is the index of the word whose first sub-token sits at
    # position k, or -1 for BOS/EOS and continuation sub-tokens.
    input_ids = [BOS_ID]
    word_map = [-1]
    for word_idx, word in enumerate(words):
        pieces = sp.EncodeAsIds(word)
        if not pieces:
            # Nothing to tag; the word falls back to "O" below.
            continue
        input_ids.extend(pieces)
        word_map.append(word_idx)
        word_map.extend([-1] * (len(pieces) - 1))
    input_ids.append(EOS_ID)
    word_map.append(-1)

    with torch.no_grad():
        logits = model(torch.tensor([input_ids])).logits[0]
        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(logits, dim=-1).tolist()
        conf = probs.max(dim=-1).values.tolist()

    word_tags = {}
    word_conf = {}
    for pos, (pred, wid) in enumerate(zip(preds, word_map)):
        if wid >= 0:
            word_tags[wid] = ID2TAG[pred]
            word_conf[wid] = conf[pos]

    entities = []
    html_words = []
    i = 0
    while i < len(words):
        tag = word_tags.get(i, "O")
        if not tag.startswith("B-"):
            # Plain word (this includes an "I-" tag with no opening "B-").
            html_words.append(
                f'<span style="color:#cbd5e1;padding:2px 4px;">{escape(words[i])}</span>'
            )
            i += 1
            continue

        etype = tag[2:]
        inside_tag = f"I-{etype}"
        entity_words = [words[i]]
        scores = [word_conf.get(i, 0)]
        j = i + 1
        # Absorb the run of matching "I-" words that directly follows.
        while j < len(words) and word_tags.get(j, "O") == inside_tag:
            entity_words.append(words[j])
            scores.append(word_conf.get(j, 0))
            j += 1

        entity_text = " ".join(entity_words)
        avg_score = sum(scores) / len(scores)
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        html_words.append(
            f'<span style="'
            f'background:{cfg["bg"]};'
            f'border:1px solid {cfg["color"]}50;'
            f'color:#f1f5f9;'
            f'padding:4px 12px 4px 8px;'
            f'border-radius:8px;margin:3px;'
            f'display:inline-block;font-weight:500;">'
            f'<span style="'
            f'background:{cfg["color"]};color:#0a0a1a;'
            f'font-size:0.62em;font-weight:800;'
            f'padding:2px 7px;border-radius:4px;'
            f'margin-left:7px;vertical-align:middle;">'
            f'{cfg["sindhi"]}</span>'
            f'{escape(entity_text)}'
            f'</span>'
        )
        # The raw (unescaped) text is kept here: it also feeds the CSV export.
        entities.append({
            "text": entity_text,
            "type": etype,
            "sindhi": cfg["sindhi"],
            "score": avg_score,
            "color": cfg["color"],
        })
        i = j

    highlighted = f"""
<div style="
background:linear-gradient(135deg,#1a0533 0%,#0f0f2e 100%);
border:1px solid #7c3aed30;border-radius:16px;
padding:24px 28px;
font-size:1.3em;
line-height:3.2em;
direction:rtl;text-align:right;
font-family:'Lateef','Scheherazade New',serif;
min-height:90px;">
{" ".join(html_words)}
</div>
"""
    summary = _build_summary(entities)
    conf_html = _build_confidence(entities)
    csv_file = _build_csv(entities)
    legend = _build_legend(entities) if entities else ""
    return highlighted, summary, conf_html, csv_file, gr.update(value=legend, visible=bool(entities))
| def _empty_html(): | |
| return """ | |
| <div style=" | |
| background:linear-gradient(135deg,#1a0533,#0f0f2e); | |
| border:1px solid #7c3aed20;border-radius:16px; | |
| padding:40px;text-align:center;min-height:90px; | |
| display:flex;align-items:center;justify-content:center;"> | |
| <span style="color:#4c1d95;font-size:1.2em; | |
| font-family:'Lateef','Scheherazade New',serif;"> | |
| ڪو بہ سنڌي جملو لکو | |
| </span> | |
| </div> | |
| """ | |
| def _empty_summary(): | |
| return """ | |
| <div style=" | |
| background:#1a0533;border:1px solid #7c3aed20; | |
| border-radius:16px;padding:24px; | |
| text-align:center;color:#4c1d95; | |
| font-size:1.1em; | |
| font-family:'Lateef','Scheherazade New',serif;"> | |
| اعتماد جوڳا نتيجا نہ مليا | |
| </div> | |
| """ | |
def _build_summary(entities):
    """Render one count card per entity type, most frequent type first.

    Args:
        entities: Entity dicts as built by ``predict_ner``; only the
            ``"type"`` key is read here.

    Returns:
        HTML for the summary panel, or the empty-summary placeholder when
        *entities* is empty.
    """
    if not entities:
        return _empty_summary()

    type_counts = Counter(ent["type"] for ent in entities)
    card_parts = []
    # most_common() sorts by count, descending; ties keep first-seen order.
    for etype, cnt in type_counts.most_common():
        cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG)
        card_parts.append(f"""
<div style="
background:{cfg['bg']};border:1px solid {cfg['color']}40;
border-radius:10px;padding:10px 14px;
display:flex;justify-content:space-between;
align-items:center;margin-bottom:8px;direction:rtl;">
<span style="color:{cfg['color']};font-weight:600;font-size:1em;
font-family:'Lateef','Scheherazade New',serif;">
{cfg['sindhi']}
</span>
<span style="
background:{cfg['color']};color:#0a0a1a;
font-weight:800;border-radius:20px;
padding:1px 10px;font-size:0.85em;
min-width:24px;text-align:center;">
{cnt}
</span>
</div>
""")
    cards = "".join(card_parts)

    # No overall-total header — just the cards directly.
    return f"""
<div style="
background:linear-gradient(135deg,#1a0533,#0f0f2e);
border:1px solid #7c3aed30;border-radius:16px;
padding:16px 14px;">
{cards}
</div>
"""
def _build_confidence(entities):
    """Render a labelled confidence bar for every detected entity.

    The entity text originates from user input, so it is HTML-escaped before
    being placed in the markup; the remaining interpolated values come from
    the module's own entity configuration.

    Args:
        entities: Entity dicts as built by ``predict_ner``. Reads ``"text"``,
            ``"type"``, ``"sindhi"`` and ``"score"`` (assumed to be a
            probability in ``[0, 1]``; it is shown as a whole percentage,
            truncated rather than rounded).

    Returns:
        HTML for the confidence panel, or ``""`` when *entities* is empty.
    """
    # Imported locally so this function carries its own dependency.
    from html import escape

    if not entities:
        return ""

    bar_parts = []
    for ent in entities:
        cfg = ENTITY_CONFIG.get(ent["type"], FALLBACK_CFG)
        pct = int(ent["score"] * 100)
        safe_text = escape(ent["text"])
        bar_parts.append(f"""
<div style="margin-bottom:16px;direction:rtl;">
<div style="display:flex;justify-content:space-between;
align-items:center;margin-bottom:6px;">
<span style="color:#e2e8f0;font-size:1.1em;font-weight:500;
font-family:'Lateef','Scheherazade New',serif;">
{safe_text}
</span>
<div style="display:flex;gap:8px;align-items:center;">
<span style="
background:{cfg['color']}18;
border:1px solid {cfg['color']}40;
color:{cfg['color']};
font-size:0.85em;padding:2px 8px;
border-radius:4px;font-weight:700;
font-family:'Lateef','Scheherazade New',serif;">
{ent['sindhi']}
</span>
<span style="color:{cfg['color']};
font-weight:800;font-size:0.88em;
font-family:monospace;">
{pct}%
</span>
</div>
</div>
<div style="background:#1e1040;border-radius:999px;
height:5px;overflow:hidden;">
<div style="width:{pct}%;height:100%;
background:linear-gradient(90deg,{cfg['color']}60,{cfg['color']});
border-radius:999px;">
</div>
</div>
</div>
""")
    bars = "".join(bar_parts)

    return f"""
<div style="
background:linear-gradient(135deg,#1a0533,#0f0f2e);
border:1px solid #7c3aed30;border-radius:16px;
padding:20px 18px;margin-top:4px;">
<div style="color:#c084fc;font-weight:700;font-size:0.9em;
margin-bottom:16px;padding-bottom:10px;
border-bottom:1px solid #7c3aed25;
direction:rtl;text-align:right;
font-family:'Lateef','Scheherazade New',serif;">
اعتماد
</div>
{bars}
</div>
"""
| def _build_legend(entities): | |
| """Show only entity types found in this result.""" | |
| found_types = list(dict.fromkeys(e["type"] for e in entities)) # preserve order | |
| items = "" | |
| for etype in found_types: | |
| cfg = ENTITY_CONFIG.get(etype, FALLBACK_CFG) | |
| items += ( | |
| f'<span style="background:{cfg["bg"]};' | |
| f'border:1px solid {cfg["color"]}40;' | |
| f'color:{cfg["color"]};padding:5px 14px;' | |
| f'border-radius:6px;font-size:1em;font-weight:600;' | |
| f'font-family:\'Lateef\',\'Scheherazade New\',serif;">' | |
| f'{cfg["sindhi"]}</span>' | |
| ) | |
| return f""" | |
| <div style=" | |
| background:linear-gradient(135deg,#1a0533,#0f0f2e); | |
| border:1px solid #7c3aed20;border-radius:14px; | |
| padding:14px 18px;margin-top:4px;"> | |
| <div style="display:flex;flex-wrap:wrap;gap:8px;direction:rtl;"> | |
| {items} | |
| </div> | |
| </div> | |
| """ | |
| def _build_csv(entities): | |
| if not entities: | |
| return None | |
| output = io.StringIO() | |
| writer = csv.writer(output) | |
| writer.writerow(["Entity", "Type", "Sindhi Type", "Confidence"]) | |
| for ent in entities: | |
| writer.writerow([ | |
| ent["text"], ent["type"], | |
| ent["sindhi"], f"{ent['score']*100:.1f}%" | |
| ]) | |
| path = "/tmp/sindhi_ner.csv" | |
| with open(path, "w", encoding="utf-8-sig", newline="") as f: | |
| f.write(output.getvalue()) | |
| return path | |
# Stylesheet passed to gr.Blocks(css=...).
# The base font rule targets only body / .gradio-container. A stylesheet
# declaration marked !important outranks a non-important inline style, so a
# universal `*` selector here would force Outfit onto every element and
# defeat the inline `font-family:'Lateef',...` styles used in the generated
# HTML panels. Children now inherit Outfit and may override it inline.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Lateef:wght@400;700&family=Scheherazade+New:wght@400;700&family=Outfit:wght@400;600;700;800&display=swap');
/* Base — Outfit for UI chrome only */
body, .gradio-container {
font-family: 'Outfit', sans-serif !important;
}
body, .gradio-container {
background: #08081a !important;
}
.gradio-container {
max-width: 980px !important;
margin: 0 auto !important;
padding: 16px !important;
}
/* Labels */
label > span {
color: #9333ea !important;
font-size: 0.82em !important;
font-weight: 700 !important;
letter-spacing: 0.8px !important;
text-transform: uppercase !important;
font-family: 'Outfit', sans-serif !important;
}
/* Textarea — Lateef font, large size */
textarea {
background: #130825 !important;
border: 1px solid #6d28d960 !important;
border-radius: 14px !important;
color: #e2e8f0 !important;
font-size: 1.4em !important;
direction: rtl !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
caret-color: #c084fc !important;
line-height: 2.2em !important;
padding: 14px 16px !important;
}
textarea:focus {
border-color: #c084fc !important;
box-shadow: 0 0 0 3px #7c3aed15 !important;
outline: none !important;
}
textarea::placeholder {
color: #4c1d95 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1em !important;
}
/* Search button */
button.primary {
background: linear-gradient(135deg, #6d28d9, #9333ea, #c084fc) !important;
border: none !important;
border-radius: 12px !important;
color: #fff !important;
font-weight: 800 !important;
font-size: 1em !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
letter-spacing: 0.5px !important;
transition: all 0.3s ease !important;
padding: 14px !important;
width: 100% !important;
margin-top: 8px !important;
}
button.primary:hover {
box-shadow: 0 6px 24px #7c3aed50 !important;
transform: translateY(-1px) !important;
}
/* Examples — below button, clean look */
.examples-holder {
background: transparent !important;
border: none !important;
padding: 0 !important;
margin-top: 10px !important;
}
.examples-holder > .label-wrap {
display: none !important;
}
.examples table {
background: #130825 !important;
border: 1px solid #6d28d930 !important;
border-radius: 10px !important;
width: 100% !important;
}
.examples table thead {
display: none !important;
}
.examples table td {
color: #94a3b8 !important;
font-family: 'Lateef', 'Scheherazade New', serif !important;
font-size: 1.15em !important;
direction: rtl !important;
text-align: right !important;
padding: 8px 14px !important;
border-bottom: 1px solid #1e1040 !important;
}
.examples table tr:last-child td {
border-bottom: none !important;
}
.examples table tr:hover td {
color: #c084fc !important;
background: #1a0533 !important;
cursor: pointer !important;
}
/* File download */
.file-preview {
background: #130825 !important;
border: 1px solid #6d28d940 !important;
border-radius: 10px !important;
}
/* Scrollbar */
::-webkit-scrollbar { width: 5px; }
::-webkit-scrollbar-track { background: #08081a; }
::-webkit-scrollbar-thumb { background: #6d28d9; border-radius: 3px; }
"""
# Static banner markup rendered at the top of the page via gr.HTML(HEADER):
# a gradient card with a decorative radial glow layer, the Sindhi title and
# an English subtitle.
HEADER = """
<div style="
background:linear-gradient(135deg,#1a0533 0%,#0f0f2e 60%,#160a2e 100%);
border:1px solid #7c3aed25;border-radius:20px;
padding:28px 28px 22px;margin-bottom:20px;
text-align:center;position:relative;overflow:hidden;">
<div style="
position:absolute;top:0;left:0;right:0;bottom:0;
background:radial-gradient(ellipse at 50% 0%,#7c3aed12 0%,transparent 65%);
pointer-events:none;"></div>
<div style="position:relative;">
<h1 style="
color:#f1f5f9;font-size:2em;font-weight:800;
margin:0 0 4px;letter-spacing:-1px;
text-shadow:0 0 40px #7c3aed50;
font-family:'Lateef','Scheherazade New',serif;">
سنڌي اسمن جي سڃاڻپ
</h1>
<p style="
font-family:'Outfit',sans-serif;
color:#6d28d9;font-size:0.72em;
letter-spacing:3px;margin:0;">
SINDHI NAMED ENTITY RECOGNITION
</p>
</div>
</div>
"""
# Sample sentences offered below the input box.
_EXAMPLE_SENTENCES = (
    "شيخ اياز شڪارپور ۾ پيدا ٿيو",
    "يونيورسٽي آف سنڌ، حيدرآباد ۾ آھي",
    "سيد مراد علي شاھ سنڌ جو وڏو وزير آھي، سندس تعلق پاڪستان پيپلز پارٽي سان آھي",
    "پاڪستان ۽ ڀارت جي ويڙھ 2025ع ۾ لڳي",
    "ڊاڪٽر نبي بخش بلوچ 16 ڊسمبر 1917ع تي سنجھوري ۾ پيدا ٿيو",
    "بينظير ڀٽو پاڪستان جي پھرين عورت وزيراعظم هئي",
)
# One single-item row per sentence: each row holds the value for the one
# textbox input the examples are bound to.
EXAMPLES = [[text] for text in _EXAMPLE_SENTENCES]
# Page layout and event wiring.
# NOTE(review): the nesting below is reconstructed from the section comments
# because the original indentation was lost — confirm against the live app.
with gr.Blocks(css=CSS, title="سنڌي NER") as demo:
    gr.HTML(HEADER)

    with gr.Row():
        # ── Left column: input → button → examples ──
        with gr.Column(scale=3):
            inp = gr.Textbox(
                label="سنڌي جملو لکو",
                placeholder="شيخ اياز شڪارپور ۾ پيدا ٿيو...",
                lines=4,
                rtl=True,
            )
            btn = gr.Button("🔍 ڳوليو", variant="primary")
            # Examples BELOW button
            gr.Examples(examples=EXAMPLES, inputs=inp, label=None)

        # ── Right column: summary ────────────────────
        with gr.Column(scale=2):
            summary_out = gr.HTML(value=_empty_summary())

    # Small vertical spacer between the top row and the result panels.
    gr.HTML("<div style='height:6px'></div>")

    # Highlighted output
    highlighted_out = gr.HTML(value=_empty_html())
    # Confidence bars
    conf_out = gr.HTML()
    # Legend — hidden until search, no header text
    legend_out = gr.HTML(visible=False)
    # CSV download
    csv_out = gr.File(
        label="📥 ڊائونلوڊ ڪريو (CSV)",
        file_types=[".csv"],
        interactive=False,
    )

    gr.HTML("""
<div style="text-align:center;padding:16px 0 4px;
color:#3b0764;font-size:0.72em;letter-spacing:1.5px;
font-family:'Outfit',sans-serif;">
hellosindh · sindhi-bert-ner · MIT License
</div>
""")

    # Wire up both click and enter to the same handler; the output order
    # matches the tuple returned by predict_ner.
    result_outputs = [highlighted_out, summary_out, conf_out, csv_out, legend_out]
    for trigger in (btn.click, inp.submit):
        trigger(fn=predict_ner, inputs=inp, outputs=result_outputs)

demo.launch()