# Scraped Hugging Face file-viewer header (not part of the program):
#   Montazer's picture | Update app.py | d18a284
#   raw | history | blame | contribute | delete | No virus | 4.68 kB
import html
import re

import gradio as gr
from transformers import pipeline
# Right-to-left <div> that frames the rendered result; the "{}" placeholder is
# filled through str.format by highlight_text.
HTML_WRAPPER = """<div dir="rtl" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
# Model identifier handed to pipeline(model=...); change it here to serve a
# newer checkpoint.
model_checkpoint = "Montazer/arabert-finetuned-caner"
# Built once at import time and reused for every request. Its results are
# consumed by highlight_text, which reads the "entity_group", "score", "word",
# "start" and "end" keys of each returned entity.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
import re
import unicodedata
# Arabic marks that remove_diacritics deletes. Only the keys are consulted
# there; every value is None.
diacritics = dict.fromkeys(
    (
        '\u064B',  # FATHATAN
        '\u064C',  # DAMMATAN
        '\u064D',  # KASRATAN
        '\u064E',  # FATHA
        '\u064F',  # DAMMA
        '\u0650',  # KASRA
        '\u0651',  # SHADDA
        '\u0652',  # SUKUN
    )
)
def remove_diacritics(text):
    """Return *text* in NFKD form with every mark listed in ``diacritics`` removed.

    The string is first decomposed with Unicode NFKD normalization, then each
    code point that appears as a key of the module-level ``diacritics``
    mapping is deleted.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # A translate table whose values are None deletes the mapped code points.
    deletion_table = {ord(mark): None for mark in diacritics}
    return decomposed.translate(deletion_table)
def remove_punctuation(text):
    """Delete every character that is neither a word character nor whitespace.

    "Word character" is the regex ``\\w`` class (letters, digits, underscore),
    so underscores survive while punctuation and combining marks are dropped.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)
def preprocess_arabic_text(text):
    """Clean *text* before it is handed to the token classifier.

    Steps, in order: strip diacritics, strip punctuation, collapse each run of
    whitespace into a single space, and lowercase the result. Leading and
    trailing whitespace is collapsed but not trimmed.
    """
    without_marks = remove_diacritics(text)
    without_punctuation = remove_punctuation(without_marks)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.lower()
# Background colour used for each entity label when rendering highlights.
_ENTITY_COLORS = {
    "Allah": "#ffe5cc", "Book": "#b3daff", "Clan": "#faedcb", "Crime": "#ffb3d9",
    "Date": "#cce6ff", "Day": "#cce6ff", "Hell": "#d9d9d9", "Loc": "#d9b3ff",
    "Meas": "#e6ccff", "Mon": "#ffd6cc", "Month": "#ffd6cc", "NatOb": "#ffe0b3",
    "Number": "#ffe0cc", "Org": "#c1ffb3", "Para": "#f2f2f2", "Pers": "#b3ffb3",
    "Prophet": "#e6ccff", "Rlig": "#ffff80", "Sect": "#b3d9ff", "Time": "#ffb3ba",
}
# Neutral colour for labels missing from the table, so that an unexpected
# label degrades to a grey highlight instead of raising KeyError.
_DEFAULT_ENTITY_COLOR = "#eeeeee"


def highlight_text(text, entities):
    """Render *text* as HTML with every recognised entity wrapped in ``<mark>``.

    Args:
        text: The string that was classified; the ``start``/``end`` values in
            *entities* are used as slice offsets into it.
        entities: Iterable of dicts providing ``entity_group``, ``score``,
            ``word``, ``start`` and ``end``. Expected in ascending ``start``
            order; an entity that begins before the previous one ended is
            still rendered, but no plain text is emitted in front of it.

    Returns:
        ``HTML_WRAPPER`` filled with the space-joined sequence of plain words
        and ``<mark>`` elements. Plain words, entity words and labels are
        HTML-escaped.
    """
    highlighted = []
    cursor = 0
    for entity in entities:
        start, end = int(entity['start']), int(entity['end'])
        # Plain text between the previous entity and this one.
        highlighted.extend(html.escape(tok) for tok in text[cursor:start].split())

        group = entity['entity_group']
        label = html.escape(str(group))
        color = _ENTITY_COLORS.get(group, _DEFAULT_ENTITY_COLOR)
        word = html.escape(str(entity['word']))
        score = entity['score']
        highlighted.append(
            f'<mark class="{label}" style="background-color: {color}">'
            f'{word}<sub>{label}</sub><sup>{score:.2f}</sup></mark>'
        )
        # Resume exactly at the entity's end. Skipping one extra character
        # would lose a letter whenever the entity is not followed by
        # whitespace; split() already discards a separating space.
        cursor = max(cursor, end)
    highlighted.extend(html.escape(tok) for tok in text[cursor:].split())
    return HTML_WRAPPER.format(' '.join(highlighted))
# Callback wired into the Gradio interface: preprocess, classify, render.
def predict_ner(text):
    """Return highlighted HTML for *text*, or the error message on failure.

    The input is cleaned with ``preprocess_arabic_text``, run through
    ``token_classifier`` and rendered by ``highlight_text``. Any exception is
    printed and its string form is returned instead of being propagated.
    """
    try:
        cleaned = preprocess_arabic_text(text)
        return highlight_text(cleaned, token_classifier(cleaned))
    except Exception as err:
        print(err)
        return str(err)
# Label shown on the input textbox: a prompt followed by a fully vocalised
# example hadith. Keep this file saved as UTF-8; the example is Arabic text and
# turns into unreadable Thai/Latin characters if decoded with another code page.
# NOTE(review): the example was reconstructed from a mis-decoded copy in which
# some vowel marks were lost; they were filled in from the standard
# vocalisation - confirm against the original file.
label_text = (
    "Enter Hadith in Arabic:\u000A"
    "Example:\u000A"
    ' "حَدَّثَنَا عَبْد اللَّهِ، حَدَّثَنِي عُبَيْدُ اللَّهِ بْنُ عُمَرَ الْقَوَارِيرِيُّ، حَدَّثَنَا يُونُسُ بْنُ أَرْقَمَ، حَدَّثَنَا يَزِيدُ بْنُ أَبِي زِيَادٍ، عَنْ عَبْدِ الرَّحْمَنِ بْنِ أَبِي لَيْلَى، قَالَ شَهِدْتُ عَلِيًّا رَضِيَ اللَّهُ عَنْهُ فِي الرَّحَبَةِ يَنْشُدُ النَّاسَ أَنْشُدُ اللَّهَ مَنْ سَمِعَ رَسُولَ اللَّهِ صَلَّى اللَّهُ عَلَيْهِ وَسَلَّمَ يَقُولُ يَوْمَ غَدِيرِ خُمٍّ مَنْ كُنْتُ مَوْلَاهُ فَعَلِيٌّ مَوْلَاهُ لَمَّا قَامَ فَشَهِدَ قَالَ عَبْدُ الرَّحْمَنِ فَقَامَ اثْنَا عَشَرَ بَدْرِيًّا كَأَنِّي أَنْظُرُ إِلَى أَحَدِهِمْ فَقَالُوا نَشْهَدُ أَنَّا سَمِعْنَا رَسُولَ اللَّهِ صَلَّى اللَّهُ عَلَيْهِ وَسَلَّمَ يَقُولُ يَوْمَ غَدِيرِ خُمٍّ أَلَسْتُ أَوْلَى بِالْمُؤْمِنِينَ مِنْ أَنْفُسِهِمْ وَأَزْوَاجِي أُمَّهَاتُهُمْ فَقُلْنَا بَلَى يَا رَسُولَ اللَّهِ قَالَ فَمَنْ كُنْتُ مَوْلَاهُ فَعَلِيٌّ مَوْلَاهُ اللَّهُمَّ وَالِ مَنْ وَالَاهُ وَعَادِ مَنْ عَادَاهُ"'
)
# Build the web UI: one textbox in, rendered HTML out, served by predict_ner.
# Components are created through the top-level gr.Textbox / gr.HTML classes;
# the gr.inputs / gr.outputs namespaces are deprecated in Gradio 3.x and
# removed in Gradio 4.x.
iface = gr.Interface(
    fn=predict_ner,
    inputs=gr.Textbox(label=label_text),
    outputs=gr.HTML(label="Predicted Labels"),
    title="Hadith Analysis",
)
# Launch the interface
iface.launch()