|
import gradio as gr |
|
from datasets import load_dataset |
|
import re |
|
|
|
# Fetch the Arabic vocabulary file through `datasets.load_dataset`.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")

# Split the first record of the "train" split into candidate words: runs of
# 2-25 word characters that are not ASCII letters or digits.  The pattern is a
# raw string so that `\s` and `\W` reach the regex engine untouched instead of
# being parsed as (invalid) Python string escapes.
# NOTE(review): only record 0 of the "text" column is scanned -- confirm the
# vocabulary file really is exposed as a single record.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])

# De-duplicated vocabulary; a set keeps the many membership tests cheap.
vocab = set(word_l)
|
|
|
def delete_letter(word):
    """Return all strings formed by removing exactly one character from *word*.

    The list has one entry per character position, in position order, so it
    may contain duplicates (e.g. for doubled letters).  An empty *word*
    yields an empty list.
    """
    shortened = []
    for position, _ in enumerate(word):
        shortened.append(word[:position] + word[position + 1:])
    return shortened
|
|
|
def switch_letter(word):
    """Return all strings formed by swapping two adjacent characters of *word*.

    The list has ``len(word) - 1`` entries, ordered by the position of the
    swapped pair; a word shorter than two characters yields an empty list.
    Swapping two identical characters reproduces *word* itself, and that
    entry is kept.

    The characters are sliced straight from the string instead of being
    re-extracted with a word-character regex, so characters that are not
    "word" characters (combining marks, punctuation) stay in place rather
    than being silently dropped from every candidate.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
|
|
|
def replace_letter(word):
    """Return all strings formed by replacing one character of *word*.

    Each position is substituted with every letter of the Arabic alphabet
    defined below.  The unchanged word is never included, duplicates are
    removed, and the result is returned as a sorted list.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    variants = {
        word[:position] + letter + word[position + 1:]
        for position in range(len(word))
        for letter in alphabet
    }
    # Substituting a character with itself reproduces the input; drop it.
    variants.discard(word)

    return sorted(variants)
|
|
|
def insert_letter(word):
    """Return all strings formed by inserting one letter into *word*.

    Every letter of the Arabic alphabet defined below is inserted at every
    gap, including before the first and after the last character.  The list
    is ordered by insertion position, then by alphabet order, and duplicates
    are kept.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    return [
        word[:gap] + letter + word[gap:]
        for gap in range(len(word) + 1)
        for letter in alphabet
    ]
|
|
|
def edit_one_letter(word, allow_switches = True):
    """Return the set of strings exactly one edit operation away from *word*.

    The edits are single-character deletion, insertion and replacement, plus
    adjacent-character swaps when *allow_switches* is true.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))

    if allow_switches:
        candidates.update(switch_letter(word))

    return candidates
|
|
|
def edit_two_letters(word, allow_switches=True):
    """Return the set of strings at most two edit operations away from *word*.

    The result is the union of the one-edit candidates of *word* and the
    one-edit candidates of each of those.

    *allow_switches* is forwarded to both rounds of ``edit_one_letter`` so
    that passing ``False`` really excludes adjacent-character swaps.
    """
    edit_one_set = edit_one_letter(word, allow_switches)

    # Start from a copy of the one-edit candidates and merge the second round
    # directly into the set, avoiding a large intermediate list of duplicates.
    edit_two_set = set(edit_one_set)
    for edit in edit_one_set:
        edit_two_set |= edit_one_letter(edit, allow_switches)

    return edit_two_set
|
|
|
def get_corrections(word, vocab):
    """Return the set of vocabulary words that are the closest match for *word*.

    Candidates are tried in order of increasing edit count and the first
    non-empty group wins:

    1. *word* itself, when it is in *vocab*;
    2. vocabulary words one edit away;
    3. vocabulary words up to two edits away.

    An empty set is returned when nothing matches.  Each group is computed
    only when the previous one came up empty, so the costly two-edit
    expansion is skipped whenever a cheaper match exists.
    """
    if word in vocab:
        return {word}

    one_edit_matches = {
        candidate for candidate in edit_one_letter(word) if candidate in vocab
    }
    if one_edit_matches:
        return one_edit_matches

    return {
        candidate for candidate in edit_two_letters(word) if candidate in vocab
    }
|
|
|
def min_edit_distance(source, target, ins_cost=1, del_cost=1, rep_cost=2):
    """Return the minimum cost of transforming *source* into *target*.

    The cost is computed with the classic dynamic-programming recurrence
    over insertions, deletions and replacements.

    Args:
        source: The string to transform.
        target: The string to reach.
        ins_cost: Cost of inserting one character.
        del_cost: Cost of deleting one character.
        rep_cost: Cost of replacing one character with a different one
            (matching characters cost nothing).

    Returns:
        The minimum total edit cost.

    The table is kept as plain Python lists, one row at a time, so the
    function does not rely on numpy, which this module never imports.
    """
    # Row 0: building target[:col] from an empty source takes col insertions.
    previous = [col * ins_cost for col in range(len(target) + 1)]

    for row, source_char in enumerate(source, start=1):
        # Column 0: reducing source[:row] to nothing takes row deletions.
        current = [row * del_cost]

        for col, target_char in enumerate(target, start=1):
            r_cost = 0 if source_char == target_char else rep_cost
            current.append(min(
                previous[col] + del_cost,       # delete source_char
                current[col - 1] + ins_cost,    # insert target_char
                previous[col - 1] + r_cost,     # replace / keep
            ))

        previous = current

    return previous[-1]
|
|
|
def get_suggestions(corrections, word):
    """Return *corrections* as a list ordered by edit distance from *word*.

    The closest candidates come first.  The sort is stable, so candidates
    with equal distance keep the order in which *corrections* yields them.
    An empty *corrections* gives an empty list.

    The built-in ``sorted`` is used, so the function does not rely on numpy,
    which this module never imports.
    """
    return sorted(
        corrections,
        key=lambda correction: min_edit_distance(word, correction),
    )
|
|
|
# Style sheet emitted at the top of every spelling report.
_RESULT_STYLE = '''<style>
    .content{
        direction: rtl;
    }
    .word{
        color: #842029;
        background-color: #f8d7da;
        border-color: #f5c2c7;
        padding: 10px 20px;
        display: inline-block;
        direction: rtl;
        font-size: 15px;
        font-weight: 500;
        margin-bottom: 15px;
        box-sizing: border-box;
        border: 1px solid transparent;
        border-radius: 0.25rem;
    }

    .suggest{
        color: #0f5132;
        background-color: #d1e7dd;
        border-color: #badbcc;
        display: inline-block;
        margin-right: 5px;
    }

    .separator{
        height:3px;
        background: #CCC;
        margin-bottom: 15px;
    }

    .msg{
        color: #0f5132;
        background-color: #d1e7dd;
        border-color: #badbcc;
        border: 1px solid transparent;
        border-radius: 0.25rem;
        padding: 15px 20px;
        direction: rtl;
        font-size: 20px;
        font-weight: 500;
        text-align: center;
    }
</style>'''


def ar_spelling_checker(text):
    """Spell-check *text* and return the report as an HTML fragment.

    Words are runs of three or more word characters.  Each word missing from
    the module-level ``vocab`` is looked up with ``get_corrections``; words
    with no candidate correction are left out of the report.  The remaining
    words are listed in order of first appearance, each followed by its
    suggestions as ordered by ``get_suggestions``.

    Args:
        text: The text to check.  ``None`` is treated as empty text.

    Returns:
        An HTML string: a ``<style>`` block followed by a ``content`` div
        holding either one word/suggestions group per reported word or, when
        nothing is reported, a single "no spelling mistakes" message.
    """
    result = {}
    checked = set()

    for word in re.findall(r'\w{3,}', text or ''):
        # The correction search is expensive; run it once per distinct word.
        if word in checked:
            continue
        checked.add(word)

        if word in vocab:
            continue

        tmp_corrections = get_corrections(word, vocab)
        if tmp_corrections:
            result[word] = get_suggestions(tmp_corrections, word)

    # The interpolated words consist solely of regex word characters, so they
    # cannot contain HTML markup characters.
    parts = [_RESULT_STYLE, '<div class="content">']

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>'
            for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')

    parts.append('</div>')

    return ''.join(parts)
|
|
|
# Page-level style sheet passed to the Gradio app.
_APP_CSS = """#input{direction: rtl;}
#component-112{height: 30px;}
.gr-form{margin-top: 15px;}
.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
.text-gray-500{font-size: 16px; margin-bottom: 13px;}
.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
           border: 1px solid transparent; border-radius: 0.25rem;
           padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
.output-html{min-height: 2rem;}
.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
       line-height: 1.5;font-family: 'IBM Plex Mono';}
.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}"""

# Build the UI: title, description, a text input, a button and an HTML
# output area that receives the report produced by `ar_spelling_checker`.
with gr.Blocks(css=_APP_CSS) as demo:
    intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    output = gr.HTML()

    # Clicking the button runs the checker on the textbox content and
    # renders the returned HTML.
    btn.click(fn=ar_spelling_checker, inputs=[text], outputs=output)

# Start the app as soon as the module is executed.
demo.launch(inline=False)