Spaces:

mohamedabdullah
/

Arabic-Spelling-Checker

Sleeping

App Files Files Community

Arabic-Spelling-Checker / app.py

mohamedabdullah

Update app.py

53a1055 over 2 years ago

raw

history blame

11.3 kB

	import gradio as gr
	from datasets import load_dataset
	import re

	# Build the vocabulary of known words that the checker validates against.
	# Only the first row of the "train" split is read -- presumably the whole
	# vocabulary file is stored as a single text row; TODO confirm against the dataset.
	dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")
	# A vocabulary token is a run of 2-25 characters that are word characters but
	# not ASCII letters or digits (the negated class excludes a-z, A-Z, 0-9,
	# whitespace and every non-word character).  The pattern is a raw string so
	# that \s and \W reach the regex engine instead of being parsed as (invalid)
	# string escape sequences.
	word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])
	# A set gives constant-time membership tests for the many candidate lookups.
	vocab = set(word_l)

	def delete_letter(word):
	    """Return every string obtained by removing exactly one character from *word*.

	    The results are ordered by the index of the removed character and may
	    contain duplicates (e.g. for doubled letters).  An empty *word* yields an
	    empty list.
	    """
	    shortened = []
	    for position, _ in enumerate(word):
	        shortened.append(word[:position] + word[position + 1:])
	    return shortened

	def switch_letter(word):
	    """Return every string obtained by swapping two adjacent characters of *word*.

	    The result has ``len(word) - 1`` entries, ordered by the index of the
	    left-hand character of the swapped pair; words shorter than two
	    characters yield an empty list.

	    The swap is done on the characters of *word* itself.  The previous
	    implementation swapped inside ``re.findall('\\w', word)``, which silently
	    dropped every non-word character and therefore returned corrupted
	    candidates for such input.
	    """
	    return [
	        word[:i] + word[i + 1] + word[i] + word[i + 2:]
	        for i in range(len(word) - 1)
	    ]

	def replace_letter(word):
	    """Return the sorted, duplicate-free list of single-character substitutions of *word*.

	    Each position of *word* is replaced in turn by every letter of the
	    built-in alphabet; the unchanged word itself is never part of the result.
	    """
	    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

	    variants = {
	        word[:position] + letter + word[position + 1:]
	        for position in range(len(word))
	        for letter in letters
	    }
	    # Substituting a character with itself reproduces the input; drop it.
	    variants.discard(word)

	    ordered = list(variants)
	    ordered.sort()
	    return ordered

	def insert_letter(word):
	    """Return every string obtained by inserting one alphabet letter into *word*.

	    Insertion points run from before the first character to after the last
	    one; for each point the letters are tried in alphabet order.  Duplicates
	    are kept.
	    """
	    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
	    return [
	        word[:position] + letter + word[position:]
	        for position in range(len(word) + 1)
	        for letter in letters
	    ]

	def edit_one_letter(word, allow_switches = True):
	    """Return the set of strings that are one edit away from *word*.

	    An edit is a single-character deletion, insertion or replacement and,
	    when *allow_switches* is true, a swap of two adjacent characters.
	    """
	    edits = [*delete_letter(word), *insert_letter(word), *replace_letter(word)]
	    if allow_switches:
	        edits.extend(switch_letter(word))
	    # Collapse the duplicates produced by the individual edit generators.
	    return set(edits)

	def edit_two_letters(word, allow_switches = True):
	    """Return the set of strings that are at most two edits away from *word*.

	    The result contains every one-edit candidate plus every candidate
	    reachable by applying one more edit to any of them.

	    *allow_switches* is forwarded to both rounds of ``edit_one_letter``;
	    previously the flag was accepted but ignored, so adjacent-character swaps
	    were always generated.  With the default value the result is unchanged.
	    """
	    one_edit = edit_one_letter(word, allow_switches)

	    # Start from the one-edit candidates and grow the set in place rather than
	    # accumulating a huge intermediate list that is full of duplicates.
	    two_edits = set(one_edit)
	    for candidate in one_edit:
	        two_edits |= edit_one_letter(candidate, allow_switches)

	    return two_edits

	def get_corrections(word, vocab):
	    """Return the set of vocabulary words that best match *word*.

	    Candidates are taken from the first non-empty tier:

	    1. *word* itself, when it is already in *vocab*;
	    2. vocabulary words one edit away from *word*;
	    3. vocabulary words at most two edits away from *word*.

	    An empty set is returned when no tier produces a match.  Each tier is
	    only computed when the previous one came up empty, because the two-edit
	    candidate set is very large and was previously generated on every call
	    even when it could not influence the result.
	    """
	    if word in vocab:
	        return {word}

	    one_edit_matches = {item for item in edit_one_letter(word) if item in vocab}
	    if one_edit_matches:
	        return one_edit_matches

	    return {item for item in edit_two_letters(word) if item in vocab}

	def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
	    """Return the minimum cost of editing *source* into *target*.

	    Args:
	        source: string to transform.
	        target: string to obtain.
	        ins_cost: cost of inserting one character.
	        del_cost: cost of deleting one character.
	        rep_cost: cost of replacing one character by a different one
	            (replacing a character by itself is free).

	    Returns:
	        The total cost of the cheapest edit sequence.

	    The dynamic-programming table is filled row by row with plain lists and
	    only the previous row is kept.  The earlier version relied on ``np``
	    although numpy is never imported in this file, so every call raised
	    ``NameError``.
	    """
	    # previous[col]: cost of building target[:col] from the source prefix
	    # handled so far; initially that prefix is empty (insertions only).
	    previous = [col * ins_cost for col in range(len(target) + 1)]

	    for row, source_char in enumerate(source, start=1):
	        # Column 0: turning source[:row] into "" takes `row` deletions.
	        current = [row * del_cost]
	        for col, target_char in enumerate(target, start=1):
	            r_cost = 0 if source_char == target_char else rep_cost
	            current.append(min(
	                previous[col] + del_cost,       # drop source_char
	                current[col - 1] + ins_cost,    # add target_char
	                previous[col - 1] + r_cost,     # match or replace
	            ))
	        previous = current

	    return previous[-1]

	def get_suggestions(corrections, word):
	    """Return *corrections* as a list ordered by edit distance from *word*.

	    The closest correction comes first.  The sort is stable, so corrections
	    with equal distance keep the order in which *corrections* yields them.

	    The ordering uses the built-in ``sorted`` instead of ``np.argsort``:
	    numpy is never imported in this file, so the previous version raised
	    ``NameError``.
	    """
	    return sorted(
	        corrections,
	        key=lambda correction: min_edit_distance(word, correction),
	    )

	def ar_spelling_checker(text):
	    """Spell-check *text* and return the report as an HTML fragment.

	    Every run of three or more word characters is looked up in the
	    module-level ``vocab``.  Unknown words that have at least one correction
	    are listed, each followed by its suggestions (closest first) and a
	    separator; unknown words without any correction are skipped.  When
	    nothing is reported, a "no spelling mistakes" message is shown instead.

	    Returns:
	        A string holding an inline ``<style>`` sheet followed by one
	        ``<div class="content">`` element.
	    """
	    result = {}
	    checked = set()

	    # Raw string: \w must reach the regex engine, not the string parser.
	    for word in re.findall(r'\w{3,}', text):
	        # The candidate search is expensive, so run it once per distinct word.
	        if word in checked or word in vocab:
	            continue
	        checked.add(word)

	        tmp_corrections = get_corrections(word, vocab)
	        if tmp_corrections:
	            result[word] = get_suggestions(tmp_corrections, word)

	    style = '''<style>
	.content{
	direction: rtl;
	}
	.word{
	color: #842029;
	background-color: #f8d7da;
	border-color: #f5c2c7;
	padding: 10px 20px;
	display: inline-block;
	direction: rtl;
	font-size: 15px;
	font-weight: 500;
	margin-bottom: 15px;
	box-sizing: border-box;
	border: 1px solid transparent;
	border-radius: 0.25rem;
	}

	.suggest{
	color: #0f5132;
	background-color: #d1e7dd;
	border-color: #badbcc;
	display: inline-block;
	margin-right: 5px;
	}

	.separator{
	height:3px;
	background: #CCC;
	margin-bottom: 15px;
	}

	.msg{
	color: #0f5132;
	background-color: #d1e7dd;
	border-color: #badbcc;
	border: 1px solid transparent;
	border-radius: 0.25rem;
	padding: 15px 20px;
	direction: rtl;
	font-size: 20px;
	font-weight: 500;
	text-align: center;
	}
	</style>'''

	    # Collect the fragments and join once instead of repeated concatenation.
	    parts = [style, '<div class="content">']

	    if not result:
	        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

	    for word, suggestions in result.items():
	        parts.append(f'<div class="word">{word}</div><br />')
	        parts.extend(
	            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
	        )
	        parts.append('<div class="separator"></div>')

	    parts.append('</div>')

	    return ''.join(parts)

	# Build the Gradio user interface.  The css argument is a custom stylesheet
	# for the page: right-to-left input box, button/text sizing, and the
	# .title/.desc classes used by the two HTML headers below.
	with gr.Blocks(css="""#input{direction: rtl;}
	#component-112{height: 30px;}
	.gr-form{margin-top: 15px;}
	.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
	.text-gray-500{font-size: 16px; margin-bottom: 13px;}
	.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
	border: 1px solid transparent; border-radius: 0.25rem;
	padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
	.output-html{min-height: 2rem;}
	.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
	line-height: 1.5;font-family: 'IBM Plex Mono';}
	.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}""") as demo:

	# Static page header: title and one-line description.
	intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
	description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
	# Input textbox; elem_id="input" is the hook for the #input rule in the css above.
	text = gr.Textbox(label="النص", elem_id="input")
	btn = gr.Button("Spelling Check")
	# Placeholder that receives the HTML report returned by ar_spelling_checker.
	output = gr.HTML()

	# On click, call ar_spelling_checker with the textbox value and show its
	# return value in the HTML component.
	btn.click(ar_spelling_checker, [text], output)

	# Start the app.  NOTE(review): the original indentation was lost, so it is
	# unclear whether this call sits inside or after the `with` block -- confirm.
	demo.launch(inline=False)