Spaces:

mohamedabdullah
/

Arabic-Spelling-Checker

Runtime error

File size: 11,148 Bytes

import gradio as gr
from datasets import load_dataset
import re
import numpy as np

# Fetch the vocabulary text file from the Hugging Face Hub dataset repository.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words",  data_files="ar_vocab.txt")
# Extract runs of 2-25 word characters that are neither ASCII letters/digits
# nor whitespace. A raw string is used so that `\s` and `\W` reach the regex
# engine without relying on Python passing unknown escapes through (which
# emits a SyntaxWarning on recent interpreters).
# NOTE(review): only record 0 of the 'text' column is scanned -- confirm the
# vocabulary file is loaded as a single record.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])
# Set form gives constant-time membership tests for the spell checker.
vocab = set(word_l)

def delete_letter(word):
    """Return all strings formed by removing exactly one character from ``word``.

    The result is ordered by the index of the removed character and may
    contain duplicates (e.g. for repeated letters). An empty ``word`` yields
    an empty list.
    """
    shortened = []
    for position, _ in enumerate(word):
        shortened.append(word[:position] + word[position + 1:])
    return shortened
    
def switch_letter(word):
    """Return all strings formed by swapping two adjacent characters of ``word``.

    The i-th result swaps the characters at indices ``i`` and ``i + 1``, so a
    word of length ``n`` yields ``n - 1`` results (none for words shorter than
    two characters). Swapping two identical neighbours reproduces ``word``
    itself; such entries are kept.

    Every character of ``word`` is preserved. Slicing the string directly
    (instead of re-extracting characters with a ``\\w`` regex) means
    characters that are not "word" characters are no longer silently dropped.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
    
def replace_letter(word):
    """Return the sorted, de-duplicated one-letter substitutions of ``word``.

    Each position of ``word`` is replaced in turn by every letter of the
    built-in Arabic alphabet string. ``word`` itself is never part of the
    result. An empty ``word`` yields an empty list.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    variants = {
        word[:pos] + letter + word[pos + 1:]
        for pos in range(len(word))
        for letter in alphabet
    }
    # Substituting a letter by itself reproduces the input; drop it.
    variants.discard(word)

    return sorted(variants)
    
def insert_letter(word):
    """Return all strings formed by inserting one Arabic letter into ``word``.

    Every insertion point is used (before the first character through after
    the last), and at each point every letter of the built-in alphabet string
    is inserted. Results are ordered by insertion point, then by alphabet
    order; duplicates are kept.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    return [
        word[:pos] + letter + word[pos:]
        for pos in range(len(word) + 1)
        for letter in alphabet
    ]
    
def edit_one_letter(word, allow_switches = True):
    """Return the set of strings that are one edit away from ``word``.

    An edit is a single-character deletion, insertion or replacement; adjacent
    character swaps are included as well when ``allow_switches`` is true.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))

    if allow_switches:
        candidates.update(switch_letter(word))

    return candidates
    
def edit_two_letters(word, allow_switches = True):
    """Return the set of strings that are one or two edits away from ``word``.

    The one-edit candidates of ``word`` are expanded by one further edit each,
    and the result also contains the one-edit candidates themselves.

    ``allow_switches`` is forwarded to both rounds of ``edit_one_letter`` so
    that adjacent-character swaps can actually be disabled (the flag was
    previously accepted but ignored).
    """
    one_edit = edit_one_letter(word, allow_switches)

    # Start from the one-edit set so it is part of the returned union.
    two_edits = set(one_edit)
    for candidate in one_edit:
        two_edits.update(edit_one_letter(candidate, allow_switches))

    return two_edits
    
def get_corrections(word, vocab):
    """Return a set of correction candidates for ``word`` drawn from ``vocab``.

    Candidates are looked up in order of preference, stopping at the first
    non-empty tier:

    1. ``word`` itself, when it is already in ``vocab``;
    2. vocabulary words one edit away;
    3. vocabulary words up to two edits away;
    4. a single Arabic "no suitable suggestions" message.

    Each tier is only generated when the previous one produced nothing, so
    the very large two-edit candidate set is not built unless it is needed.
    """
    if word in vocab:
        return {word}

    for generate in (edit_one_letter, edit_two_letters):
        matches = {candidate for candidate in generate(word) if candidate in vocab}
        if matches:
            return matches

    # Fallback shown to the user when no vocabulary word is close enough.
    return {'لم يتم العثور علي إقتراحات مناسبة لهذه الكلمة'}
    
def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
    """Return the minimum cost of editing ``source`` into ``target``.

    A dynamic-programming table is filled in which cell ``(r, c)`` holds the
    cheapest way to turn the first ``r`` characters of ``source`` into the
    first ``c`` characters of ``target`` using insertions (``ins_cost``),
    deletions (``del_cost``) and replacements (``rep_cost``; matching
    characters cost nothing). The bottom-right cell is returned.
    """
    n_rows = len(source) + 1
    n_cols = len(target) + 1
    table = np.zeros((n_rows, n_cols), dtype=int)

    # First column: delete every character of the source prefix.
    for r in range(1, n_rows):
        table[r, 0] = table[r - 1, 0] + del_cost

    # First row: insert every character of the target prefix.
    for c in range(1, n_cols):
        table[0, c] = table[0, c - 1] + ins_cost

    for r, src_char in enumerate(source, start=1):
        for c, tgt_char in enumerate(target, start=1):
            substitution = 0 if src_char == tgt_char else rep_cost
            table[r, c] = min(
                table[r - 1, c] + del_cost,
                table[r, c - 1] + ins_cost,
                table[r - 1, c - 1] + substitution,
            )

    return table[n_rows - 1, n_cols - 1]
    
def get_suggestions(corrections, word):
    """Return ``corrections`` as a list ordered by edit distance from ``word``.

    The distance of each candidate is computed with ``min_edit_distance``
    (default costs) and the candidates are arranged in the order given by
    ``np.argsort`` over those distances, closest first.
    """
    candidates = list(corrections)
    costs = [min_edit_distance(word, candidate) for candidate in candidates]

    return [candidates[rank] for rank in np.argsort(costs)]
  
def ar_spelling_checker(text):
    """Spell-check ``text`` and return the findings as an HTML fragment.

    Every run of three or more word characters in ``text`` is looked up in
    the module-level ``vocab``. For each unknown word, suggestions are
    collected with ``get_corrections`` and ordered with ``get_suggestions``.

    The returned string contains an inline ``<style>`` block followed by a
    right-to-left ``<div class="content">`` holding either a "no spelling
    mistakes" message, or one group per unknown word (the word, its
    suggestions, and a separator), in order of first appearance.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    words = re.findall(r'\w{3,}', text)
    result = {}

    for word in words:
        # Skip correct words, and words already analysed earlier in the text:
        # the correction search is expensive and would give the same answer.
        if word in vocab or word in result:
            continue
        corrections = get_corrections(word, vocab)
        if not corrections:
            continue
        result[word] = get_suggestions(corrections, word)

    # Collect the fragments and join once instead of repeated string +=.
    parts = ['''<style>
    .content{
      direction: rtl;
    }
    .word{
            color: #842029;
            background-color: #f8d7da;
            border-color: #f5c2c7;
            padding: 10px 20px;
            display: inline-block;
            direction: rtl;
            font-size: 15px;
            font-weight: 500;
            margin-bottom: 15px;
            box-sizing: border-box;
            border: 1px solid transparent;
            border-radius: 0.25rem;
    }

    .suggest{
      color: #0f5132;
      background-color: #d1e7dd;
      border-color: #badbcc;
      display: inline-block;
      margin-right: 5px;
    }

    .separator{
      height:3px;
      background: #CCC;
      margin-bottom: 15px;
    }

    .msg{
      color: #0f5132;
      background-color: #d1e7dd;
      border-color: #badbcc;
      border: 1px solid transparent;
      border-radius: 0.25rem;
      padding: 15px 20px;
      direction: rtl;
      font-size: 20px;
      font-weight: 500;
      text-align: center;
    }
    </style>''']

    parts.append('<div class="content">')

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    # Words come from a \w+ match, so they cannot contain HTML metacharacters.
    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')

    parts.append('</div>')

    return ''.join(parts)
  
# Build the Gradio interface: a title, a short description, a right-to-left
# text input, a button, and an HTML area that displays the checker's output.
# The `css` argument carries custom style rules for the page.
with gr.Blocks(css="""
 #input{direction: rtl;}
 #component-112{height: 30px;}
 .gr-form{margin-top: 15px;}
 .gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;} 
 .text-gray-500{font-size: 16px; margin-bottom: 13px;}
 .gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
  border: 1px solid transparent; border-radius: 0.25rem; 
  padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
  .output-html{min-height: 2rem;}          
  .title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
  line-height: 1.5;font-family: 'IBM Plex Mono';}
  .desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}""") as demo:

    # Static header elements.
    intro =  gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    # elem_id="input" is the hook for the `#input{direction: rtl;}` rule above.
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    # Receives the HTML string returned by ar_spelling_checker.
    output = gr.HTML()

    # On click, run the checker on the textbox content and render the result.
    btn.click(ar_spelling_checker, [text], output)

# Start the app when this module is executed.
demo.launch(inline=False)