File size: 11,148 Bytes
dcde80d
 
 
ea816ea
dcde80d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1150de
dcde80d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f41473e
dcde80d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28ba0c3
84b7b69
 
 
 
 
 
 
 
 
 
 
 
dcde80d
 
 
 
 
 
 
 
 
53a1055
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import gradio as gr
from datasets import load_dataset
import re
import numpy as np

# Load the vocabulary text file of the "Arabic-unique-words" dataset.
# NOTE(review): presumably fetched from the Hugging Face Hub on first run -- confirm caching/network needs.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")

# Extract runs of 2-25 word characters that are not ASCII letters or digits
# from the first entry of the train split's "text" column.
# The pattern is written as a raw string: "\s" and "\W" are invalid escape
# sequences in a plain literal (SyntaxWarning on Python 3.12+). The regex
# itself is unchanged.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])

# A set removes duplicates and gives constant-time membership tests.
vocab = set(word_l)

def delete_letter(word):
    """Return every string obtained by removing exactly one character from ``word``.

    The result is a list ordered by the position of the removed character.
    It has ``len(word)`` entries (duplicates are kept) and is empty for "".
    """
    variants = []
    for pos, _ in enumerate(word):
        variants.append(word[:pos] + word[pos + 1:])
    return variants
    
def switch_letter(word):
    """Return all strings formed by swapping two adjacent characters of ``word``.

    The result is a list with ``len(word) - 1`` entries, ordered by the
    position of the left character of the swapped pair. Words shorter than
    two characters yield an empty list.

    The swap operates directly on the characters of ``word``. No character is
    filtered out, so the output always has the same length and the same
    characters as the input, whatever those characters are.
    """
    swapped = []
    for i in range(len(word) - 1):
        # Exchange positions i and i + 1, keep the rest of the word as is.
        swapped.append(word[:i] + word[i + 1] + word[i] + word[i + 2:])
    return swapped
    
def replace_letter(word):
    """Return the sorted list of words made by replacing one character of ``word``.

    Each position of ``word`` is substituted with every letter of the Arabic
    alphabet string below. Results are de-duplicated, and the original word
    itself is never part of the output.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    variants = {
        word[:pos] + letter + word[pos + 1:]
        for pos in range(len(word))
        for letter in alphabet
    }
    # Substituting a character with itself reproduces the input; drop it.
    variants.discard(word)

    return sorted(variants)
    
def insert_letter(word):
    """Return all words made by inserting one alphabet letter into ``word``.

    Every gap of ``word`` (including before the first and after the last
    character) receives every letter of the Arabic alphabet string below.
    The list is ordered by insertion position, then by alphabet order, and
    duplicates are kept.
    """
    alphabet = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'
    return [
        word[:gap] + letter + word[gap:]
        for gap in range(len(word) + 1)
        for letter in alphabet
    ]
    
def edit_one_letter(word, allow_switches = True):
    """Return the set of strings that are one edit away from ``word``.

    Edits are single-character deletions, insertions and replacements.
    Adjacent-character swaps are included as well unless ``allow_switches``
    is false.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))

    if allow_switches:
        candidates.update(switch_letter(word))

    return candidates
    
def edit_two_letters(word, allow_switches = True):
    """Return the set of strings reachable from ``word`` in one or two edits.

    Args:
        word: The string to edit.
        allow_switches: Whether adjacent-character swaps count as an edit.
            The flag is forwarded to both rounds of ``edit_one_letter`` so
            that passing ``False`` really excludes swaps.

    Returns:
        A set containing every one-edit variant of ``word`` together with
        every one-edit variant of those variants.
    """
    first_pass = edit_one_letter(word, allow_switches)

    # Accumulate straight into a set: the second round produces a very large
    # number of duplicates that there is no reason to keep in a list first.
    reachable = set(first_pass)
    for edit in first_pass:
        reachable |= edit_one_letter(edit, allow_switches)

    return reachable
    
def get_corrections(word, vocab):
    """Return candidate corrections for ``word`` taken from ``vocab``.

    Candidates are chosen by priority, stopping at the first non-empty tier:

    1. ``word`` itself, if it is in ``vocab``;
    2. vocabulary words one edit away;
    3. vocabulary words up to two edits away;
    4. a single Arabic "no suitable suggestions found" message.

    Tiers are evaluated lazily, so the expensive two-edit expansion only runs
    when the cheaper tiers found nothing.

    Args:
        word: The word to correct.
        vocab: A container of known words supporting ``in``.

    Returns:
        A non-empty set of suggestion strings.
    """
    if word in vocab:
        return {word}

    for generate_edits in (edit_one_letter, edit_two_letters):
        matches = {candidate for candidate in generate_edits(word) if candidate in vocab}
        if matches:
            return matches

    return {'لم يتم العثور علي إقتراحات مناسبة لهذه الكلمة'}
    
def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
    """Compute the minimum edit cost of turning ``source`` into ``target``.

    A dynamic-programming table is filled where cell ``[i, j]`` holds the
    cheapest cost of converting the first ``i`` characters of ``source`` into
    the first ``j`` characters of ``target``.

    Args:
        source: The starting string.
        target: The string to reach.
        ins_cost: Cost of inserting one character.
        del_cost: Cost of deleting one character.
        rep_cost: Cost of replacing one character with a different one.

    Returns:
        The value of the bottom-right table cell (a NumPy integer scalar).
    """
    n_rows, n_cols = len(source), len(target)
    table = np.zeros((n_rows + 1, n_cols + 1), dtype=int)

    # First column: reach the empty target by deleting source characters.
    for i in range(1, n_rows + 1):
        table[i, 0] = table[i - 1, 0] + del_cost

    # First row: build the target from an empty source by inserting.
    for j in range(1, n_cols + 1):
        table[0, j] = table[0, j - 1] + ins_cost

    for i, src_char in enumerate(source, start=1):
        for j, tgt_char in enumerate(target, start=1):
            # Matching characters can be carried over for free.
            substitution = 0 if src_char == tgt_char else rep_cost
            options = [
                table[i - 1, j] + del_cost,
                table[i, j - 1] + ins_cost,
                table[i - 1, j - 1] + substitution,
            ]
            table[i, j] = np.min(options)

    return table[n_rows, n_cols]
    
def get_suggestions(corrections, word):
    """Return ``corrections`` as a list ordered by edit distance from ``word``.

    The distance of each correction is computed with ``min_edit_distance``
    (``word`` as source, the correction as target). Candidates are arranged
    in the order given by ``np.argsort`` over those distances, cheapest first.
    """
    candidates = list(corrections)
    costs = [min_edit_distance(word, candidate) for candidate in candidates]
    return [candidates[rank] for rank in np.argsort(costs)]
  
def ar_spelling_checker(text):
    """Spell-check ``text`` and return the findings as an HTML fragment.

    Words are runs of three or more word characters. Every word that is not
    in the module-level ``vocab`` is listed together with its suggestions,
    ordered by ``get_suggestions``. When no word is flagged, a single
    "no spelling mistakes" message is rendered instead.

    Args:
        text: The text to check. ``None`` is treated like an empty string.

    Returns:
        A string containing a ``<style>`` block followed by one
        ``<div class="content">`` element holding the results.
    """
    # Maps each flagged word to its ordered suggestions. Dict insertion order
    # keeps the words in order of first appearance in the text.
    result = {}

    for word in re.findall(r'\w{3,}', text or ''):
        # Skip correct words, and words already handled: generating the edit
        # candidates is expensive and a repeat would only overwrite its entry.
        if word in result or word in vocab:
            continue

        corrections = get_corrections(word, vocab)
        if not corrections:
            continue
        result[word] = get_suggestions(corrections, word)

    style = '''<style>
    .content{
      direction: rtl;
    }
    .word{
            color: #842029;
            background-color: #f8d7da;
            border-color: #f5c2c7;
            padding: 10px 20px;
            display: inline-block;
            direction: rtl;
            font-size: 15px;
            font-weight: 500;
            margin-bottom: 15px;
            box-sizing: border-box;
            border: 1px solid transparent;
            border-radius: 0.25rem;
    }

    .suggest{
      color: #0f5132;
      background-color: #d1e7dd;
      border-color: #badbcc;
      display: inline-block;
      margin-right: 5px;
    }

    .separator{
      height:3px;
      background: #CCC;
      margin-bottom: 15px;
    }

    .msg{
      color: #0f5132;
      background-color: #d1e7dd;
      border-color: #badbcc;
      border: 1px solid transparent;
      border-radius: 0.25rem;
      padding: 15px 20px;
      direction: rtl;
      font-size: 20px;
      font-weight: 500;
      text-align: center;
    }
    </style>'''

    # Collect the fragments and join once instead of growing a string.
    parts = [style, '<div class="content">']

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        for suggest in suggestions:
            parts.append(f'<div class="word suggest">{suggest}</div>')
        parts.append('<div class="separator"></div>')

    parts.append('</div>')

    return ''.join(parts)
  
# Build the Gradio UI: a title, a description, a right-to-left text box, a
# button and an HTML output area. The custom CSS passed to gr.Blocks styles
# those elements.
# NOTE(review): "#component-112" looks like an auto-generated element id and
# may depend on the Gradio version / component creation order -- confirm.
with gr.Blocks(css="""
 #input{direction: rtl;}
 #component-112{height: 30px;}
 .gr-form{margin-top: 15px;}
 .gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;} 
 .text-gray-500{font-size: 16px; margin-bottom: 13px;}
 .gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
  border: 1px solid transparent; border-radius: 0.25rem; 
  padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
  .output-html{min-height: 2rem;}          
  .title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
  line-height: 1.5;font-family: 'IBM Plex Mono';}
  .desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}""") as demo:

    # Static header elements.
    intro =  gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')
    # Input box; elem_id="input" matches the "#input" rule in the CSS above.
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    # Receives the HTML string returned by ar_spelling_checker.
    output = gr.HTML()

    # On click, call ar_spelling_checker with the text box value and render
    # its return value in the HTML output component.
    btn.click(ar_spelling_checker, [text], output)

# Start the app when the module is executed.
demo.launch(inline=False)