mohamedabdullah committed on
Commit
dcde80d
1 Parent(s): 5223f7d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from datasets import load_dataset
3
+ import re
4
+
5
# Fetch the vocabulary text file from the Hugging Face Hub dataset repository.
dataset = load_dataset("mohamedabdullah/Arabic-unique-words", data_files="ar_vocab.txt")

# Extract runs of 2-25 word characters that are neither ASCII letters nor
# ASCII digits. The pattern is a raw string so that ``\s`` and ``\W`` reach the
# regex engine verbatim instead of being treated as (invalid) string escapes.
# NOTE(review): only row 0 of the "train" split is scanned; presumably the
# vocabulary file is a single line of text - confirm against the dataset.
word_l = re.findall(r'[^a-zA-Z0-9\s\W]{2,25}', dataset['train']['text'][0])

# A set gives constant-time membership checks for the spell checker.
vocab = set(word_l)
8
+
9
def delete_letter(word):
    """Return all strings formed by dropping exactly one character of ``word``.

    Results are ordered by the index of the removed character. Duplicates are
    kept, and an empty ``word`` produces an empty list.
    """
    shortened = []
    for position, _ in enumerate(word):
        shortened.append(word[:position] + word[position + 1:])
    return shortened
11
+
12
def switch_letter(word):
    """Return all strings formed by swapping two adjacent characters of ``word``.

    Results are ordered by the index of the first swapped character, so a word
    of length ``n`` yields ``n - 1`` strings (none for lengths 0 and 1).

    The swap works directly on the characters of ``word``. Every character,
    including ones that are not regex word characters, is preserved in the
    output, and no regex has to be re-evaluated on each iteration.
    """
    return [
        word[:i] + word[i + 1] + word[i] + word[i + 2:]
        for i in range(len(word) - 1)
    ]
25
+
26
def replace_letter(word):
    """Return the sorted, de-duplicated single-substitution variants of ``word``.

    Each position of ``word`` is replaced in turn by every character of the
    Arabic alphabet string below. The unchanged word itself is never part of
    the result. An empty ``word`` produces an empty list.
    """
    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    variants = {
        word[:position] + letter + word[position + 1:]
        for position in range(len(word))
        for letter in letters
    }
    # Substituting a character with itself reproduces the input, which is not
    # an edit, so it is removed from the candidates.
    variants.discard(word)

    return sorted(variants)
41
+
42
def insert_letter(word):
    """Return all strings formed by inserting one Arabic letter into ``word``.

    Every gap is visited left to right (before the first character, between
    characters, after the last one), and for each gap the letters are tried in
    the order of the alphabet string below. Duplicates are kept.
    """
    letters = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ'

    return [
        word[:gap] + letter + word[gap:]
        for gap in range(len(word) + 1)
        for letter in letters
    ]
52
+
53
def edit_one_letter(word, allow_switches=True):
    """Return the set of strings that are one edit away from ``word``.

    An edit is a single deletion, insertion or replacement. Swaps of adjacent
    characters are included as well unless ``allow_switches`` is false.
    """
    candidates = set(delete_letter(word))
    candidates.update(insert_letter(word))
    candidates.update(replace_letter(word))

    if allow_switches:
        candidates.update(switch_letter(word))

    return candidates
60
+
61
def edit_two_letters(word, allow_switches=True):
    """Return the set of strings reachable from ``word`` in one or two edits.

    ``allow_switches`` is forwarded to both rounds of ``edit_one_letter`` so
    that disabling adjacent-character swaps actually takes effect.
    """
    first_round = edit_one_letter(word, allow_switches)

    # Accumulate straight into a set: the second round produces a very large
    # number of duplicates that would otherwise all be held in a list.
    reachable = set(first_round)
    for edit in first_round:
        reachable |= edit_one_letter(edit, allow_switches)

    return reachable
69
+
70
def get_corrections(word, vocab):
    """Return the set of vocabulary entries that best match ``word``.

    The candidates are tried from cheapest to most expensive and the first
    non-empty group wins:

    1. ``word`` itself, when it is in ``vocab``;
    2. vocabulary entries one edit away;
    3. vocabulary entries up to two edits away.

    Each group is only computed when the previous one is empty, so the costly
    two-edit expansion is skipped whenever a closer match exists. An empty set
    is returned when nothing matches.
    """
    if word in vocab:
        return {word}

    one_edit_matches = {
        candidate for candidate in edit_one_letter(word) if candidate in vocab
    }
    if one_edit_matches:
        return one_edit_matches

    return {
        candidate for candidate in edit_two_letters(word) if candidate in vocab
    }
80
+
81
def min_edit_distance(source, target, ins_cost=1, del_cost=1, rep_cost=2):
    """Return the minimum cost of turning ``source`` into ``target``.

    The cost is computed by dynamic programming over insertions
    (``ins_cost``), deletions (``del_cost``) and replacements (``rep_cost``);
    matching characters cost nothing.

    The implementation uses plain Python lists and the built-in ``min``, so it
    has no dependency on numpy. Only the previous row of the table is kept,
    because each cell depends solely on its left, upper and upper-left
    neighbours.
    """
    # Row 0: an empty source becomes each target prefix through insertions only.
    previous_row = [col * ins_cost for col in range(len(target) + 1)]

    for row, source_char in enumerate(source, start=1):
        # Column 0: a source prefix becomes the empty target through deletions.
        current_row = [row * del_cost]

        for col, target_char in enumerate(target, start=1):
            substitution = 0 if source_char == target_char else rep_cost
            current_row.append(min(
                previous_row[col] + del_cost,
                current_row[col - 1] + ins_cost,
                previous_row[col - 1] + substitution,
            ))

        previous_row = current_row

    return previous_row[-1]
104
+
105
def get_suggestions(corrections, word):
    """Return ``corrections`` as a list ordered from closest to farthest.

    Closeness is the ``min_edit_distance`` from ``word`` to each correction.
    Corrections at the same distance are ordered by their text, so the output
    does not depend on the iteration order of ``corrections``. An empty
    ``corrections`` yields an empty list.
    """
    return sorted(
        corrections,
        key=lambda correction: (min_edit_distance(word, correction), correction),
    )
119
+
120
# Inline stylesheet emitted at the top of every rendered result.
_RESULT_STYLE = '''<style>
.content{
direction: rtl;
}
.word{
color: #842029;
background-color: #f8d7da;
border-color: #f5c2c7;
padding: 10px 20px;
display: inline-block;
direction: rtl;
font-size: 15px;
font-weight: 500;
margin-bottom: 15px;
box-sizing: border-box;
border: 1px solid transparent;
border-radius: 0.25rem;
}

.suggest{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
display: inline-block;
margin-right: 5px;
}

.separator{
height:3px;
background: #CCC;
margin-bottom: 15px;
}

.msg{
color: #0f5132;
background-color: #d1e7dd;
border-color: #badbcc;
border: 1px solid transparent;
border-radius: 0.25rem;
padding: 15px 20px;
direction: rtl;
font-size: 20px;
font-weight: 500;
text-align: center;
}
</style>'''


def ar_spelling_checker(text):
    """Spell-check ``text`` and return the findings as an HTML fragment.

    Every run of three or more word characters is looked up in the
    module-level ``vocab``. Unknown words that have at least one correction
    are listed together with their suggestions, closest first. Unknown words
    without any correction are not reported. When nothing is reported, a
    single message box ("no spelling errors") is rendered instead.
    """
    result = {}

    # Raw string: ``\w`` must reach the regex engine, not the string parser.
    for word in re.findall(r'\w{3,}', text):
        # Known words need no work, and a repeated misspelling is only
        # looked up once.
        if word in vocab or word in result:
            continue

        corrections = get_corrections(word, vocab)
        if corrections:
            result[word] = get_suggestions(corrections, word)

    # Collect the fragments and join once instead of growing a string.
    parts = [_RESULT_STYLE, '<div class="content">']

    if not result:
        parts.append('<div class="msg">لا توجد أخطاء إملائية 🤗</div>')

    for word, suggestions in result.items():
        parts.append(f'<div class="word">{word}</div><br />')
        parts.extend(
            f'<div class="word suggest">{suggest}</div>' for suggest in suggestions
        )
        parts.append('<div class="separator"></div>')

    parts.append('</div>')

    return ''.join(parts)
193
+
194
# Page-level stylesheet handed to the Blocks container.
_PAGE_CSS = """#input{direction: rtl;}
#component-112{height: 30px;}
.gr-form{margin-top: 15px;}
.gr-text-input{font-size: 17px; height:50px; padding: 0.725rem;}
.text-gray-500{font-size: 16px; margin-bottom: 13px;}
.gr-button{color: #084298; background-color: #cfe2ff; border-color: #b6d4fe;
border: 1px solid transparent; border-radius: 0.25rem;
padding: 15px 20px; font-size: 20px; font-weight: 500; font-family: 'IBM Plex Mono';}
.output-html{min-height: 2rem;}
.title{text-align: center;font-size: 25px;margin-top: 13px;position: absolute;width:100%;
line-height: 1.5;font-family: 'IBM Plex Mono';}
.desc{text-align: center; font-size: 17px; font-family: 'IBM Plex Mono'; margin-top: 46px;}"""

with gr.Blocks(css=_PAGE_CSS) as demo:
    # Static header: page title and a one-line description.
    intro = gr.HTML('<h1 class="title">Arabic Spelling Checker 🤗</h1>')
    description = gr.HTML('<p class="desc">Web-based app to detect spelling mistakes in Arabic words using dynamic programming</p>')

    # Input box, trigger button and the HTML area that shows the findings.
    text = gr.Textbox(label="النص", elem_id="input")
    btn = gr.Button("Spelling Check")
    output = gr.HTML()

    # Clicking the button runs the checker on the textbox content and renders
    # the returned HTML fragment.
    btn.click(fn=ar_spelling_checker, inputs=[text], outputs=output)

# Start the web server for the interface.
demo.launch()