|
import pickle
|
|
import math
|
|
import os
|
|
|
|
from .normalizer import Normalizer
|
|
from .tokenizer import Tokenizer
|
|
from .data_helper import DataHelper
|
|
|
|
|
|
class SpellCheck:
|
|
def __init__(self):
|
|
self.normalizer = Normalizer()
|
|
self.tokenizer = Tokenizer()
|
|
self.data_helper = DataHelper()
|
|
|
|
self.dir_path = os.path.dirname(os.path.realpath(__file__)) + "/"
|
|
|
|
self.bigram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/mybigram_lm.pckl")
|
|
self.onegram_lm = self.data_helper.load_var(self.dir_path + "resource/spell/onegram.pckl")
|
|
self.ingroup_chars = [{'ا', 'آ', 'ع'},
|
|
{'ت', 'ط'},
|
|
{'ث', 'س', 'ص'},
|
|
{'ح', 'ه'},
|
|
{'ذ', 'ز', 'ض', 'ظ'},
|
|
{'ق', 'غ'}]
|
|
|
|
def deletion(self, word):
|
|
p_list = []
|
|
for k in range(len(word)):
|
|
if word[k] == '-' or word[k] == '#':
|
|
continue
|
|
begin = word[:k]
|
|
end = word[k+1:]
|
|
tmp_string = begin + end
|
|
p_list.append(tmp_string)
|
|
return p_list
|
|
|
|
def splitting(self, word):
|
|
p_list = set([])
|
|
delimator = '-'
|
|
for i, char in enumerate(word):
|
|
begin = word[:i].strip('\u200c')
|
|
end = word[i:].strip('\u200c')
|
|
tmp_string = begin + delimator + end
|
|
p_list.add(tmp_string)
|
|
return list(p_list)
|
|
|
|
def insertion(self, word):
|
|
p_list = []
|
|
alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ',
|
|
'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط',
|
|
'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
|
|
'و', 'ه', 'ی', '']
|
|
for k in range(len(word)+1):
|
|
for char in alphabet:
|
|
begin = word[:k]
|
|
end = word[k:]
|
|
tmp_string = begin + char + end
|
|
p_list.append(tmp_string)
|
|
return p_list
|
|
|
|
def substitution(self, word):
|
|
p_list = []
|
|
alphabet = ['ا', 'آ', 'ب', 'پ', 'ت', 'ث', 'ج', 'چ', 'ح', 'خ',
|
|
'د', 'ذ', 'ر', 'ز', 'ژ', 'س', 'ش', 'ص', 'ض', 'ط',
|
|
'ظ', 'ع', 'غ', 'ف', 'ق', 'ک', 'گ', 'ل', 'م', 'ن',
|
|
'و', 'ه', 'ی']
|
|
for i, char in enumerate(word):
|
|
if char == '-' or char == '#':
|
|
continue
|
|
for c in alphabet:
|
|
begin = word[:i]
|
|
end = word[i+1:]
|
|
tmp_string = begin + c + end
|
|
p_list.append(tmp_string)
|
|
return p_list
|
|
|
|
def transposition(self, word):
|
|
p_list = []
|
|
word = list(word)
|
|
tmp_word = word[:]
|
|
for k1 in range(len(word)):
|
|
k2 = k1 + 1
|
|
if k2 == len(word):
|
|
break
|
|
tmp = tmp_word[k1]
|
|
tmp_word[k1] = tmp_word[k2]
|
|
tmp_word[k2] = tmp
|
|
tmp_string = ''.join(tmp_word)
|
|
p_list.append(tmp_string)
|
|
tmp_word = word[:]
|
|
return p_list
|
|
|
|
def build_similar_words(self, word_seq, index, zi, operation):
|
|
z_list = []
|
|
o_list = []
|
|
|
|
if operation == "Spell":
|
|
tmp = self.deletion(zi)
|
|
for i in tmp:
|
|
z_list.append(i)
|
|
o_list.append("Deletion")
|
|
tmp = self.insertion(zi)
|
|
for i in tmp:
|
|
z_list.append(i)
|
|
o_list.append("Insertion")
|
|
tmp = self.substitution(zi)
|
|
for i in tmp:
|
|
z_list.append(i)
|
|
o_list.append("Substitution")
|
|
tmp = self.transposition(zi)
|
|
for i in tmp:
|
|
z_list.append(i)
|
|
o_list.append("Transposition")
|
|
|
|
elif operation == "Split":
|
|
tmp = self.splitting(zi)
|
|
for i in tmp:
|
|
z_list.append(i)
|
|
o_list.append("Split")
|
|
|
|
elif operation == "Merg":
|
|
if index < len(word_seq)-1:
|
|
tmp = zi + '#' + word_seq[index+1]
|
|
z_list.append(tmp)
|
|
o_list.append("Merg")
|
|
|
|
return [z_list, o_list]
|
|
|
|
def bigram_markov_factor(self, yi_1, yi):
|
|
bigram_counts, total_count = self.bigram_lm
|
|
tmp = (yi_1, yi)
|
|
if tmp in bigram_counts.keys():
|
|
x = bigram_counts[tmp]
|
|
x = float(x)/total_count
|
|
x = math.log2(x)
|
|
return x
|
|
else:
|
|
return -28
|
|
|
|
def get_word_probability(self, word):
|
|
lex_dict = self.onegram_lm[0]
|
|
total_words = self.onegram_lm[1]
|
|
|
|
if word in lex_dict:
|
|
count = lex_dict[word]
|
|
logprob = math.log2(float(count)/total_words)
|
|
return logprob
|
|
else:
|
|
return -50.0
|
|
|
|
def isword(self, x):
|
|
if abs(x.find('#') - x.find('-')) == 1:
|
|
return False
|
|
|
|
dash_idx = x.find('-')
|
|
if dash_idx != -1:
|
|
first = x[:dash_idx]
|
|
secound = x[dash_idx+1:]
|
|
if self.get_word_probability(first) < -49:
|
|
return False
|
|
elif self.get_word_probability(secound) < -49:
|
|
return False
|
|
else:
|
|
return True
|
|
|
|
sharp_idx = x.find('#')
|
|
if sharp_idx != -1:
|
|
begin = x[:sharp_idx]
|
|
end = x[sharp_idx+1:]
|
|
tmp_str = begin + end
|
|
if self.get_word_probability(tmp_str) < -49:
|
|
return False
|
|
else:
|
|
return True
|
|
else:
|
|
if self.get_word_probability(x) < -49:
|
|
return False
|
|
else:
|
|
return True
|
|
|
|
def get_possible_words(self, word_seq, index):
|
|
wi = word_seq[index]
|
|
possible_words = []
|
|
operation_list = []
|
|
|
|
possible_words.append(wi)
|
|
operation_list.append("Nothing")
|
|
|
|
if len(wi) == 1:
|
|
return possible_words, operation_list
|
|
|
|
'''Merg Split Spell'''
|
|
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Merg")
|
|
for i, c in enumerate(c_list):
|
|
if self.isword(c):
|
|
possible_words.append(c)
|
|
operation_list.append(o_list[i])
|
|
|
|
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Split")
|
|
for i, c in enumerate(c_list):
|
|
if self.isword(c):
|
|
possible_words.append(c)
|
|
operation_list.append(o_list[i])
|
|
|
|
[c_list, o_list] = self.build_similar_words(word_seq, index, wi, "Spell")
|
|
for i, c in enumerate(c_list):
|
|
if self.isword(c):
|
|
possible_words.append(c)
|
|
operation_list.append(o_list[i])
|
|
|
|
return possible_words, operation_list
|
|
|
|
def select_n_best(self, c_list, o_list, n=3):
|
|
my_dict = {}
|
|
map_dict = {}
|
|
for i, word in enumerate(c_list):
|
|
if o_list[i] == 'Merg':
|
|
tmp_word = word.replace("#", "")
|
|
prob = self.get_word_probability(tmp_word)
|
|
|
|
elif o_list[i] == 'Split':
|
|
begin = word.split('-')[0]
|
|
end = word.split('-')[1]
|
|
prob = float(self.get_word_probability(begin) + self.get_word_probability(end))/2
|
|
|
|
else:
|
|
prob = self.get_word_probability(word)
|
|
|
|
if word not in my_dict:
|
|
my_dict[word] = prob
|
|
map_dict[word] = o_list[i]
|
|
|
|
n_best = set(sorted(my_dict, key=my_dict.get, reverse=True)[:n])
|
|
n_best.add(c_list[0])
|
|
n_best = list(n_best)
|
|
n_best_op = [map_dict[key] for key in n_best]
|
|
return n_best, n_best_op
|
|
|
|
def is_ingroup_substitution(self, main_word, candidate_word):
|
|
main_word = list(main_word)
|
|
candidate_word = list(candidate_word)
|
|
flag = False
|
|
for i, c in enumerate(main_word):
|
|
if c == candidate_word[i]:
|
|
continue
|
|
else:
|
|
flag = False
|
|
for l in self.ingroup_chars:
|
|
if c in l and candidate_word[i] in l:
|
|
flag = True
|
|
break
|
|
break
|
|
return flag
|
|
|
|
def select_correct_spell(self, candidate_list, next_candidates, next_next_candidates, prev_word, current_word):
|
|
best_candidate = None
|
|
best_operation = None
|
|
best_score = -1000
|
|
next_next_candidate_list = []
|
|
next_next_operation_list = []
|
|
|
|
candidate_list, operation_list = candidate_list
|
|
if next_candidates is not None:
|
|
next_candidate_list, next_operation_list = next_candidates
|
|
else:
|
|
next_candidate_list, next_operation_list = [None], "Nothing"
|
|
|
|
if next_next_candidates is not None:
|
|
next_next_candidate_list, next_next_operation_list = next_next_candidates
|
|
else:
|
|
next_candidate_list, next_operation_list = [None], "Nothing"
|
|
|
|
for i, candidate in enumerate(candidate_list):
|
|
operation = operation_list[i]
|
|
|
|
if operation == "Split":
|
|
begin = candidate[:candidate.find('-')]
|
|
end = candidate[candidate.find('-')+1:]
|
|
candidate = begin
|
|
next_word = end
|
|
|
|
onegram_score = self.get_word_probability(candidate)
|
|
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
|
|
|
|
bigram_score_next = -1000
|
|
tmp_score_next = self.bigram_markov_factor(candidate, next_word)
|
|
for j, next_next_word in enumerate(next_candidate_list):
|
|
opt = next_operation_list[j]
|
|
if opt == 'Merg':
|
|
next_next_word = next_next_word.replace("#", "")
|
|
elif opt == 'Split':
|
|
next_next_word = next_next_word.split('-')[0]
|
|
|
|
tmp_score_next_next = self.bigram_markov_factor(next_word, next_next_word)
|
|
if tmp_score_next_next > bigram_score_next:
|
|
bigram_score_next = tmp_score_next_next
|
|
bigram_score_next = float(bigram_score_next + tmp_score_next)/2
|
|
|
|
elif operation == "Merg":
|
|
begin = candidate[:candidate.find('#')]
|
|
end = candidate[candidate.find('#')+1:]
|
|
candidate = begin + end
|
|
|
|
onegram_score = self.get_word_probability(candidate)
|
|
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
|
|
|
|
bigram_score_next = -1000
|
|
for j, next_next_word in enumerate(next_next_candidate_list):
|
|
opt = next_next_operation_list[j]
|
|
if opt == 'Merg':
|
|
next_next_word = next_next_word.replace("#", "")
|
|
elif opt == 'Split':
|
|
next_next_word = next_next_word.split('-')[0]
|
|
|
|
tmp_score = self.bigram_markov_factor(candidate, next_next_word)
|
|
if tmp_score > bigram_score_next:
|
|
bigram_score_next = tmp_score
|
|
|
|
else:
|
|
onegram_score = self.get_word_probability(candidate)
|
|
bigram_score_with_prev = self.bigram_markov_factor(prev_word, candidate)
|
|
|
|
bigram_score_next = -1000
|
|
for j, next_word in enumerate(next_candidate_list):
|
|
opt = next_operation_list[j]
|
|
if opt == 'Merg':
|
|
next_word = next_word.replace("#", "")
|
|
elif opt == 'Split':
|
|
next_word = next_word.split('-')[0]
|
|
|
|
tmp_score = self.bigram_markov_factor(candidate, next_word)
|
|
if tmp_score > bigram_score_next:
|
|
bigram_score_next = tmp_score
|
|
|
|
if operation == 'Substitution':
|
|
if self.is_ingroup_substitution(current_word, candidate):
|
|
onegram_score += 20
|
|
else:
|
|
onegram_score += 10
|
|
elif operation == 'Deletion' or operation == 'Insertion':
|
|
onegram_score += 5
|
|
if '\u200c' in candidate and '\u200c' not in current_word:
|
|
onegram_score += 5
|
|
elif operation == 'Split' or operation == 'Merg':
|
|
onegram_score += 7
|
|
elif operation == 'Nothing':
|
|
onegram_score += 20
|
|
|
|
score = 1*onegram_score + 0.7*bigram_score_with_prev + 0.7*bigram_score_next
|
|
|
|
if score > best_score:
|
|
best_operation = operation
|
|
best_candidate = candidate_list[i]
|
|
best_score = score
|
|
|
|
return best_candidate, best_operation
|
|
|
|
def spell_corrector(self, doc_string):
|
|
words = self.tokenizer.tokenize_words(self.normalizer.normalize(doc_string))
|
|
|
|
best_o_list = []
|
|
best_candidates_list = []
|
|
|
|
yi_1 = None
|
|
merged_before = False
|
|
|
|
suggest_list = []
|
|
|
|
for i, word in enumerate(words):
|
|
[c_list, o_list] = self.get_possible_words(words, i)
|
|
n_best = self.select_n_best(c_list, o_list, n=15)
|
|
suggest_list.append(n_best)
|
|
|
|
for i, candidate_list in enumerate(suggest_list):
|
|
|
|
if merged_before:
|
|
continue
|
|
|
|
if (i+2) < len(suggest_list):
|
|
next_candidates = suggest_list[i+1]
|
|
next_next_candidates = suggest_list[i+2]
|
|
elif (i+1) < len(suggest_list):
|
|
next_candidates = suggest_list[i+1]
|
|
next_next_candidates = None
|
|
else:
|
|
next_candidates = None
|
|
next_next_candidates = None
|
|
|
|
best_candidate, best_operation = self.select_correct_spell(candidate_list, next_candidates,
|
|
next_next_candidates, yi_1, words[i])
|
|
|
|
merged_before = False
|
|
if best_operation == "Split":
|
|
begin = best_candidate.split('-')[0]
|
|
end = best_candidate.split('-')[1]
|
|
best_candidate = [begin, end]
|
|
if best_operation == "Merg":
|
|
best_candidate = best_operation.replace("#", "")
|
|
merged_before = True
|
|
if type(best_candidate) == str:
|
|
best_candidate = [best_candidate]
|
|
|
|
best_o_list.append(best_operation)
|
|
best_candidates_list.extend(best_candidate)
|
|
yi_1 = best_candidate[-1]
|
|
|
|
res = " ".join(best_candidates_list)
|
|
ops = " ".join(best_o_list)
|
|
|
|
return res
|
|
|
|
|
|
if __name__ == "__main__":
|
|
doc_string = "نمازگذاران وارد مسلی شدند."
|
|
myspell_checker = SpellCheck()
|
|
|
|
res = myspell_checker.spell_corrector(doc_string)
|
|
print(res)
|
|
|