| import difflib |
| import re |
|
|
def get_word_positions(text):
    """Locate every whitespace-delimited token in ``text``.

    Returns a list of ``(token, start, end)`` tuples in order of appearance,
    where ``start``/``end`` are character offsets into ``text`` such that
    ``text[start:end] == token``. An empty or all-whitespace string yields
    an empty list.
    """
    return [
        (match.group(), match.start(), match.end())
        for match in re.finditer(r'\S+', text)
    ]
|
|
def get_word_diffs(original, corrected):
    """Compute word-level edit suggestions that turn ``original`` into ``corrected``.

    Both texts are split into whitespace-delimited words and aligned with
    ``difflib.SequenceMatcher``. Every non-equal opcode becomes one suggestion.

    Args:
        original: The text being corrected. All offsets refer to this string.
        corrected: The corrected version of the text.

    Returns:
        A list of dicts, in left-to-right order, each with the keys:

        * ``'start'`` / ``'end'``: character offsets into ``original``
          delimiting the affected span (``start == end`` for an insertion).
        * ``'original'``: ``original[start:end]`` (``''`` for an insertion).
        * ``'correction'``: the replacement words joined by single spaces
          (``''`` for a deletion). No surrounding whitespace is included.
        * ``'type'``: always ``'generic'``.

        An insertion is anchored at the start offset of the original word it
        precedes, or at ``len(original)`` when it comes after the last word.
    """
    orig_words = get_word_positions(original)
    corr_words = get_word_positions(corrected)

    # autojunk must be off: with the default heuristic, once the second
    # sequence has 200+ items, words occurring more than ~1% of the time are
    # dropped from the match index, which can collapse a one-word fix in a
    # long text into a single replace spanning the entire text. This trades
    # some speed on very long inputs for correct alignment.
    matcher = difflib.SequenceMatcher(
        None,
        [word for word, _, _ in orig_words],
        [word for word, _, _ in corr_words],
        autojunk=False,
    )

    suggestions = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue

        if tag == 'insert':
            # Nothing is consumed from the original; produce an empty span.
            if i1 < len(orig_words):
                start_char = orig_words[i1][1]
            else:
                start_char = len(original)
            end_char = start_char
        else:
            # 'replace' and 'delete' always cover at least one original word
            # (i1 < i2), so both indices are guaranteed to be in range.
            start_char = orig_words[i1][1]
            end_char = orig_words[i2 - 1][2]

        suggestions.append({
            'start': start_char,
            'end': end_char,
            'original': original[start_char:end_char],
            # For 'delete', j1 == j2, so this is the empty string.
            'correction': " ".join(word for word, _, _ in corr_words[j1:j2]),
            'type': 'generic',
        })

    return suggestions
|
|
def test():
    """Print the suggestions produced for a sample Arabic sentence pair.

    The two sentences differ only in their final token, so this serves as a
    quick manual check of ``get_word_diffs``; nothing is asserted.
    """
    source_text = "قال محمد علي أننا حققنا نجاحا كبيرا في المشروع رغم الصعوباالصعوبات...."
    fixed_text = "قال محمد علي أننا حققنا نجاحا كبيرا في المشروع رغم الصعوبات..."

    for suggestion in get_word_diffs(source_text, fixed_text):
        print(suggestion)
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test()
|
|