File size: 3,769 Bytes
dbdb640
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
This module is taken with slight modifications from:
https://github.com/PrithivirajDamodaran/Gramformer/blob/main/gramformer/gramformer.py
"""

import html
import re

import pandas as pd
from annotated_text import annotated_text
from bs4 import BeautifulSoup


def show_highlights(annotator, input_text, corrected_sentence):
    """Render ``input_text`` with its corrections highlighted via ``annotated_text``.

    The markup returned by ``highlight`` is split into plain text and
    ``<d>``/``<a>``/``<c>`` fragments. Each fragment becomes a
    ``(text, label, colour)`` annotation; everything else is passed through
    as an ordinary string.

    Args:
        annotator: Forwarded unchanged to ``highlight``.
        input_text: The original sentence.
        corrected_sentence: The corrected sentence.

    Returns:
        None. The annotations are handed to ``annotated_text``.
    """

    def strikeout(text):
        # Put a combining long-stroke overlay (U+0336) after every character.
        return "\u0336".join(text) + "\u0336"

    highlight_text = highlight(annotator, input_text, corrected_sentence)
    color_map = {"d": "#faa", "a": "#afa", "c": "#fea"}
    tokens = re.split(r"(<[dac]\s.*?<\/[dac]>)", highlight_text)
    annotations = []
    for token in tokens:
        # ``find_all`` is the supported spelling; ``findAll`` is a legacy alias.
        tags = BeautifulSoup(token, "html.parser").find_all()
        tag = tags[0] if tags else None
        if tag is None or tag.name not in color_map:
            annotations.append(token)
            continue
        edit_kind = tag.get("type")
        replacement = tag.get("edit")
        if edit_kind is None or replacement is None:
            # Markup that merely looks like an edit tag (for example typed by
            # the user) lacks these attributes; show it verbatim instead of
            # failing with a KeyError.
            annotations.append(token)
            continue
        # Deletions show the removed text struck out; insertions and
        # replacements show the text stored in the ``edit`` attribute.
        shown = strikeout(tag.text) if tag.name == "d" else replacement
        annotations.append((shown, edit_kind, color_map[tag.name]))
    annotated_text(*annotations)


def show_edits(annotator, input_text, corrected_sentence):
    """Return the edits between the two sentences as a table indexed by type.

    Args:
        annotator: Forwarded unchanged to ``get_edits``.
        input_text: The original sentence.
        corrected_sentence: The corrected sentence.

    Returns:
        A ``pandas.DataFrame`` with one row per edit tuple from ``get_edits``,
        using the ``type`` column as its index.
    """
    column_names = [
        "type",
        "original word",
        "original start",
        "original end",
        "correct word",
        "correct start",
        "correct end",
    ]
    rows = get_edits(annotator, input_text, corrected_sentence)
    table = pd.DataFrame(rows, columns=column_names)
    return table.set_index("type")


def _edit_tag(tag, edit_type, replacement, text):
    """Build one ``<tag type='...' edit='...'>text</tag>`` marker, HTML-escaping every value."""
    return (
        f"<{tag} type='{html.escape(edit_type)}' "
        f"edit='{html.escape(replacement)}'>{html.escape(text)}</{tag}>"
    )


def highlight(annotator, orig, cor):
    """Return ``orig`` with every edit towards ``cor`` wrapped in a marker tag.

    ``orig`` is split on whitespace and each edit reported by ``get_edits``
    replaces the token at its start position with one of:

    * ``<a type=T edit=E>anchor</a>`` when the original string is empty
      (an insertion). ``anchor`` is the neighbouring original token; ``E`` is
      the inserted text alone for type ``PUNCT``, otherwise the anchor and the
      inserted text in reading order.
    * ``<d type=T edit=''>original</d>`` when the corrected string is empty.
    * ``<c type=T edit=corrected>original</c>`` otherwise.

    Attribute values and tag text are HTML-escaped, so quotes or angle
    brackets inside an edit cannot break the markup.

    Args:
        annotator: Forwarded unchanged to ``get_edits``.
        orig: The original sentence.
        cor: The corrected sentence.

    Returns:
        The marked-up tokens joined by single spaces.
    """
    edits = get_edits(annotator, orig, cor)
    # NOTE(review): edit positions are used as indexes into ``orig.split()``;
    # presumably the annotator tokenizes on the same boundaries. Confirm,
    # otherwise positions can drift or fall outside the list.
    source_tokens = orig.split()
    # Markers are written into a copy so anchor words are always read from the
    # untouched tokens and never from markup produced by an earlier edit.
    marked_tokens = list(source_tokens)
    ignore_indexes = set()
    for edit_type, edit_str_start, edit_spos, edit_epos, edit_str_end, *_ in edits:
        # Every token after the first one of a multi-token span is already
        # contained in the marker text, so it is dropped afterwards.
        ignore_indexes.update(range(edit_spos + 1, edit_epos))
        if edit_str_start == "":
            if edit_spos >= 1:
                # Attach the insertion to the token that precedes it.
                edit_spos -= 1
                anchor = source_tokens[edit_spos]
                combined = f"{anchor} {edit_str_end}"
            elif source_tokens:
                # Insertion before the first token: attach it to that token
                # and keep the inserted text in front of it.
                anchor = source_tokens[0]
                combined = f"{edit_str_end} {anchor}"
            else:
                # The original text has no tokens; create an empty slot so the
                # insertion can still be reported.
                anchor = ""
                combined = edit_str_end
                if not marked_tokens:
                    marked_tokens.append("")
            replacement = edit_str_end if edit_type == "PUNCT" else combined
            st = _edit_tag("a", edit_type, replacement, anchor)
        elif edit_str_end == "":
            st = _edit_tag("d", edit_type, "", edit_str_start)
        else:
            st = _edit_tag("c", edit_type, edit_str_end, edit_str_start)
        marked_tokens[edit_spos] = st
    # Delete from the back so the remaining indexes stay valid.
    for i in sorted(ignore_indexes, reverse=True):
        del marked_tokens[i]
    return " ".join(marked_tokens)


def get_edits(annotator, orig, cor):
    """Collect the classified edits that turn ``orig`` into ``cor``.

    Both texts are parsed with ``annotator.parse``, aligned with
    ``annotator.align`` and merged with ``annotator.merge``; each merged edit
    is then passed through ``annotator.classify``.

    Args:
        annotator: Object providing ``parse``, ``align``, ``merge`` and
            ``classify`` (presumably an ERRANT annotator; confirm with caller).
        orig: The original text.
        cor: The corrected text.

    Returns:
        A list of ``(type, o_str, o_start, o_end, c_str, c_start, c_end)``
        tuples, where ``type`` is the classified type without its first two
        characters (presumably an operation prefix such as ``"R:"``; confirm).
        The list is empty when there are no edits.
    """
    parsed_orig = annotator.parse(orig)
    parsed_cor = annotator.parse(cor)
    merged = annotator.merge(annotator.align(parsed_orig, parsed_cor))
    if not merged:
        return []
    # A generator keeps classification interleaved with the attribute reads.
    classified = (annotator.classify(raw_edit) for raw_edit in merged)
    return [
        (
            item.type[2:],
            item.o_str,
            item.o_start,
            item.o_end,
            item.c_str,
            item.c_start,
            item.c_end,
        )
        for item in classified
    ]