File size: 6,749 Bytes
50dc05d
 
 
 
 
 
 
 
 
 
5a9c00f
50dc05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2770945
 
50dc05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c86600
50dc05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import html

import nltk
import streamlit as st
from nltk.tokenize import sent_tokenize
from transformers import pipeline

st.set_page_config(page_title="Relation Extraction App", page_icon="🔍", layout="wide")
nltk.download("punkt")


# Streamlit re-executes this script on every widget interaction. Building the
# Hugging Face pipelines at plain module level would therefore reload both
# models on each click; st.cache_resource keeps one instance per server
# process and hands it back on subsequent reruns.
@st.cache_resource
def _load_relation_pipe():
    """Build the sentence-level relation classifier pipeline."""
    return pipeline(
        "text-classification",
        model="PaDaS-Lab/privacy-policy-relation-extraction",
        return_all_scores=True,
        framework="pt",
    )


@st.cache_resource
def _load_ner_pipe():
    """Build the token-classification (NER) pipeline."""
    return pipeline(
        "token-classification",
        model="PaDaS-Lab/gdpr-privacy-policy-ner",
        aggregation_strategy="simple",
        framework="pt",
    )


relation_pipe = _load_relation_pipe()
ner_pipe = _load_ner_pipe()

# Human-readable names for the abbreviated NER tags. annotate_sentence looks
# tags up here and falls back to the raw tag when it is not listed.
classes_gdpr = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    # Article number added so this entry matches the other DSRnn labels.
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}


@st.cache_data
def classify_sentences(text):
    """Split *text* into sentences and score each one with ``relation_pipe``.

    Args:
        text: Raw text entered by the user.

    Returns:
        A ``(sentences, results)`` tuple: the list produced by
        ``sent_tokenize`` and the classifier output for that list. When the
        tokenizer finds no sentences, ``results`` is an empty list and the
        model is not called.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to classify: avoid handing the pipeline an empty batch.
        return sentences, []
    return sentences, relation_pipe(sentences)


@st.cache_data
def get_ner_annotations(sentence):
    """Return the output of ``ner_pipe`` for *sentence* (cached by Streamlit)."""
    return ner_pipe(sentence)


def annotate_sentence(sentence, ner_results):
    """Render *sentence* as HTML with each recognised entity underlined.

    Every entity becomes a ``<span class='tooltip'>`` wrapping the entity text
    and a nested ``<span class='tooltiptext'>`` holding its label. Labels are
    looked up in ``classes_gdpr``; unknown tags are shown as-is.

    Args:
        sentence: The plain-text sentence the offsets refer to.
        ner_results: Iterable of dicts with ``"entity_group"``, ``"start"``
            and ``"end"`` keys, where start/end are character offsets into
            *sentence*. Presumably ordered by position, as the rendering
            walks them left to right — verify against the pipeline output.

    Returns:
        An HTML fragment. All text taken from *sentence* and all labels are
        HTML-escaped, so the fragment is safe to render with
        ``unsafe_allow_html=True`` even when the sentence contains markup.
    """
    # Merge consecutive results that share a label, but only when nothing
    # other than whitespace separates them. Merging across ordinary words
    # would underline (and tag) text that was never labelled as an entity.
    spans = []
    for ner in ner_results:
        entity_group = ner["entity_group"]
        entity = classes_gdpr.get(entity_group, entity_group)
        start, end = ner["start"], ner["end"]

        if spans:
            prev_start, prev_end, prev_entity = spans[-1]
            if prev_entity == entity and not sentence[prev_end:start].strip():
                spans[-1] = (prev_start, end, entity)
                continue
        spans.append((start, end, entity))

    # Slice with the original offsets first, then escape each piece, so the
    # offsets stay valid while no user text reaches the page as raw HTML.
    parts = []
    last_idx = 0
    for start, end, entity in spans:
        parts.append(html.escape(sentence[last_idx:start]))
        parts.append(
            "<span class='tooltip' style='text-decoration: underline;'>"
            f"{html.escape(sentence[start:end])}"
            f"<span class='tooltiptext'>{html.escape(entity)}</span></span>"
        )
        last_idx = end
    parts.append(html.escape(sentence[last_idx:]))

    return "".join(parts)


# Stylesheet for the 'tooltip' / 'tooltiptext' classes emitted by
# annotate_sentence: the label text stays hidden until its parent span is
# hovered.
_TOOLTIP_CSS = """
    <style>
    .tooltip {
        position: relative;
        display: inline-block;
    }

    .tooltip .tooltiptext {
        visibility: hidden;
        width: auto;
        background-color: black;
        color: #fff;
        text-align: center;
        border-radius: 6px;
        padding: 5px;
        position: absolute;
        z-index: 1;
        bottom: 125%;
        left: 50%;
        transform: translateX(-50%);
        font-size: 12px;
        white-space: nowrap;
    }

    .tooltip:hover .tooltiptext {
        visibility: visible;
        transition: visibility 0s linear 0s;
    }
    </style>
    """
st.markdown(_TOOLTIP_CSS, unsafe_allow_html=True)


def get_top_labels(results, top_n=2):
    """Keep the *top_n* highest-scoring entries of every item in *results*.

    Args:
        results: Iterable of lists of dicts, each dict having a ``"score"`` key.
        top_n: Maximum number of entries kept per inner list.

    Returns:
        A list with one entry per item of *results*, each a list of at most
        *top_n* dicts ordered by descending score (ties keep input order).
    """

    def by_score(candidate):
        return candidate["score"]

    return [
        sorted(candidates, key=by_score, reverse=True)[:top_n]
        for candidates in results
    ]


# Page header, sidebar header and the text input.
st.title("Relation Extraction App")

st.sidebar.title("Identified relation labels")
st.sidebar.write("Choose one:")

text = st.text_area(
    "Enter your text here:",
    value="We may use these technologies to collect information when you interact with services we offer through one of our partners, such as advertising and commerce features. Most web browsers are set to accept cookies by default. It is up to you to move or reject browser cookies through the settings on your browser or device. Removing or rejecting cookies may affect our service function and availability.",
)

# On an "Analyze" click with non-empty text, classify every sentence and group
# the sentences under each of their two best labels. The grouping is stored in
# session state so it is still available on the reruns triggered by the
# sidebar buttons.
if st.button("Analyze") and text:
    sentences, results = classify_sentences(text)

    labels_dict = {}
    for sentence, candidates in zip(sentences, get_top_labels(results, top_n=2)):
        for candidate in candidates:
            labels_dict.setdefault(candidate["label"], []).append(
                (sentence, candidate["score"])
            )

    st.session_state["labels_dict"] = labels_dict

if "labels_dict" not in st.session_state:
    # No analysis has been stored yet: show the usage notes.
    st.markdown(
        """
        <style>
        .hint {
            color: rgba(222, 49, 99, 0.9);
            font-size: 16px;
        }
        </style>
        <h4 class="hint">Notes:</h4>
        <ul class="hint">
            <li>Enter text in the text area above,</li>
            <li>The relation labels will be displayed in the sidebar,</li>
            <li>Click on any label to see the corresponding sentences,</li>
            <li>In displayed sentences, hover over underlined words to see their corresponding NER tag.</li>
        </ul>
        """,
        unsafe_allow_html=True,
    )
else:
    # One sidebar button per label; the sentences of a label are rendered only
    # on the rerun in which its button was clicked.
    for label, scored_sentences in st.session_state["labels_dict"].items():
        if not st.sidebar.button(label):
            continue

        st.markdown(
            f"Sentences with relation label: <strong><span style='color: #FF4B4B; font-size: 1.2em;'>{label}</span></strong>",
            unsafe_allow_html=True,
        )
        for sentence, score in scored_sentences:
            annotated_sentence = annotate_sentence(
                sentence, get_ner_annotations(sentence)
            )
            st.markdown(
                f"<div style='background-color: rgba(143, 203, 249, 0.1); padding: 10px; border-radius: 7px; margin: 5px 0;'>{annotated_sentence} <span style='color: #C71585; font-weight: bold;'>({score:.2f})</span></div>",
                unsafe_allow_html=True,
            )