harshildarji committed on
Commit
50dc05d
1 Parent(s): d82feeb
Files changed (3) hide show
  1. README.md +4 -2
  2. app.py +213 -0
  3. requirements.txt +2 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: RE With NER
3
- emoji: 😻
4
  colorFrom: gray
5
  colorTo: yellow
6
  sdk: streamlit
@@ -10,4 +10,6 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
  title: RE With NER
3
+ emoji: 🔍
4
  colorFrom: gray
5
  colorTo: yellow
6
  sdk: streamlit
 
10
  license: mit
11
  ---
12
 
13
+ Models used:
14
+ - https://huggingface.co/harshildarji/privacy-policy-relation-extraction
15
+ - https://huggingface.co/PaDaS-Lab/gdpr-privacy-policy-ner
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import streamlit as st
3
+ from nltk.tokenize import sent_tokenize
4
+ from transformers import pipeline
5
+
6
st.set_page_config(page_title="Relation Extraction App", page_icon="🔍", layout="wide")


# Streamlit re-executes this script on every widget interaction. The loaders
# below are wrapped in st.cache_resource so the tokenizer data download and the
# two model loads are performed once and then reused across reruns, instead of
# being repeated on every button click.
@st.cache_resource
def _ensure_punkt():
    """Fetch the NLTK "punkt" data needed by ``sent_tokenize``."""
    return nltk.download("punkt")


@st.cache_resource
def _load_relation_pipe():
    """Build the sentence-level relation classification pipeline."""
    # NOTE(review): newer transformers releases document `top_k=None` as the
    # replacement for `return_all_scores=True` — confirm before switching.
    return pipeline(
        "text-classification",
        model="harshildarji/privacy-policy-relation-extraction",
        return_all_scores=True,
        framework="pt",
    )


@st.cache_resource
def _load_ner_pipe():
    """Build the GDPR named-entity recognition pipeline."""
    return pipeline(
        "token-classification",
        model="PaDaS-Lab/gdpr-privacy-policy-ner",
        aggregation_strategy="simple",
        framework="pt",
    )


_ensure_punkt()

# Module-level names kept unchanged: the functions below read these globals.
relation_pipe = _load_relation_pipe()
ner_pipe = _load_ner_pipe()
22
+
23
# Maps the short entity-group codes looked up in annotate_sentence to the
# human-readable names shown in the hover tooltips.
classes_gdpr = {
    "DC": "Data Controller",
    "DP": "Data Processor",
    "DPO": "Data Protection Officer",
    "R": "Recipient",
    "TP": "Third Party",
    "A": "Authority",
    "DS": "Data Subject",
    "DSO": "Data Source",
    "RP": "Required Purpose",
    "NRP": "Not-Required Purpose",
    "P": "Processing",
    "NPD": "Non-Personal Data",
    "PD": "Personal Data",
    "OM": "Organisational Measure",
    "TM": "Technical Measure",
    "LB": "Legal Basis",
    "CONS": "Consent",
    "CONT": "Contract",
    "LI": "Legitimate Interest",
    "ADM": "Automated Decision Making",
    "RET": "Retention",
    "SEU": "Scale EU",
    "SNEU": "Scale Non-EU",
    "RI": "Right",
    "DSR15": "Art. 15 Right of access by the data subject",
    "DSR16": "Art. 16 Right to rectification",
    "DSR17": "Art. 17 Right to erasure (‘right to be forgotten’)",
    "DSR18": "Art. 18 Right to restriction of processing",
    # "Art. 19" prefix added so this entry matches its DSRnn siblings.
    "DSR19": "Art. 19 Notification obligation regarding rectification or erasure of personal data or restriction of processing",
    "DSR20": "Art. 20 Right to data portability",
    "DSR21": "Art. 21 Right to object",
    "DSR22": "Art. 22 Automated individual decision-making, including profiling",
    "LC": "Lodge Complaint",
}
58
+
59
+
60
@st.cache_data
def classify_sentences(text):
    """Split ``text`` into sentences and classify each one.

    Returns:
        A ``(sentences, results)`` tuple: the list produced by
        ``sent_tokenize`` and whatever ``relation_pipe`` returns for it.
    """
    split_text = sent_tokenize(text)
    return split_text, relation_pipe(split_text)
65
+
66
+
67
@st.cache_data
def get_ner_annotations(sentence):
    """Return the output of ``ner_pipe`` for a single ``sentence``."""
    return ner_pipe(sentence)
71
+
72
+
73
def annotate_sentence(sentence, ner_results, label_names=None):
    """Render ``sentence`` as HTML with every NER entity underlined.

    Each entity is wrapped in a ``tooltip`` span whose nested ``tooltiptext``
    span carries the entity's display name.

    Args:
        sentence: Raw sentence text that the entity offsets index into.
        ner_results: Iterable of dicts with ``"entity_group"``, ``"start"``
            and ``"end"`` keys. Assumed to be ordered by position and
            non-overlapping — TODO confirm against the NER pipeline output.
        label_names: Optional mapping from entity-group code to display name.
            Defaults to the module-level ``classes_gdpr``. Codes missing from
            the mapping are shown as-is.

    Returns:
        An HTML string. All text taken from ``sentence`` and all display
        names are HTML-escaped, because callers render the result with
        ``unsafe_allow_html=True``.
    """
    import html

    if label_names is None:
        label_names = classes_gdpr

    spans = []
    for ner in ner_results:
        group = ner["entity_group"]
        entity = label_names.get(group, group)
        start, end = ner["start"], ner["end"]

        if spans:
            prev_start, prev_end, prev_entity = spans[-1]
            # Merge only pieces of the same entity that touch or are separated
            # by whitespace. Merging across other words would underline
            # unrelated text lying between two distinct entities.
            if prev_entity == entity and not sentence[prev_end:start].strip():
                spans[-1] = (prev_start, max(prev_end, end), prev_entity)
                continue
        spans.append((start, end, entity))

    parts = []
    last_idx = 0
    for start, end, entity in spans:
        parts.append(html.escape(sentence[last_idx:start]))
        parts.append(
            "<span class='tooltip' style='text-decoration: underline;'>"
            f"{html.escape(sentence[start:end])}"
            f"<span class='tooltiptext'>{html.escape(entity)}</span></span>"
        )
        last_idx = end
    parts.append(html.escape(sentence[last_idx:]))

    return "".join(parts)
108
+
109
+
110
# Inject the page-wide CSS for the `tooltip` / `tooltiptext` classes emitted by
# annotate_sentence: the inner tooltip text is hidden by default, positioned
# above the underlined word, and made visible while the word is hovered.
st.markdown(
    """
    <style>
    .tooltip {
        position: relative;
        display: inline-block;
    }

    .tooltip .tooltiptext {
        visibility: hidden;
        width: auto;
        background-color: black;
        color: #fff;
        text-align: center;
        border-radius: 6px;
        padding: 5px;
        position: absolute;
        z-index: 1;
        bottom: 125%;
        left: 50%;
        transform: translateX(-50%);
        font-size: 12px;
        white-space: nowrap;
    }

    .tooltip:hover .tooltiptext {
        visibility: visible;
        transition: visibility 0s linear 0s;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
143
+
144
+
145
def get_top_labels(results, top_n=2):
    """Keep the ``top_n`` highest-scoring predictions for every sentence.

    Args:
        results: One list of ``{"label": ..., "score": ...}`` dicts per sentence.
        top_n: Number of predictions to keep per sentence.

    Returns:
        A list parallel to ``results``; each item holds that sentence's
        predictions sorted by descending score and cut to ``top_n`` entries.
    """

    def by_score(prediction):
        return prediction["score"]

    return [
        sorted(predictions, key=by_score, reverse=True)[:top_n]
        for predictions in results
    ]
151
+
152
+
153
st.title("Relation Extraction App")
st.sidebar.title("Identified relation labels:")

# Input area, pre-filled with a sample privacy-policy excerpt.
text = st.text_area(
    "Enter your text here:",
    value="We may use these technologies to collect information when you interact with services we offer through one of our partners, such as advertising and commerce features. Most web browsers are set to accept cookies by default. It is up to you to move or reject browser cookies through the settings on your browser or device. Removing or rejecting cookies may affect our service function and availability.",
)

analyze_clicked = st.button("Analyze")
if analyze_clicked and text:
    sentences, results = classify_sentences(text)
    top_labels = get_top_labels(results, top_n=2)

    # Group (sentence, score) pairs under each of the sentence's top labels.
    labels_dict = {}
    for sentence, result in zip(sentences, top_labels):
        for res in result:
            labels_dict.setdefault(res["label"], []).append((sentence, res["score"]))

    # Stored in session state so the grouping is still available on the rerun
    # triggered by a later sidebar button click.
    st.session_state.labels_dict = labels_dict
177
+
178
# Show the usage notes only while no analysis result has been stored in
# st.session_state under "labels_dict".
if "labels_dict" not in st.session_state:
    st.markdown(
        """
        <style>
        .hint {
            color: rgba(41, 134, 204, 0.6);
            font-size: 16px;
        }
        </style>
        <h4 class="hint">Notes:</h4>
        <ul class="hint">
            <li>Enter text in the text area above,</li>
            <li>The relation labels will be displayed in the sidebar,</li>
            <li>Click on any label to see the corresponding sentences,</li>
            <li>In displayed sentences, hover over underlined words to see their corresponding NER tag.</li>
        </ul>
        """,
        unsafe_allow_html=True,
    )
197
+
198
# Once an analysis result exists, list one sidebar button per relation label
# and render the sentences of whichever label was clicked on this run.
if "labels_dict" in st.session_state:
    labels_dict = st.session_state.labels_dict

    for label, scored_sentences in labels_dict.items():
        if not st.sidebar.button(label):
            continue

        heading_html = f"Sentences with relation label: <strong><span style='color: #FF4B4B; font-size: 1.2em;'>{label}</span></strong>"
        st.markdown(heading_html, unsafe_allow_html=True)

        for sentence, score in scored_sentences:
            # Underline the NER entities of the sentence and append its score.
            annotated_sentence = annotate_sentence(sentence, get_ner_annotations(sentence))
            sentence_html = f"<div style='background-color: rgba(143, 203, 249, 0.1); padding: 10px; border-radius: 7px; margin: 5px 0;'>{annotated_sentence} <span style='color: #C71585; font-weight: bold;'>({score:.2f})</span></div>"
            st.markdown(sentence_html, unsafe_allow_html=True)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ transformers
2
+ torch