Improve code and add more example specific text
- app.py +82 -153
- dependency-specific-text/article11.txt +4 -4
- dependency-specific-text/article13.txt +8 -2
- dependency-specific-text/article16.txt +0 -0
- dependency-specific-text/article4.txt +7 -0
- dependency-specific-text/article9.txt +4 -0
- dependency-specific-text/biden.txt +0 -0
- dependency-specific-text/protestors.txt +0 -0
- entity-specific-text/article11.txt +4 -3
- entity-specific-text/article13.txt +5 -2
- entity-specific-text/article16.txt +4 -0
- entity-specific-text/article4.txt +0 -0
- entity-specific-text/article9.txt +0 -0
- entity-specific-text/biden.txt +0 -0
- entity-specific-text/protestors.txt +0 -0
- requirements.txt +1 -0
- {sample-articles → sample-articles-temp}/biden.txt +0 -0
- {sample-articles → sample-articles-temp}/protestors.txt +0 -0
- sample-articles/article4.txt +55 -0
- sample-articles/article9.txt +48 -0
- sample-summaries/article4.txt +1 -0
- sample-summaries/article9.txt +1 -0
- sample-summaries/biden.txt +0 -0
app.py
CHANGED

@@ -1,8 +1,10 @@
 import random
 from typing import AnyStr
+# import tensorflow_hub as hub

 import itertools
 import streamlit as st
+
 import torch.nn.parameter
 from bs4 import BeautifulSoup
 import numpy as np
@@ -15,6 +17,7 @@ from validators import ValidationFailure
 from custom_renderer import render_sentence_custom
 from flair.data import Sentence
 from flair.models import SequenceTagger
+from sentence_transformers import SentenceTransformer

 import spacy
 from spacy import displacy
@@ -25,30 +28,8 @@ from transformers import pipeline
 import os
 from transformers_interpret import SequenceClassificationExplainer

-#
-
-    'ml6team/distilbert-base-dutch-cased-toxic-comments':
-        'https://huggingface.co/ml6team/distilbert-base-dutch-cased-toxic-comments',
-    'ml6team/robbert-dutch-base-toxic-comments':
-        'https://huggingface.co/ml6team/robbert-dutch-base-toxic-comments',
-}
-
-about_page_markdown = f"""# 🤬 Dutch Toxic Comment Detection Space
-
-Made by [ML6](https://ml6.eu/).
-
-Token attribution is performed using [transformers-interpret](https://github.com/cdpierse/transformers-interpret).
-"""
-
-regular_emojis = [
-    '😐', '🙂', '👶', '😇',
-]
-undecided_emojis = [
-    '🤨', '🧐', '🥸', '🥴', '🤷',
-]
-potty_mouth_emojis = [
-    '🤐', '👿', '😡', '🤬', '☠️', '☣️', '☢️',
-]
+# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+sentence_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

 # Page setup
 st.set_page_config(
@@ -64,58 +45,6 @@ st.set_page_config(
 )


-# Model setup
-@st.cache(allow_output_mutation=True,
-          suppress_st_warning=True,
-          show_spinner=False)
-def load_pipeline(model_name):
-    with st.spinner('Loading model (this might take a while)...'):
-        toxicity_pipeline = pipeline(
-            'text-classification',
-            model=model_name,
-            tokenizer=model_name)
-        cls_explainer = SequenceClassificationExplainer(
-            toxicity_pipeline.model,
-            toxicity_pipeline.tokenizer)
-    return toxicity_pipeline, cls_explainer
-
-
-# Auxiliary functions
-def format_explainer_html(html_string):
-    """Extract tokens with attribution-based background color."""
-    inside_token_prefix = '##'
-    soup = BeautifulSoup(html_string, 'html.parser')
-    p = soup.new_tag('p',
-                     attrs={'style': 'color: black; background-color: white;'})
-    # Select token elements and remove model specific tokens
-    current_word = None
-    for token in soup.find_all('td')[-1].find_all('mark')[1:-1]:
-        text = token.font.text.strip()
-        if text.startswith(inside_token_prefix):
-            text = text[len(inside_token_prefix):]
-        else:
-            # Create a new span for each word (sequence of sub-tokens)
-            if current_word is not None:
-                p.append(current_word)
-                p.append(' ')
-            current_word = soup.new_tag('span')
-        token.string = text
-        token.attrs['style'] = f"{token.attrs['style']}; padding: 0.2em 0em;"
-        current_word.append(token)
-
-    # Add last word
-    p.append(current_word)
-
-    # Add left and right-padding to each word
-    for span in p.find_all('span'):
-        span.find_all('mark')[0].attrs['style'] = (
-            f"{span.find_all('mark')[0].attrs['style']}; padding-left: 0.2em;")
-        span.find_all('mark')[-1].attrs['style'] = (
-            f"{span.find_all('mark')[-1].attrs['style']}; padding-right: 0.2em;")
-
-    return p
-
-
 def list_all_article_names() -> list:
     filenames = []
     for file in sorted(os.listdir('./sample-articles/')):
@@ -148,32 +77,6 @@ def fetch_dependency_specific_contents(filename: str) -> AnyStr:
     return data


-def classify_comment(comment, selected_model):
-    """Classify the given comment and augment with additional information."""
-    toxicity_pipeline, cls_explainer = load_pipeline(selected_model)
-    result = toxicity_pipeline(comment)[0]
-    result['model_name'] = selected_model
-
-    # Add explanation
-    result['word_attribution'] = cls_explainer(comment, class_name="non-toxic")
-    result['visualitsation_html'] = cls_explainer.visualize()._repr_html_()
-    result['tokens_with_background'] = format_explainer_html(
-        result['visualitsation_html'])
-
-    # Choose emoji reaction
-    label, score = result['label'], result['score']
-    if label == 'toxic' and score > 0.1:
-        emoji = random.choice(potty_mouth_emojis)
-    elif label in ['non_toxic', 'non-toxic'] and score > 0.1:
-        emoji = random.choice(regular_emojis)
-    else:
-        emoji = random.choice(undecided_emojis)
-    result.update({'text': comment, 'emoji': emoji})
-
-    # Add result to session
-    st.session_state.results.append(result)
-
-
 def display_summary(article_name: str):
     summary_content = fetch_summary_contents(article_name)
     st.session_state.summary_output = summary_content
@@ -244,6 +147,10 @@ def get_and_compare_entities(article_name: str):
         # TODO: currently substring matching but probably should do embedding method or idk?
         if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
             matched_entities.append(entity)
+        elif any(
+                np.inner(sentence_embedding_model.encode(entity), sentence_embedding_model.encode(art_entity)) > 0.9 for
+                art_entity in entities_article):
+            matched_entities.append(entity)
         else:
             unmatched_entities.append(entity)
     return matched_entities, unmatched_entities
@@ -343,26 +250,27 @@ st.title('Summarization fact checker')

 # INTRODUCTION
 st.header("Introduction")
-st.markdown("""Recent work using transformers on large text corpora has shown great
-different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization
-generate concise and accurate summaries from input document(s). There are 2 types of summarization: extractive
-abstractive. **
-summarization** may generate novel words. A good abstractive summary should cover principal
-and has to be linguistically fluent. This blogpost will focus on this more difficult task of
-generation.""")
+st.markdown("""Recent work using transformers on large text corpora has shown great success when fine-tuned on
+several different downstream NLP tasks. One such task is that of text summarization. The goal of text summarization
+is to generate concise and accurate summaries from input document(s). There are 2 types of summarization: extractive
+and abstractive. **Extractive summarization** merely copies informative fragments from the input,
+whereas **abstractive summarization** may generate novel words. A good abstractive summary should cover principal
+information in the input and has to be linguistically fluent. This blogpost will focus on this more difficult task of
+abstractive summary generation.""")

 st.markdown("""To generate summaries we will use the [PEGASUS](https://huggingface.co/google/pegasus-cnn_dailymail)
-model, producing abstractive summaries from large articles. These summaries often
-
-
-
-
+model, producing abstractive summaries from large articles. These summaries often contain sentences with different
+kinds of errors. Rather than improving the core model, we will look into possible post-processing steps to improve
+the generated summaries. By comparing contents of the summary with the source text, we come up with a factualness
+metric, indicating the trustworthiness of the generated summary. Throughout this blog, we will also explain the
+results for some methods on specific examples. These text blocks will be indicated and they change according to the
+currently selected article.""")

 # GENERATING SUMMARIES PART
 st.header("Generating summaries")
 st.markdown("Let’s start by selecting an article text for which we want to generate a summary, or you can provide "
             "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
-            "generated might not be optimal to
+            "generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")

 # TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
 selected_article = st.selectbox('Select an article or provide your own:',
@@ -374,12 +282,11 @@ article_text = st.text_area(
     height=150
 )

-st.markdown("Below you can find the generated summary for the article.
-            "
-            "
-            "
-            "
-            "we can detect some errors in summaries, and choose the best one to actually use.")
+st.markdown("Below you can find the generated summary for the article. Based on empirical research, we will discuss "
+            "two main methods that detect some common errors. We can then score different summaries, to indicate how "
+            "factual a summary is for a given article. The idea is that in production, you could generate a set of "
+            "summaries for the same article, with different parameters (or even different models). By using "
+            "post-processing error detection, we can then select the best possible summary.")
 if st.session_state.article_text:
     with st.spinner('Generating summary...'):
         # classify_comment(article_text, selected_model)
@@ -395,6 +302,8 @@ if is_valid_url(article_text):
     print("YES")
 else:
     print("NO")
+
+
 def render_svg(svg_file):
     with open(svg_file, "r") as f:
         lines = f.readlines()
@@ -408,11 +317,15 @@ def render_svg(svg_file):

 # ENTITY MATCHING PART
 st.header("Entity matching")
-st.markdown("**Named
-            "entities) in text. An entity can be a singular word or a
-            "same thing. Common entity classes are person names,
-            "to both the article and its summary, we can spot
-            "generated by the model that are not supported by
+st.markdown("The first method we will discuss is called **Named Entity Recognition** (NER). NER is the task of "
+            "identifying and categorising key information (entities) in text. An entity can be a singular word or a "
+            "series of words that consistently refers to the same thing. Common entity classes are person names, "
+            "organisations, locations and so on. By applying NER to both the article and its summary, we can spot "
+            "possible **hallucinations**. Hallucinations are words generated by the model that are not supported by "
+            "the source input. In theory all entities in the summary (such as dates, locations and so on), "
+            "should also be present in the article. Thus we can extract all entities from the summary and compare "
+            "them to the entities of the original article, spotting potential hallucinations. The more unmatched "
+            "entities we find, the lower the factualness score of the summary. ")
 with st.spinner("Calculating and matching entities..."):
     entity_match_html = highlight_entities(selected_article)
     st.write(entity_match_html, unsafe_allow_html=True)
@@ -424,31 +337,47 @@ with st.spinner("Calculating and matching entities..."):

 markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
 markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
-st.markdown("
-            "
-
+st.markdown("We call this technique “entity matching” and here you can see what this looks like when we apply "
+            "this method on the summary. Entities in the summary are marked " + green_text + " when the entity "
+            "also exists in the "
+            "article, "
+            "while unmatched "
+            "entities are "
+            "marked " +
+            red_text + ". Several of the example articles and their summaries indicate different errors we find "
+            "by using this technique. Based on which article you choose, we provide a short "
+            "explanation of the results below.",
             unsafe_allow_html=True)
 entity_specific_text = fetch_entity_specific_contents(selected_article)
-
+soup = BeautifulSoup(entity_specific_text, features="html.parser")
+HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
+margin-bottom: 2.5rem">{}</div> """
+st.write("💡👇 **Specific example explanation** 👇💡", HTML_WRAPPER.format(soup), unsafe_allow_html=True)

 # DEPENDENCY PARSING PART
 st.header("Dependency comparison")
-st.markdown("**Dependency parsing
-            "to find out related words as well as the type of the
-            "wife is called Sarah” you would get the following
+st.markdown("The second method we use for post-processing is called **Dependency parsing**: the process in which the "
+            "grammatical structure in a sentence is analysed, to find out related words as well as the type of the "
+            "relationship between them. For the sentence “Jan’s wife is called Sarah” you would get the following "
+            "dependency graph:")

 # TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
 # st.image("ExampleParsing.svg")
 st.write(render_svg('ExampleParsing.svg'), unsafe_allow_html=True)
 st.markdown("Here, “Jan” is the “poss” (possession modifier) of “wife”. If suddenly the summary would read “Jan’s "
-            "husband…", there would be a dependency in the summary that is non-existent in the article itself
-            "
-            "
-            "
-
-
-            "
-            "
+            "husband…”, there would be a dependency in the summary that is non-existent in the article itself (namely "
+            "“Jan” is the “poss” of “husband”). However, often new dependencies are introduced in the summary that "
+            "are still correct. “The borders of Ukraine” have a different dependency between “borders” and “Ukraine” "
+            "than “Ukraine’s borders”, while both descriptions have the same meaning. So just matching all "
+            "dependencies between article and summary (as we did with entity matching) would not be a robust method.")
+st.markdown("However, by empirical testing, we have found that there are certain dependencies which can be used for "
+            "such matching techniques. When unmatched, these specific dependencies are often an indication of a "
+            "wrongly constructed sentence. **Should I explain this more/better or is it enough that I explain by "
+            "example specific run throughs?**. We found 2(/3 TODO) common dependencies which, when present in the "
+            "summary but not in the article, are highly indicative of factualness errors. Furthermore, we only check "
+            "dependencies between an existing **entity** and its direct connections. Below we highlight all unmatched "
+            "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
+            "currently selected article.")
 with st.spinner("Doing dependency parsing..."):
     summary_deps = check_dependency(False)
     article_deps = check_dependency(True)
@@ -461,22 +390,22 @@ with st.spinner("Doing dependency parsing..."):
 if total_unmatched_deps:
     for current_drawing_list in total_unmatched_deps:
         render_dependency_parsing(current_drawing_list)
-
-
-soup = BeautifulSoup("Example text option with box", features="html.parser")
+dep_specific_text = fetch_dependency_specific_contents(selected_article)
+soup = BeautifulSoup(dep_specific_text, features="html.parser")
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 margin-bottom: 2.5rem">{}</div> """
-st.write(HTML_WRAPPER.format(soup), unsafe_allow_html=True)
+st.write("💡👇 **Specific example explanation** 👇💡", HTML_WRAPPER.format(soup), unsafe_allow_html=True)

 # OUTRO/CONCLUSION
 st.header("Wrapping up")
 st.markdown("We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
-            "be used to solve hallucinations, while
-            "
-            "only basic methods
-            "
+            "be used to solve hallucinations, while dependency comparison can be used to filter out some bad "
+            "sentences (and thus worse summaries). These methods highlight the possibilities of post-processing "
+            "AI-made summaries, but are only a basic introduction. As the methods were empirically tested they are "
+            "definitely not sufficiently robust for general use-cases. (something about that we tested also RE and "
+            "maybe other things).")
 st.markdown("####")
-st.markdown("
-            "
-            "
-            "
+st.markdown("Below we generated 5 different kinds of summaries from the article in which their ranks are estimated, "
+            "and hopefully the best summary (read: the one that a human would prefer or indicate as the best one) "
+            "will be at the top. TODO: implement this (at the end I think) and also put something in the text with "
+            "the actual parameters or something? ")
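The headline change in this commit is the new `elif` branch in `get_and_compare_entities`. Pulled out of the diff, the matching logic looks roughly like this; the entity lists below are hypothetical stand-ins for the NER output the app computes elsewhere:

```python
# Condensed sketch of the matching logic added above; the entity lists are
# hypothetical stand-ins for the app's NER output.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

entities_summary = ["Biden", "U.S."]
entities_article = ["President Biden", "United States"]

matched, unmatched = [], []
for entity in entities_summary:
    # 1) cheap substring match first
    if any(entity.lower() in art.lower() for art in entities_article):
        matched.append(entity)
    # 2) fall back to embedding similarity, as in the new elif branch
    elif any(np.inner(model.encode(entity), model.encode(art)) > 0.9
             for art in entities_article):
        matched.append(entity)
    else:
        unmatched.append(entity)

print(matched, unmatched)
```

Note that `np.inner` only equals cosine similarity when the embeddings are unit-length; passing `normalize_embeddings=True` to `encode` (and caching the article embeddings once instead of re-encoding them per summary entity) would make the 0.9 threshold better behaved and noticeably faster.
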
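Only the first lines of `render_svg` appear as context in the diff, and the TODO in app.py notes that `st.image` would not render the dependency SVG. A plausible completion, and this is a guess rather than the repo's actual function, returns the raw markup so `st.write` can inject it with `unsafe_allow_html=True`:

```python
# Hypothetical completion of render_svg; only its first two lines are
# visible in the diff, so the join/return is a guess, not the repo's code.
import streamlit as st

def render_svg(svg_file):
    with open(svg_file, "r") as f:
        lines = f.readlines()
    return "".join(lines)  # raw <svg> markup, rendered via unsafe_allow_html

st.write(render_svg('ExampleParsing.svg'), unsafe_allow_html=True)
```
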
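The summaries the checker consumes are shipped pre-computed in sample-summaries/ (added below); nothing in this commit calls the summarizer. For reference, generating one with the PEGASUS checkpoint named in the introduction would look roughly like this sketch:

```python
# Sketch: producing an abstractive summary with the PEGASUS checkpoint the
# blog text links to. The repo ships pre-computed outputs in sample-summaries/.
from transformers import pipeline

summarizer = pipeline("summarization", model="google/pegasus-cnn_dailymail")

with open("sample-articles/article9.txt") as f:
    article = f.read()

# truncation=True keeps long articles within the model's input limit
result = summarizer(article, truncation=True)
print(result[0]["summary_text"])
```
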
dependency-specific-text/article11.txt
CHANGED

@@ -1,4 +1,4 @@
-One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "
-
-
-
+One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "pobj" (object of preposition) dependency.
+Furthermore, we only match *pobj* dependencies when the target word is "in", as in this example.
+In this case it's obvious that "in U.S." is not found in the article, as "U.S." is a hallucinated entity itself as discussed in the entity matching paragraph.
+So technically we don't need dependency comparison to spot the error from this summary.

dependency-specific-text/article13.txt
CHANGED

@@ -1,3 +1,9 @@
 One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "amod" (adjectival modifier) dependency.
-
-
+Applied to this summary, we have "First" as the entity, and it is the adjectival modifier of the word "phone".
+And indeed, this unmatched dependency indicates an actual error here. The sentence is not factual, since the article talks about a **new** type of flagship phone,
+and not the **first** flagship phone. This error was found by filtering on this specific kind of dependency. Empirical results showed that unmatched *amod* dependencies often suggest
+that the summary sentence contains an error. <br> <br>
+Another dependency that we use is the "pobj" (object of preposition) dependency.
+Furthermore, we only match *pobj* dependencies when the target word is "in", as in this example.
+In this case the sentence itself contains a factual error (because the article states "there's no word on a US release date yet").
+However, this could have been found by entity matching already (as January 18 is unmatched), and the unmatched dependency cannot be completely blamed for this error here.

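The `check_dependency` function these texts refer to is not part of this diff. A minimal sketch of the amod / pobj-with-"in" filter described above (the spaCy model name, the triple format, and the test sentences are all assumptions, not the repo's implementation):

```python
# Sketch of the dependency filter described above: collect (child, relation,
# head) triples and keep only the two error-prone kinds. Model name, triple
# format and example sentences are assumptions, not the repo's check_dependency.
import spacy

nlp = spacy.load("en_core_web_sm")

def suspect_deps(text: str) -> set:
    deps = set()
    for tok in nlp(text):
        if tok.dep_ == "amod":
            deps.add((tok.text.lower(), "amod", tok.head.text.lower()))
        elif tok.dep_ == "pobj" and tok.head.text.lower() == "in":
            # pobj is only considered when the preposition is "in"
            deps.add((tok.text.lower(), "pobj", "in"))
    return deps

summary = "The first flagship phone launches in U.S. stores."
article = "The new flagship phone launches in stores."

# Triples present in the summary but absent from the article get flagged
print(suspect_deps(summary) - suspect_deps(article))
```
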
dependency-specific-text/article16.txt
ADDED
File without changes

dependency-specific-text/article4.txt
ADDED

@@ -0,0 +1,7 @@
+One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "amod" (adjectival modifier) dependency.
+Applied to this summary, we have "Democratic" as the entity, and it is the adjectival modifier of the word "member".
+And indeed, this unmatched dependency indicates an actual error here. The sentence is not factual for two reasons. <br> <br>
+First, the article talks about "democrats" and "members of the committee", which are two separate things. The summary combines those two in a way
+that can be seen as not completely factual. Second, the statement itself was not made by a democrat (nor a member of the committee), and even though the dependency can't be
+directly linked to this error, empirical results showed that unmatched *amod* dependencies often suggest
+that the summary sentence is incorrect.

dependency-specific-text/article9.txt
ADDED

@@ -0,0 +1,4 @@
+One of the dependencies that, when found in the summary but not in the article, indicates a possible error is the "pobj" (object of preposition) dependency.
+Furthermore, we only match *pobj* dependencies when the target word is "in", as in this example.
+The sentence here is not a factual error per se, but rather a readability issue. The "in" should be dropped to make the sentence correct.
+For better examples with this specific dependency, try choosing another article. TODO: readability issue with the dependency graph for this specific issue

dependency-specific-text/biden.txt
ADDED
File without changes

dependency-specific-text/protestors.txt
ADDED
File without changes

entity-specific-text/article11.txt
CHANGED

@@ -1,3 +1,4 @@
-
-
-
+As you can see we have 1 unmatched entity: "U.S." is a hallucinated entity in the summary, that does not exist in the article.
+Deep learning based generation is [prone to hallucinate](https://arxiv.org/pdf/2202.03629.pdf) unintended text. These hallucinations degrade
+system performance and fail to meet user expectations in many real-world scenarios. By applying entity matching, we can improve this problem
+for the downstream task of summary generation.

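For context, the entity lists these explanations compare come from a NER tagger; app.py imports Flair's `SequenceTagger` for this. A sketch of the extraction step (the `"ner"` model name is an assumption; the app may load a different tagger):

```python
# Sketch: extracting the entity list with Flair, which app.py imports.
# The "ner" model name is an assumption; the repo may pin another tagger.
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("ner")

sentence = Sentence("Novak Djokovic will play in the Australian Open.")
tagger.predict(sentence)

entities = [span.text for span in sentence.get_spans("ner")]
print(entities)  # e.g. ['Novak Djokovic', 'Australian Open']
```
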
entity-specific-text/article13.txt
CHANGED

@@ -1,2 +1,5 @@
-
-
+As you can see we have 2 unmatched entities: "January 18" and "U.S". The first one is a hallucinated entity in the summary, that does not exist in the article.
+Deep learning based generation is [prone to hallucinate](https://arxiv.org/pdf/2202.03629.pdf) unintended text. These hallucinations degrade
+system performance and fail to meet user expectations in many real-world scenarios. By applying entity matching, we can improve this problem
+for the downstream task of summary generation. U.S. **does** occur in the article, but as "US" instead of "U.S.". This could be solved
+by comparing to a list of abbreviations or with a specific embedder for abbreviations but is currently not implemented.

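The "US" vs "U.S." miss described above could be patched with a cheap normalisation pass before the substring comparison; a sketch of that idea (explicitly not implemented in the app, per the text):

```python
# Sketch of the abbreviation fix suggested above; the text notes the app
# does NOT implement this yet.
def normalize(entity: str) -> str:
    # "U.S." -> "us": strip periods so abbreviation variants compare equal
    return entity.replace(".", "").lower()

entities_article = ["US", "Samsung", "Galaxy S21"]

def in_article(entity: str) -> bool:
    return any(normalize(entity) in normalize(art) for art in entities_article)

print(in_article("U.S."))        # True once periods are stripped
print(in_article("January 18"))  # still False: a genuine hallucination
```
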
entity-specific-text/article16.txt
ADDED

@@ -0,0 +1,4 @@
+As you can see we have 1 unmatched entity: "Six9" is a hallucinated entity in the summary, that does not exist in the article.
+Deep learning based generation is [prone to hallucinate](https://arxiv.org/pdf/2202.03629.pdf) unintended text. These hallucinations degrade
+system performance and fail to meet user expectations in many real-world scenarios. By applying entity matching, we can improve this problem
+for the downstream task of summary generation.

entity-specific-text/article4.txt
ADDED
File without changes

entity-specific-text/article9.txt
ADDED
File without changes

entity-specific-text/biden.txt
ADDED
File without changes

entity-specific-text/protestors.txt
ADDED
File without changes

requirements.txt
CHANGED

@@ -2,6 +2,7 @@ beautifulsoup4==4.10.0
 streamlit==1.2.0
 transformers==4.15.0
 transformers-interpret==0.5.2
+sentence-transformers==2.2.0
 spacy==3.0.0
 spacy_streamlit==1.0.3
 flair

{sample-articles → sample-articles-temp}/biden.txt
RENAMED
File without changes

{sample-articles → sample-articles-temp}/protestors.txt
RENAMED
File without changes

sample-articles/article4.txt
ADDED

@@ -0,0 +1,55 @@
+Former White House chief of staff Mark Meadows will no longer cooperate with the House select committee investigating January 6 insurrection, according to a letter from his attorney to the panel, which was obtained by CNN on Tuesday.
+
+"We agreed to provide thousands of pages of responsive documents and Mr. Meadows was willing to appear voluntarily, not under compulsion of the Select Committee's subpoena to him, for a deposition to answer questions about non-privileged matters. Now actions by the Select Committee have made such an appearance untenable," the letter from George J. Terwilliger II stated.
+
+"In short, we now have every indication from the information supplied to us last Friday -- upon which Mr. Meadows could expect to be questioned -- that the Select Committee has no intention of respecting boundaries concerning Executive Privilege," Terwilliger added.
+
+The committee said later Tuesday that it will move forward with a scheduled deposition with Meadows on Wednesday even though he said he no longer plans to cooperate.
+
+By proceeding with the scheduled deposition, the committee is setting up a path to hold Meadows in criminal contempt.
+
+"Tomorrow's deposition, which was scheduled at Mr. Meadows's request, will go forward as planned. If indeed Mr. Meadows refuses to appear, the Select Committee will be left no choice but to advance contempt proceedings and recommend that the body in which Mr. Meadows once served refer him for criminal prosecution," Democratic Rep. Bennie Thompson of Mississippi and GOP Rep. Liz Cheney of Wyoming, who lead the committee, said in a joint statement.
+
+Thompson told CNN later Tuesday evening, "Obviously, we had hoped Mr. Meadows would continue to work with the committee. But obviously based on his lawyer's letter today and his plan to not show up for the deposition, that creates a different dynamic."
+
+"As you know, we were prepared to go with contempt earlier, but we withheld it based on what we thought was an agreement that we'd work together. That has not been the case. So obviously, we will move forward with it," he said.
+
+Although Thompson indicated criminal contempt was on the table, he made clear that the committee is weighing multiple options, including immunity, that could pave the way for it to get the information that it wants from Meadows.
+
+"I think we're interested in getting the information. I think we will still want Mr. Meadows to cooperate. So we will look at all of our options at this point," he said.
+
+Responding to the letter from Meadows' attorney, the committee made clear it needs to hear from the former White House chief of staff "about voluminous official records stored in his personal phone and email accounts, which were required to be turned over to the National Archives in accordance with the Presidential Records Act. "
+
+A source familiar with the matter told CNN that among the 6,000 pages of documents Meadows has already provided to the committee are communications from January 6. It is still unclear who communicated that day with Meadows but the source said that "many people had Meadows' cell phone."
+
+Democratic Rep. Pete Aguilar of California, who serves on the panel, told CNN that within the documents Meadows turned over is evidence that he was in communication with individuals involved in the planning of the rally on January 6 that preceded the riot.
+
+"What I'll share is that we continue to learn and we continue to connect the dots," Aguilar said. "But individuals that were responsible for the planning of January 6 in the rally, Mr. Meadows was in communication with, and those are in the documents ... that he turned over himself."
+
+Aguilar added that some of the records Meadows turned over, including text messages, were from his personal device.
+
+Rep. Zoe Lofgren, a California Democrat and member of the committee, said on CNN's "The Lead with Jake Tapper" that the records including "volumes of material, including real time communication as the riot unfolded. Lofgren said the messages were shared "without an assertion of privilege," and criticized Meadows for then reversing his cooperation.
+
+"The committee wants to ask him about some of that, and it's really untenable that all of a sudden at the last minute he's saying no. That somehow there's some reason why he can't talk about this," Lofgren said.
+
+CNN first reported last week that Meadows had begun cooperating with the committee, handing over thousands of documents and agreeing to appear for an interview this week.
+
+Meadows' about-face is due in part to learning over the weekend that the committee had "issued wide ranging subpoenas for information from a third party communications provider," the letter notes.
+
+"As a result of careful and deliberate consideration of these factors, we now must decline the opportunity to appear voluntarily for a deposition," Terwilliger writes.
+
+Terwilliger writes that Meadows would answer written questions "so that there might be both an orderly process and a clear record of questions and related assertions of privilege where appropriate."
+
+Responding to Meadows' claim that the committee was ignoring his claims of executive privilege, Thompson and Cheney state that Meadows was willing to discuss details about Trump in his new book.
+
+"Mark Meadows has informed the Select Committee that he does not intend to cooperate further with our investigation despite his apparent willingness to provide details about the facts and circumstances surrounding the January 6th attack, including conversations with President Trump, in the book he is now promoting and selling," they write.
+
+The pair add that they have "numerous questions" for Meadows that have nothing to do with executive privilege.
+
+Rep. Stephanie Murphy, a Florida Democrat who's also a member of the select committee, said Tuesday evening that while the panel will continue to do all it can to compel the testimony of witnesses like Meadows and Steve Bannon, members believe they'll be able to get the information they're looking for without their help.
+
+"To be fair, it's only a very handful of people who want to risk jail time and fines for contempt of Congress who are obstructing our process," Murphy said.
+
+"The vast majority of the people that we have reached out to are providing us with information, with evidence, with text messages, with emails, with details of conversations that they have been a party to. So these people are well within their right to not cooperate, but it's not as if we're not going to get to the information we need."
+
+This story has been updated with additional developments Tuesday.

sample-articles/article9.txt
ADDED

@@ -0,0 +1,48 @@
+Novak Djokovic has been granted permission to defend his Australian Open title
+
+Australians have reacted angrily to news that tennis player Novak Djokovic will play in the Australian Open, after being exempted from vaccination rules.
+
+All players and staff at the tournament must be vaccinated or have an exemption granted by an expert independent panel.
+
+Djokovic has not spoken about his vaccination status, but last year said he was "opposed to vaccination".
+
+Organisers say he has not been given special treatment. But Australians have criticised officials and the player.
+
+The controversy comes as the country is seeing tens of thousands of Covid-19 cases for the first time after enduring some of the world's strictest restrictions.
+
+Over 90% of Australia's over-16 population is fully vaccinated, but some Australians still cannot travel interstate or globally because of current restrictions.
+
+Amid the row, Australian Prime Minister Scott Morrison said Djokovic would be required to present evidence upon arrival that he has a genuine medical exemption from vaccination, or he would be "on the next plane home".
+
+"If that evidence is insufficient, then he won't be treated any different to anyone else and he'll be on the next plane home," the prime minister told reporters. "There should be no special rules for Novak Djokovic at all. None whatsoever."
+
+Many Australians had previously accused the government of allowing the rich and famous to do as they please while ordinary people remained separated from sick and dying loved ones.
+
+"I think it's a disgrace," Christine Wharton, who lives in Melbourne, where the Australian Open will be held, told ABC.
+
+"We've all done the right thing, we've all gone out and got our jabs and our boosters and we have someone that has come from overseas and all of a sudden he's been exempt and can play and I think it's an absolute disgrace and I won't be watching it."
+
+A&E doctor Stephen Parnis tweeted: "I don't care how good a tennis player he is. If he's refusing to get vaccinated, he shouldn't be allowed in. "If this exemption is true, it sends an appalling message to millions seeking to reduce #COVID19Aus risk to themselves & others."
+
+The decision raised eyebrows with some other tennis players too. "I just think it's very interesting. That's all I'm going to say," Australian Alex de Minaur said.
+
+Britain's Jamie Murray added: "I think if it was me that wasn't vaccinated I wouldn't be getting an exemption. You know, but well done to him for getting clear to come to Australia and compete."
+
+The Australian Open begins on 17 January, and the event's chief executive Craig Tiley said 26 athletes had applied for medical exemptions. "A handful" had been granted, he said, under guidelines set by federal regulators.
+
+"We made it extra difficult for anyone applying for an application to ensure it was the right process and to make sure the medical experts deal with it independently," he told Channel 9.
+
+Applications for medical exemptions are being assessed anonymously by two separate panels, with inflammatory cardiac illness or another acute condition listed as valid reasons.
+
+But it is also possible Djokovic has recently tested positive for the virus, which would allow him to defer taking the vaccine.
+
+He has not revealed his vaccination status and said last April: "Personally I am opposed to vaccination and I wouldn't want to be forced by someone to take a vaccine in order to be able to travel."
+
+On Tuesday he said on Instagram: "I've spent fantastic quality time with my loved ones over the break and today I'm heading down under with an exemption permission. Let's go 2022. I am ready to live and breathe tennis in the next few weeks of competition."
+
+Media caption,
+"Heartless" Queensland bars US couple from seeing dying father
+
+Victoria state government minister Jaala Pulford acknowledged the decision was "frustrating and upsetting", but also denied that Djokovic had received special treatment. Both she and Mr Tiley urged Djokovic to give more information to the public.
+
+"It'll certainly be helpful if Novak was to explain the conditions in which he's sought an exemption and granted an exemption but ultimately it's up to him," Mr Tiley said.

sample-summaries/article4.txt
ADDED

@@ -0,0 +1 @@
+Former White House chief of staff Mark Meadows will no longer cooperate with the House select committee. The committee says it will move forward with a scheduled deposition with Meadows on Wednesday. By proceeding with the scheduled deposition, the committee is setting up a path to hold Meadows in criminal contempt. A source familiar with the matter told CNN that among the 6,000 pages of documents Meadows has already provided to the committee are communications from January 6. A Democratic member of the committee said Meadows' about-face is due in part to learning over the weekend that the committee had "issued wide ranging subpoenas for information from a third party communications provider".

ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Novak Djokovic has been granted permission to play in the Australian Open. All players and staff at the tournament must be vaccinated or have an exemption.. Djokovic has not spoken about his vaccination status, but last year said he was "opposed to vaccination" Australian Prime Minister Scott Morrison says there should be no special rules for Djokovic, but adds he would be "on the next plane home" if he did not have the right evidence. in Australia is seeing tens of thousands of Covid-19 cases for the first time after enduring some of the world's strictest restrictions.
|
sample-summaries/biden.txt
ADDED
File without changes