Delete non-used code
- .idea/HFSummSpace.iml +17 -0
- .idea/inspectionProfiles/Project_Default.xml +12 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +107 -0
- __pycache__/custom_renderer.cpython-37.pyc +0 -0
- app.py +43 -71
- custom_renderer.py +0 -2
.idea/HFSummSpace.iml
ADDED
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="py.test" />
+  </component>
+</module>
.idea/inspectionProfiles/Project_Default.xml
ADDED
@@ -0,0 +1,12 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (HFSummSpace)" project-jdk-type="Python SDK" />
+</project>
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/HFSummSpace.iml" filepath="$PROJECT_DIR$/.idea/HFSummSpace.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
.idea/workspace.xml
ADDED
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/custom_renderer.py" beforeDir="false" afterPath="$PROJECT_DIR$/custom_renderer.py" afterDir="false" />
+    </list>
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FileTemplateManagerImpl">
+    <option name="RECENT_TEMPLATES">
+      <list>
+        <option value="Python Script" />
+      </list>
+    </option>
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="GitSEFilterConfiguration">
+    <file-type-list>
+      <filtered-out-file-type name="LOCAL_BRANCH" />
+      <filtered-out-file-type name="REMOTE_BRANCH" />
+      <filtered-out-file-type name="TAG" />
+      <filtered-out-file-type name="COMMIT_BY_MESSAGE" />
+    </file-type-list>
+  </component>
+  <component name="HighlightingSettingsPerFile">
+    <setting file="file://$PROJECT_DIR$/venv/lib/python3.7/site-packages/flair/models/sequence_tagger_model.py" root0="SKIP_INSPECTION" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectId" id="27jdqgqsSB1v523dZaR7czhkX4c" />
+  <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "last_opened_file_path": "/home/matthias/Documents/Summarization-fact-checker/HugginfaceSpace/HFSummSpace",
+    "settings.editor.selected.configurable": "editor.preferences.fonts.default"
+  }
+}]]></component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$" />
+    </key>
+    <key name="MoveFile.RECENT_KEYS">
+      <recent name="$PROJECT_DIR$/sample-articles-temp" />
+    </key>
+  </component>
+  <component name="RunManager">
+    <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
+      <module name="HFSummSpace" />
+      <option name="INTERPRETER_OPTIONS" value="" />
+      <option name="PARENT_ENVS" value="true" />
+      <envs>
+        <env name="PYTHONUNBUFFERED" value="1" />
+      </envs>
+      <option name="SDK_HOME" value="" />
+      <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+      <option name="IS_MODULE_SDK" value="true" />
+      <option name="ADD_CONTENT_ROOTS" value="true" />
+      <option name="ADD_SOURCE_ROOTS" value="true" />
+      <option name="SCRIPT_NAME" value="$PROJECT_DIR$/app.py" />
+      <option name="PARAMETERS" value="" />
+      <option name="SHOW_COMMAND_LINE" value="false" />
+      <option name="EMULATE_TERMINAL" value="false" />
+      <option name="MODULE_MODE" value="false" />
+      <option name="REDIRECT_INPUT" value="false" />
+      <option name="INPUT_FILE" value="" />
+      <method v="2" />
+    </configuration>
+    <recent_temporary>
+      <list>
+        <item itemvalue="Python.app" />
+      </list>
+    </recent_temporary>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="57f23431-346d-451d-8d77-db859508e831" name="Changes" comment="" />
+      <created>1649837622575</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1649837622575</updated>
+    </task>
+    <servers />
+  </component>
+  <component name="Vcs.Log.Tabs.Properties">
+    <option name="TAB_STATES">
+      <map>
+        <entry key="MAIN">
+          <value>
+            <State />
+          </value>
+        </entry>
+      </map>
+    </option>
+  </component>
+</project>
__pycache__/custom_renderer.cpython-37.pyc
CHANGED
Binary files a/__pycache__/custom_renderer.cpython-37.pyc and b/__pycache__/custom_renderer.cpython-37.pyc differ
app.py
CHANGED
@@ -1,10 +1,6 @@
-import
-from typing import AnyStr, List, Dict
-# import tensorflow_hub as hub
+from typing import AnyStr, Dict
 
 import itertools
-
-#import en_core_web_sm
 import streamlit as st
 import en_core_web_lg
 
@@ -13,25 +9,15 @@ from bs4 import BeautifulSoup
 import numpy as np
 import base64
 
-import validators
 from spacy_streamlit.util import get_svg
-from validators import ValidationFailure
 
 from custom_renderer import render_sentence_custom
-# from flair.data import Sentence
-# from flair.models import SequenceTagger
 from sentence_transformers import SentenceTransformer
 
-import
-from spacy import displacy
-from spacy_streamlit import visualize_parser
-
-from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
+from transformers import AutoTokenizer, AutoModelForTokenClassification
 from transformers import pipeline
 import os
-from transformers_interpret import SequenceClassificationExplainer
 
-# USE_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
@@ -42,19 +28,10 @@ def get_sentence_embedding_model():
 
 @st.experimental_singleton
 def get_spacy():
-    # nlp = spacy.load('en_core_web_lg')
     nlp = en_core_web_lg.load()
     return nlp
 
 
-# TODO: might look into which one is the best here
-# TODO: might be useful to make an ml6 preloaded model for flair as this takes ridiculously long to load the first time
-# @st.experimental_singleton
-# @st.cache(suppress_st_warning=True, allow_output_mutation=True)
-# def get_flair_tagger():
-#     return SequenceTagger.load("flair/ner-english-ontonotes-fast")
-
-
 @st.experimental_singleton
 def get_transformer_pipeline():
     tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
@@ -96,7 +73,7 @@ def list_all_article_names() -> list:
 
 
 def fetch_article_contents(filename: str) -> AnyStr:
-    if
+    if filename == "Provide your own input":
         return " "
     with open(f'./sample-articles/{filename.lower()}.txt', 'r') as f:
         data = f.read()
@@ -174,13 +151,13 @@ def get_all_entities(text):
 
 # TODO: this functionality can be cached (e.g. by storing html file output) if wanted (or just store list of entities idk)
 def get_and_compare_entities():
-    #article_content = fetch_article_contents(article_name)
+    # article_content = fetch_article_contents(article_name)
     article_content = st.session_state.article_text
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
     # st.session_state.entities_per_sentence_article = all_entities_per_sentence
    entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
-    #summary_content = fetch_summary_contents(article_name)
+    # summary_content = fetch_summary_contents(article_name)
     summary_content = st.session_state.summary_output
     all_entities_per_sentence = get_all_entities_per_sentence(summary_content)
     # st.session_state.entities_per_sentence_summary = all_entities_per_sentence
@@ -193,7 +170,8 @@ def get_and_compare_entities():
         if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
             matched_entities.append(entity)
         elif any(
-                np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
+                np.inner(sentence_embedding_model.encode(entity, show_progress_bar=False),
+                         sentence_embedding_model.encode(art_entity, show_progress_bar=False)) > 0.9 for
                 art_entity in entities_article):
             matched_entities.append(entity)
         else:
@@ -202,7 +180,7 @@ def get_and_compare_entities():
 
 
 def highlight_entities():
-    #summary_content = fetch_summary_contents(article_name)
+    # summary_content = fetch_summary_contents(article_name)
     summary_content = st.session_state.summary_output
     markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
     markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
@@ -277,13 +255,6 @@ def check_dependency(article: bool):
     # return all_deps
 
 
-def is_valid_url(url: str) -> bool:
-    result = validators.url(url)
-    if isinstance(result, ValidationFailure):
-        return False
-    return True
-
-
 def render_svg(svg_file):
     with open(svg_file, "r") as f:
         lines = f.readlines()
@@ -296,7 +267,6 @@ def render_svg(svg_file):
 
 
 def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs):
-    summarization_model = get_summarizer_model()
     text = text.strip().replace("\n", " ")
     if type == "top_p":
         text = summarization_model(text, min_length=min_len,
@@ -316,10 +286,6 @@ def generate_abstractive_summary(text, type, min_len=120, max_len=512, **kwargs)
     return summary
 
 
-# Start session
-if 'results' not in st.session_state:
-    st.session_state.results = []
-
 # Page
 st.title('Summarization fact checker')
 
@@ -341,11 +307,11 @@ metric, indicating the trustworthiness of the generated summary. Throughout this
 results for some methods on specific examples. These text blocks will be indicated and they change according to the
 currently selected article.""")
 
+# Load all different models (cached) at start time of the hugginface space
 sentence_embedding_model = get_sentence_embedding_model()
-# tagger = get_flair_tagger()
 ner_model = get_transformer_pipeline()
 nlp = get_spacy()
-
+summarization_model = get_summarizer_model()
 
 # GENERATING SUMMARIES PART
 st.header("Generating summaries")
@@ -353,7 +319,6 @@ st.markdown("Let’s start by selecting an article text for which we want to gen
            "text yourself. Note that it’s suggested to provide a sufficiently large text, as otherwise the summary "
            "generated from it might not be optimal, leading to suboptimal performance of the post-processing steps.")
 
-# TODO: NEED TO CHECK ARTICLE TEXT INSTEAD OF ARTICLE NAME ALSO FREE INPUT OPTION
 selected_article = st.selectbox('Select an article or provide your own:',
                                list_all_article_names())  # index=0, format_func=special_internal_function, key=None, help=None, on_change=None, args=None, kwargs=None, *, disabled=False)
 st.session_state.article_text = fetch_article_contents(selected_article)
@@ -363,23 +328,27 @@ article_text = st.text_area(
     height=150
 )
 
-summarize_button = st.button(label='Process article content',
+summarize_button = st.button(label='Process article content',
+                             help="Generates summary and applies entity matching and dependency parsing for given article")
 
 if summarize_button:
     st.session_state.article_text = article_text
-    st.markdown(
-
-
-
-
+    st.markdown(
+        "Below you can find the generated summary for the article. Based on empirical research, we will discuss "
+        "two main methods that detect some common errors. We can then score different summaries, to indicate how "
+        "factual a summary is for a given article. The idea is that in production, you could generate a set of "
+        "summaries for the same article, with different parameters (or even different models). By using "
+        "post-processing error detection, we can then select the best possible summary.")
     if st.session_state.article_text:
         with st.spinner('Generating summary...'):
            # classify_comment(article_text, selected_model)
-            if selected_article != "Provide your own input" and article_text == fetch_article_contents(
+            if selected_article != "Provide your own input" and article_text == fetch_article_contents(
+                    selected_article):
                 st.session_state.unchanged_text = True
                 summary_content = fetch_summary_contents(selected_article)
             else:
-                summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15,
+                summary_content = generate_abstractive_summary(article_text, type="beam", do_sample=True, num_beams=15,
+                                                               no_repeat_ngram_size=4)
                 st.session_state.unchanged_text = False
             summary_displayed = display_summary(summary_content)
             st.write("**Generated summary:**", summary_displayed, unsafe_allow_html=True)
@@ -428,10 +397,11 @@ if summarize_button:
 
     # DEPENDENCY PARSING PART
     st.header("Dependency comparison")
-    st.markdown(
-
-
-
+    st.markdown(
+        "The second method we use for post-processing is called **Dependency parsing**: the process in which the "
+        "grammatical structure in a sentence is analysed, to find out related words as well as the type of the "
+        "relationship between them. For the sentence “Jan’s wife is called Sarah” you would get the following "
+        "dependency graph:")
 
     # TODO: I wonder why the first doesn't work but the second does (it doesn't show deps otherwise)
    # st.image("ExampleParsing.svg")
@@ -442,14 +412,15 @@ if summarize_button:
        "are still correct. “The borders of Ukraine” have a different dependency between “borders” and “Ukraine” "
        "than “Ukraine’s borders”, while both descriptions have the same meaning. So just matching all "
        "dependencies between article and summary (as we did with entity matching) would not be a robust method.")
-    st.markdown(
-
-
-
-
-
-
-
+    st.markdown(
+        "However, by empirical testing, we have found that there are certain dependencies which can be used for "
+        "such matching techniques. When unmatched, these specific dependencies are often an indication of a "
+        "wrongly constructed sentence. **Should I explain this more/better or is it enough that I explain by "
+        "example specific run throughs?**. We found 2(/3 TODO) common dependencies which, when present in the "
+        "summary but not in the article, are highly indicative of factualness errors. Furthermore, we only check "
+        "dependencies between an existing **entity** and its direct connections. Below we highlight all unmatched "
+        "dependencies that satisfy the discussed constraints. We also discuss the specific results for the "
+        "currently selected article.")
     with st.spinner("Doing dependency parsing..."):
        # TODO RIGHT IF FUNCTION (IF EXAMPLE AND IF INPUT UNCHANGED)
        # if selected_article == 'article11':
@@ -474,12 +445,13 @@ if summarize_button:
 
     # OUTRO/CONCLUSION
     st.header("Wrapping up")
-    st.markdown(
-
-
-
-
-
+    st.markdown(
+        "We have presented 2 methods that try to improve summaries via post-processing steps. Entity matching can "
+        "be used to solve hallucinations, while dependency comparison can be used to filter out some bad "
+        "sentences (and thus worse summaries). These methods highlight the possibilities of post-processing "
+        "AI-made summaries, but are only a basic introduction. As the methods were empirically tested they are "
+        "definitely not sufficiently robust for general use-cases. (something about that we tested also RE and "
+        "maybe other things).")
    st.markdown("####")
    st.markdown("Below we generated 5 different kind of summaries from the article in which their ranks are estimated, "
                "and hopefully the best summary (read: the one that a human would prefer or indicate as the best one) "
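Note on model loading: every loader this commit keeps (get_spacy, get_transformer_pipeline, and the restored summarizer) is wrapped in @st.experimental_singleton, which memoizes the return value so each model is loaded once per process rather than on every Streamlit rerun. A minimal sketch of the pattern; the SentenceTransformer checkpoint below is a placeholder, since the diff does not show which model get_sentence_embedding_model() actually loads:

    import streamlit as st
    from sentence_transformers import SentenceTransformer

    @st.experimental_singleton
    def get_sentence_embedding_model():
        # Runs once per process; later calls return the cached object.
        return SentenceTransformer("all-MiniLM-L6-v2")  # placeholder checkpoint

    model = get_sentence_embedding_model()  # cheap after the first call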
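Note on entity matching: the completed elif in get_and_compare_entities encodes both entities with the sentence-embedding model and treats them as matching when the inner product exceeds 0.9, after a cheaper case-insensitive substring check. A self-contained sketch of that logic, assuming sentence_embedding_model is a sentence-transformers model (the checkpoint is again a placeholder):

    import numpy as np
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder checkpoint

    def is_matched(entity, entities_article, threshold=0.9):
        # Cheap lexical check first, as in the diff.
        if any(entity.lower() in art.lower() for art in entities_article):
            return True
        # Otherwise compare embeddings via inner product, as in the diff.
        emb = model.encode(entity, show_progress_bar=False)
        return any(
            np.inner(emb, model.encode(art, show_progress_bar=False)) > threshold
            for art in entities_article
        )

One caveat: np.inner is only a cosine similarity if the model returns normalized embeddings, so the fixed 0.9 threshold is scale-dependent.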
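Note on summary generation: the restored generate_abstractive_summary call pins down the decoding settings, beam search with num_beams=15, do_sample=True, and no_repeat_ngram_size=4, on top of the function defaults min_len=120 and max_len=512. Roughly equivalent standalone usage with a transformers summarization pipeline; the checkpoint is a placeholder because the diff does not show what get_summarizer_model() returns:

    from transformers import pipeline

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")  # placeholder

    text = "LONG ARTICLE TEXT HERE".strip().replace("\n", " ")
    summary = summarizer(
        text,
        min_length=120,
        max_length=512,
        num_beams=15,            # beam search width used in the diff
        do_sample=True,
        no_repeat_ngram_size=4,  # never repeat any 4-gram in the output
    )[0]["summary_text"]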
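Note on the Streamlit rerun model: the script re-executes top to bottom on every interaction, which is why the button block writes the text area contents into st.session_state before doing any work, and records unchanged_text so later sections know whether the precomputed example summary is still valid. The pattern in isolation (the text_area label is a placeholder, as it is truncated in the diff):

    import streamlit as st

    article_text = st.text_area("Article content", height=150)  # placeholder label
    summarize_button = st.button(
        label='Process article content',
        help="Generates summary and applies entity matching and dependency parsing for given article")

    if summarize_button:
        # Persist the input so later sections can read it after the next rerun.
        st.session_state.article_text = article_text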
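Note on dependency comparison: the added text describes flagging dependency arcs that involve a named entity and appear in the summary but not in the article. The exact dependency labels the app whitelists are not shown in this diff, so the following spaCy sketch is only an illustration of the general mechanism:

    import spacy

    nlp = spacy.load("en_core_web_lg")

    def entity_dependencies(text):
        # (head lemma, dependency label, child lemma) triples where either
        # token is part of a named entity.
        doc = nlp(text)
        ent_tokens = {tok.i for ent in doc.ents for tok in ent}
        return {
            (tok.head.lemma_, tok.dep_, tok.lemma_)
            for tok in doc
            if tok.i in ent_tokens or tok.head.i in ent_tokens
        }

    article = "Sarah is married to Jan."
    summary = "Jan's wife is called Sarah."  # example sentence from the app text
    # Triples in the summary that never occur in the article are candidate
    # factuality errors.
    unmatched = entity_dependencies(summary) - entity_dependencies(article)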
custom_renderer.py
CHANGED
@@ -1,6 +1,4 @@
 from typing import Dict
-
-import spacy
 from PIL import ImageFont
 
 