Update app.py
app.py CHANGED
@@ -198,6 +198,7 @@ def get_all_entities(text):
 
 def get_and_compare_entities(article_content,summary_output):
 
+
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
     entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
@@ -260,51 +261,6 @@ def highlight_entities(article_content,summary_output):
     return HTML_WRAPPER.format(soup)
 
 
-def render_dependency_parsing(text: dict):
-    html = render_sentence_custom(text, nlp)
-    html = html.replace("\n\n", "\n")
-    st.write(get_svg(html), unsafe_allow_html=True)
-
-
-def check_dependency(article: bool):
-    if article:
-        text = st.session_state.article_text
-        all_entities = get_all_entities_per_sentence(text)
-    else:
-        text = st.session_state.summary_output
-        all_entities = get_all_entities_per_sentence(text)
-    doc = nlp(text)
-    tok_l = doc.to_json()['tokens']
-    test_list_dict_output = []
-
-    sentences = list(doc.sents)
-    for i, sentence in enumerate(sentences):
-        start_id = sentence.start
-        end_id = sentence.end
-        for t in tok_l:
-            if t["id"] < start_id or t["id"] > end_id:
-                continue
-            head = tok_l[t['head']]
-            if t['dep'] == 'amod' or t['dep'] == "pobj":
-                object_here = text[t['start']:t['end']]
-                object_target = text[head['start']:head['end']]
-                if t['dep'] == "pobj" and str.lower(object_target) != "in":
-                    continue
-                # ONE NEEDS TO BE ENTITY
-                if object_here in all_entities[i]:
-                    identifier = object_here + t['dep'] + object_target
-                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
-                                                  "target_word_index": (t['head'] - sentence.start),
-                                                  "identifier": identifier, "sentence": str(sentence)})
-                elif object_target in all_entities[i]:
-                    identifier = object_here + t['dep'] + object_target
-                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
-                                                  "target_word_index": (t['head'] - sentence.start),
-                                                  "identifier": identifier, "sentence": str(sentence)})
-                else:
-                    continue
-    return test_list_dict_output
-
 
 def render_svg(svg_file):
     with open(svg_file, "r") as f:
@@ -378,6 +334,12 @@ def schleifer_model():
 
     summarizer = pipeline('summarization',model='sshleifer/distilbart-cnn-12-6')
     return summarizer
+
+@st.experimental_singleton(suppress_st_warning=True)
+def google_model():
+
+    summarizer = pipeline('summarization',model='google/pegasus-cnn_dailymail')
+    return summarizer
 
 @st.experimental_singleton(suppress_st_warning=True)
 def get_sentence_embedding_model():
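For reference, a minimal standalone sketch of how the newly added Pegasus summarizer can be exercised outside Streamlit. It assumes only that the transformers package is installed; the checkpoint name and the max_length/min_length call pattern come from the diff above, while the sample text and length values are purely illustrative.

from transformers import pipeline

# Same checkpoint the new google_model() helper loads in app.py.
summarizer = pipeline('summarization', model='google/pegasus-cnn_dailymail')

# Illustrative input; any article-length string works.
sample_article = (
    "The city council approved a new transit plan on Tuesday, adding three bus "
    "routes and extending service hours after months of public consultation."
)

# Mirrors the call in app.py: max_length/min_length bound the generated summary.
output = summarizer(sample_article, max_length=80, min_length=10)
print(' '.join(item['summary_text'] for item in output))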
@@ -399,7 +361,7 @@ nlp = get_spacy()
 st.title("Article Text and Link Extractive Summarizer π")
 
 model_type = st.sidebar.selectbox(
-    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart"]
+    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart","Google-Pegasus"]
 )
 
 max_len= st.sidebar.slider("Maximum length of the summarized text",min_value=80,max_value=500,step=10)
@@ -416,7 +378,8 @@ st.markdown(
 
 st.markdown("""
 - Facebook-Bart, trained on large [CNN and Daily Mail](https://huggingface.co/datasets/cnn_dailymail) news articles.
-- Sshleifer-Distilbart, which is a distilled (smaller) version of the large Bart model."""
+- Sshleifer-Distilbart, which is a distilled (smaller) version of the large Bart model.
+- Google Pegasus"""
 )
 
 st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")
@@ -499,10 +462,23 @@ if summarize:
             summarizer_model = schleifer_model()
             summarized_text = summarizer_model(text_to_summarize, max_length=max_len, min_length=min_len)
             summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
+
+    elif model_type == "Google-Pegasus":
+        if url_text:
+            text_to_summarize = cleaned_text
+        else:
+            text_to_summarize = cleaned_text
+
+        with st.spinner(
+                text="Loading Sshleifer-DistilBart Model and Extracting summary. This might take a few seconds depending on the length of your text..."
+        ):
+            summarizer_model = google_model()
+            summarized_text = summarizer_model(text_to_summarize, max_length=max_len, min_length=min_len)
+            summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
 
     with st.spinner("Calculating and matching entities, this takes a few seconds..."):
 
-        entity_match_html = highlight_entities(cleaned_text
+        entity_match_html = highlight_entities(cleaned_text,summarized_text)
         st.subheader("Summarized text with matched entities in Green and mismatched entities in Red relative to the original text")
         st.markdown("####")
 
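The three model branches inside if summarize: differ only in which cached pipeline they load before the shared summarize-and-join step. Below is a hedged sketch of that shared shape; the checkpoint mapping is an illustration (the committed code keeps explicit if/elif branches and loader functions), and the Bart checkpoint name is an assumption not shown in this diff, while the other two checkpoints appear above.

from transformers import pipeline

# Hypothetical mapping from the sidebar labels to checkpoints; app.py itself uses
# explicit if/elif branches with cached loader functions (schleifer_model, google_model).
CHECKPOINTS = {
    "Facebook-Bart": "facebook/bart-large-cnn",            # assumed checkpoint, not shown in this diff
    "Sshleifer-DistilBart": "sshleifer/distilbart-cnn-12-6",  # from the diff
    "Google-Pegasus": "google/pegasus-cnn_dailymail",       # added in this commit
}

def summarize_with(model_type, text_to_summarize, max_len, min_len):
    # Load the selected pipeline, then join its summary chunks exactly as app.py does.
    summarizer_model = pipeline('summarization', model=CHECKPOINTS[model_type])
    output = summarizer_model(text_to_summarize, max_length=max_len, min_length=min_len)
    return ' '.join(summ['summary_text'] for summ in output)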