nickmuchi committed
Commit 168a75c • 1 Parent(s): 972218d

Update app.py

Files changed (1)
  1. app.py +24 -48
app.py CHANGED
@@ -198,6 +198,7 @@ def get_all_entities(text):
 
 def get_and_compare_entities(article_content,summary_output):
 
+
     all_entities_per_sentence = get_all_entities_per_sentence(article_content)
     entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))
 
@@ -260,51 +261,6 @@ def highlight_entities(article_content,summary_output):
     return HTML_WRAPPER.format(soup)
 
 
-def render_dependency_parsing(text: dict):
-    html = render_sentence_custom(text, nlp)
-    html = html.replace("\n\n", "\n")
-    st.write(get_svg(html), unsafe_allow_html=True)
-
-
-def check_dependency(article: bool):
-    if article:
-        text = st.session_state.article_text
-        all_entities = get_all_entities_per_sentence(text)
-    else:
-        text = st.session_state.summary_output
-        all_entities = get_all_entities_per_sentence(text)
-    doc = nlp(text)
-    tok_l = doc.to_json()['tokens']
-    test_list_dict_output = []
-
-    sentences = list(doc.sents)
-    for i, sentence in enumerate(sentences):
-        start_id = sentence.start
-        end_id = sentence.end
-        for t in tok_l:
-            if t["id"] < start_id or t["id"] > end_id:
-                continue
-            head = tok_l[t['head']]
-            if t['dep'] == 'amod' or t['dep'] == "pobj":
-                object_here = text[t['start']:t['end']]
-                object_target = text[head['start']:head['end']]
-                if t['dep'] == "pobj" and str.lower(object_target) != "in":
-                    continue
-                # ONE NEEDS TO BE ENTITY
-                if object_here in all_entities[i]:
-                    identifier = object_here + t['dep'] + object_target
-                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
-                                                  "target_word_index": (t['head'] - sentence.start),
-                                                  "identifier": identifier, "sentence": str(sentence)})
-                elif object_target in all_entities[i]:
-                    identifier = object_here + t['dep'] + object_target
-                    test_list_dict_output.append({"dep": t['dep'], "cur_word_index": (t['id'] - sentence.start),
-                                                  "target_word_index": (t['head'] - sentence.start),
-                                                  "identifier": identifier, "sentence": str(sentence)})
-                else:
-                    continue
-    return test_list_dict_output
-
 
 def render_svg(svg_file):
     with open(svg_file, "r") as f:
@@ -378,6 +334,12 @@ def schleifer_model():
 
     summarizer = pipeline('summarization',model='sshleifer/distilbart-cnn-12-6')
     return summarizer
+
+@st.experimental_singleton(suppress_st_warning=True)
+def google_model():
+
+    summarizer = pipeline('summarization',model='google/pegasus-cnn_dailymail')
+    return summarizer
 
 @st.experimental_singleton(suppress_st_warning=True)
 def get_sentence_embedding_model():
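For reference, the new google_model() loader reuses the app's existing caching pattern: st.experimental_singleton runs the decorated function once per process and returns the same object on every Streamlit rerun, so the Pegasus pipeline is not rebuilt on each interaction. A toy sketch of that caching behavior (the function name and body here are illustrative, not from the commit):

import streamlit as st

@st.experimental_singleton(suppress_st_warning=True)
def load_resource():
    # Stand-in for a heavy pipeline(...) call: the body executes only on the
    # first call; later calls and script reruns receive the cached object.
    return {"model": "placeholder"}

first = load_resource()
second = load_resource()
assert first is second  # the singleton returns the same cached instance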
@@ -399,7 +361,7 @@ nlp = get_spacy()
 st.title("Article Text and Link Extractive Summarizer 📝")
 
 model_type = st.sidebar.selectbox(
-    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart"]
+    "Model type", options=["Facebook-Bart", "Sshleifer-DistilBart","Google-Pegasus"]
 )
 
 max_len= st.sidebar.slider("Maximum length of the summarized text",min_value=80,max_value=500,step=10)
@@ -416,7 +378,8 @@ st.markdown(
 
 st.markdown("""
 - Facebook-Bart, trained on large [CNN and Daily Mail](https://huggingface.co/datasets/cnn_dailymail) news articles.
-- Sshleifer-Distilbart, which is a distilled (smaller) version of the large Bart model."""
+- Sshleifer-Distilbart, which is a distilled (smaller) version of the large Bart model.
+- Google Pegasus"""
 )
 
 st.markdown("""Please do note that the model will take longer to generate summaries for documents that are too long.""")
@@ -499,10 +462,23 @@ if summarize:
             summarizer_model = schleifer_model()
             summarized_text = summarizer_model(text_to_summarize, max_length=max_len, min_length=min_len)
             summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
+
+    elif model_type == "Google-Pegasus":
+        if url_text:
+            text_to_summarize = cleaned_text
+        else:
+            text_to_summarize = cleaned_text
+
+        with st.spinner(
+            text="Loading Google-Pegasus Model and Extracting summary. This might take a few seconds depending on the length of your text..."
+        ):
+            summarizer_model = google_model()
+            summarized_text = summarizer_model(text_to_summarize, max_length=max_len, min_length=min_len)
+            summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])
 
     with st.spinner("Calculating and matching entities, this takes a few seconds..."):
 
-        entity_match_html = highlight_entities(cleaned_text[0],summarized_text)
+        entity_match_html = highlight_entities(cleaned_text,summarized_text)
         st.subheader("Summarized text with matched entities in Green and mismatched entities in Red relative to the original text")
         st.markdown("####")
 
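Taken together, the Google-Pegasus branch mirrors the existing Bart and DistilBart paths: load the cached pipeline, summarize with the user-selected length bounds, and join the returned chunks into one string. A minimal standalone sketch of that flow (the model id and call shape come from the diff; the sample text and length values are illustrative):

from transformers import pipeline

# Pegasus summarizer added in this commit; weights download on first use.
summarizer = pipeline('summarization', model='google/pegasus-cnn_dailymail')

article = "Long news article text goes here..."  # illustrative placeholder
# The pipeline returns a list of {'summary_text': ...} dicts, which the app
# joins into a single string, as in the Google-Pegasus branch above.
outputs = summarizer(article, max_length=150, min_length=30)
summary = ' '.join(out['summary_text'] for out in outputs)
print(summary)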