Gladiator committed on
Commit fe021fb
1 Parent(s): 4c24ae0

add url support for summarization

Files changed (2)
  1. app.py +18 -7
  2. src/utils.py +29 -0
app.py CHANGED
@@ -1,13 +1,13 @@
 import torch
+import validators
 import streamlit as st
-from transformers import T5Tokenizer, T5ForConditionalGeneration
+from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 # local modules
 from extractive_summarizer.model_processors import Summarizer
-from src.utils import clean_text
+from src.utils import clean_text, fetch_article_text
 from src.abstractive_summarizer import abstractive_summarizer
 
-
 # abstractive summarizer model
 @st.cache()
 def load_abs_model():
@@ -25,9 +25,14 @@ if __name__ == "__main__":
         "Summarization type", options=["Extractive", "Abstractive"]
     )
 
-    inp_text = st.text_input("Enter the text here")
+    inp_text = st.text_input("Enter text or a url here")
 
-    inp_text = clean_text(inp_text)
+    is_url = validators.url(inp_text)
+    if is_url:
+        # complete text, chunks to summarize (list of sentences for long docs)
+        text, text_to_summarize = fetch_article_text(url=inp_text)
+    else:
+        text_to_summarize = clean_text(inp_text)
 
     # view summarized text (expander)
     with st.expander("View input text"):
@@ -44,7 +49,7 @@ if __name__ == "__main__":
             text="Creating extractive summary. This might take a few seconds ..."
         ):
             ext_model = Summarizer()
-            summarized_text = ext_model(inp_text, num_sentences=6)
+            summarized_text = ext_model(text_to_summarize, num_sentences=6)
 
     elif summarize_type == "Abstractive":
         with st.spinner(
@@ -52,8 +57,14 @@ if __name__ == "__main__":
         ):
             abs_tokenizer, abs_model = load_abs_model()
             summarized_text = abstractive_summarizer(
-                abs_tokenizer, abs_model, inp_text
+                abs_tokenizer, abs_model, text_to_summarize
             )
+    elif summarize_type == "Abstractive" and is_url:
+        abs_url_summarizer = pipeline("summarization")
+        tmp_sum = abs_url_summarizer(
+            text_to_summarize, max_length=120, min_length=30, do_sample=False
+        )
+        summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
 
     # final summarized output
     st.subheader("Summarized text")
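
Note on the new branching: as committed, the second abstractive branch (elif summarize_type == "Abstractive" and is_url) can never run, because the plain elif summarize_type == "Abstractive" check above it already matches every abstractive request, URL or not. Below is a minimal sketch, not part of this commit, of one way the checks could be ordered so URL input reaches the chunk-wise pipeline path; which model pipeline("summarization") loads is whatever transformers ships as its default.

if summarize_type == "Extractive":
    ...  # unchanged extractive path
elif summarize_type == "Abstractive" and is_url:
    # URL input: summarize each fetched chunk with the transformers pipeline
    abs_url_summarizer = pipeline("summarization")
    tmp_sum = abs_url_summarizer(
        text_to_summarize, max_length=120, min_length=30, do_sample=False
    )
    summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])
elif summarize_type == "Abstractive":
    # plain-text input: use the cached T5 tokenizer/model
    abs_tokenizer, abs_model = load_abs_model()
    summarized_text = abstractive_summarizer(
        abs_tokenizer, abs_model, text_to_summarize
    )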
src/utils.py CHANGED
@@ -1,4 +1,6 @@
 import re
+import requests
+from bs4 import BeautifulSoup
 
 emoji_pattern = re.compile(
     "["
@@ -27,3 +29,30 @@ def clean_text(x):
     x = re.sub("[^A-Za-z0-9]+", " ", x) # special charachters
 
     return x
+
+
+def fetch_article_text(url: str):
+
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, "html.parser")
+    results = soup.find_all(["h1", "p"])
+    text = [result.text for result in results]
+    ARTICLE = " ".join(text)
+    sentences = ARTICLE.split("<eos>")
+    current_chunk = 0
+    chunks = []
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(" ")) <= 500:
+                chunks[current_chunk].extend(sentence.split(" "))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(" "))
+        else:
+            print(current_chunk)
+            chunks.append(sentence.split(" "))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = " ".join(chunks[chunk_id])
+
+    return ARTICLE, chunks
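
For reference, a rough usage sketch of the new helper (not part of the commit; the URL below is a placeholder). Note that fetch_article_text splits the scraped text on the literal marker "<eos>", so unless such markers are inserted into the article text first, everything lands in a single chunk regardless of the 500-word limit.

from src.utils import fetch_article_text

# placeholder URL for illustration only
article, chunks = fetch_article_text(url="https://example.com/some-article")
print(len(article.split()), "words scraped into", len(chunks), "chunk(s)")

# each chunk is a plain string; the app hands the list straight to the
# summarization pipeline, e.g.
# summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)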