ArchitSharma committed on
Commit
62a1e81
•
1 Parent(s): b7f9992

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import validators
3
+ import streamlit as st
4
+ from transformers import AutoTokenizer, pipeline
5
+
6
+ # local modules
7
+ from summarizer import Summarizer
8
+ from utils import (
9
+ clean_text,
10
+ fetch_article_text,
11
+ preprocess_text_for_abstractive_summarization,
12
+ read_text_from_file,
13
+ )
14
+
15
if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    # Streamlit front-end: collects text (typed, fetched from a URL, or read
    # from an uploaded file) and shows an extractive or abstractive summary.
    st.title("Text-Summarization-Tool 📝")

    st.markdown("---")
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )
    st.markdown(
        """This app supports two type of summarization:

1. **Extractive Summarization**: The extractive approach involves picking up the most important phrases and lines from the documents. It then combines all the important lines to create the summary. So, in this case, every line and word of the summary actually belongs to the original document which is summarized.
2. **Abstractive Summarization**: The abstractive approach involves rephrasing the complete document while capturing the complete meaning of the document. This type of summarization provides more human-like summary"""
    )
    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    abs_max_length = 130
    abs_min_length = 30
    # Previously referenced but never defined (NameError on the extractive
    # path). NOTE(review): 0.2 is assumed to match the Summarizer's own
    # default ratio -- confirm against the summarizer module in use.
    ext_sum_ratio = 0.2
    # ---------------------------

    inp_text = st.text_input("Enter Text or a URL here")
    st.markdown(
        "<h3 style='text-align: center;'>OR</h3>",
        unsafe_allow_html=True,
    )
    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, .docx file for summarization"
    )

    # validators.url() returns a falsy failure object (not the literal False)
    # for invalid input, so coerce to a real bool before branching on it.
    is_url = bool(validators.url(inp_text))
    if is_url:
        # complete text, chunks to summarize (list of sentences for long docs)
        text, clean_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        clean_txt = clean_text(read_text_from_file(uploaded_file))
    else:
        clean_txt = clean_text(inp_text)

    # view input text (expander)
    with st.expander("View input text"):
        if not is_url:
            st.write(clean_txt)
        elif clean_txt:
            # For URLs clean_txt is indexed as a sequence of chunks; only the
            # first one is previewed. Guarded so an empty fetch does not
            # raise IndexError.
            st.write(clean_txt[0])
    summarize = st.button("Summarize")

    # called on toggle button [summarize]
    if summarize and not clean_txt:
        # Avoid handing empty input to the models.
        st.warning("Nothing to summarize: enter text or a URL, or upload a file.")
    elif summarize:
        if summarize_type == "Extractive":
            # URL input arrives as chunks; the extractive model takes one string.
            text_to_summarize = " ".join(clean_txt) if is_url else clean_txt

            # extractive summarizer
            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, ratio=ext_sum_ratio)

        elif summarize_type == "Abstractive":
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = clean_txt
                abs_summarizer = pipeline(
                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
                )

                # The original test `is_url is False` never matched (see the
                # is_url note above), so typed/uploaded text skipped chunking.
                if not is_url:
                    # list of chunks
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=clean_txt
                    )
                tmp_sum = abs_summarizer(
                    text_to_summarize,
                    max_length=abs_max_length,
                    min_length=abs_min_length,
                    do_sample=False,
                )

                # One summary dict per chunk; stitch them into a single text.
                summarized_text = " ".join(summ["summary_text"] for summ in tmp_sum)

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)