import nltk
import validators
import streamlit as st
from transformers import AutoTokenizer, pipeline
# local modules
from extractive_summarizer.model_processors import Summarizer
from utils import (
    clean_text,
    fetch_article_text,
    preprocess_text_for_abstractive_summarization,
    read_text_from_file,
)

if __name__ == "__main__":
    # ---------------------------------
    # Main Application
    # ---------------------------------
    st.title("Text Summarizer 📝")
    summarize_type = st.sidebar.selectbox(
        "Summarization type", options=["Extractive", "Abstractive"]
    )

    # ---------------------------
    # SETUP & Constants
    nltk.download("punkt")
    abs_tokenizer_name = "facebook/bart-large-cnn"
    abs_model_name = "facebook/bart-large-cnn"
    abs_tokenizer = AutoTokenizer.from_pretrained(abs_tokenizer_name)
    abs_max_length = 130
    abs_min_length = 30
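    # NOTE: max_length / min_length are token counts for each generated
    # summary (Hugging Face pipeline semantics), not characters or words.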
    # ---------------------------

    inp_text = st.text_input("Enter text or a URL here")

    col1, col2, col3 = st.columns([1, 6, 1])
    with col1:
        st.write("")
    with col2:
        st.subheader("----- OR -----")
    with col3:
        st.write("")

    uploaded_file = st.file_uploader(
        "Upload a .txt, .pdf, or .docx file for summarization"
    )
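
    # Input precedence: a valid URL in the text box wins, then an uploaded
    # file, then the raw text itself.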
    is_url = validators.url(inp_text)
    if is_url:
        # complete text, chunks to summarize (list of chunks for long docs)
        text, clean_txt = fetch_article_text(url=inp_text)
    elif uploaded_file:
        # clean the uploaded file's contents, not the (empty) text box
        clean_txt = clean_text(read_text_from_file(uploaded_file))
    else:
        clean_txt = clean_text(inp_text)
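
    # NOTE: for URLs, clean_txt is a list of text chunks (hence clean_txt[0]
    # below); for files and raw text it is a single string.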
    # view input text (expander)
    with st.expander("View input text"):
        if is_url:
            st.write(clean_txt[0])
        else:
            st.write(clean_txt)

    summarize = st.button("Summarize")

    # runs when the [Summarize] button is clicked
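    # Extractive mode picks sentences verbatim from the input; abstractive
    # mode generates new wording with the BART model.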
    if summarize:
        if summarize_type == "Extractive":
            if is_url:
                text_to_summarize = " ".join(clean_txt)
            else:
                text_to_summarize = clean_txt
            # extractive summarizer
            with st.spinner(
                text="Creating extractive summary. This might take a few seconds ..."
            ):
                ext_model = Summarizer()
                summarized_text = ext_model(text_to_summarize, num_sentences=6)
        elif summarize_type == "Abstractive":
            with st.spinner(
                text="Creating abstractive summary. This might take a few seconds ..."
            ):
                text_to_summarize = clean_txt
                abs_summarizer = pipeline(
                    "summarization", model=abs_model_name, tokenizer=abs_tokenizer_name
                )
                if not is_url:
                    # list of chunks
                    text_to_summarize = preprocess_text_for_abstractive_summarization(
                        tokenizer=abs_tokenizer, text=clean_txt
                    )
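                # NOTE (assumption): the chunking above is expected to keep
                # each piece within BART's input limit (~1024 tokens), so
                # every chunk can be summarized independently.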
                tmp_sum = abs_summarizer(
                    text_to_summarize,
                    max_length=abs_max_length,
                    min_length=abs_min_length,
                    do_sample=False,
                )
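                # the pipeline returns a list of {"summary_text": ...} dicts,
                # one per chunk; stitch them back into a single summary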
                summarized_text = " ".join([summ["summary_text"] for summ in tmp_sum])

        # final summarized output
        st.subheader("Summarized text")
        st.info(summarized_text)