from typing import Text, Union

import streamlit as st
from transformers import AutoTokenizer, pipeline

from qa.qa import file_to_doc


@st.cache_resource
def summarization_model(
    model_name: str = "facebook/bart-large-cnn",
    custom_tokenizer: Union[AutoTokenizer, bool] = False,
):
    # build a cached summarization pipeline, falling back to the model's own
    # tokenizer unless a custom one is provided
    summarizer = pipeline(
        model=model_name,
        tokenizer=model_name if custom_tokenizer is False else custom_tokenizer,
        task="summarization",
    )
    return summarizer


@st.cache_data
def split_string_into_token_chunks(s: Text, _tokenizer: AutoTokenizer, chunk_size: int):
    # the leading underscore in _tokenizer tells st.cache_data not to hash this argument
    # Tokenize the entire string
    token_ids = _tokenizer.encode(s)
    # Split the token ids into chunks of the desired size
    chunks = [token_ids[i:i + chunk_size] for i in range(0, len(token_ids), chunk_size)]
    # Decode each chunk back into a string
    return [_tokenizer.decode(chunk) for chunk in chunks]
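
# Usage sketch for the two helpers above (hypothetical values; assumes the BART
# checkpoint can be downloaded and that `long_text` is any English string):
#
#   tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
#   summarizer = summarization_model()
#   chunks = split_string_into_token_chunks(long_text, tokenizer, chunk_size=900)
#   parts = [summarizer(c, max_length=200, min_length=30)[0]["summary_text"] for c in chunks]
#   summary = "\n".join(parts)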


def summarization_main():
    st.markdown(
        "<h2 style='text-align: center;'>Text Summarization</h2>",
        unsafe_allow_html=True,
    )
    st.markdown(
        "<h3 style='text-align: center;'>What is text summarization about?</h3>",
        unsafe_allow_html=True,
    )
", unsafe_allow_html=True) st.write(""" Text summarization is common NLP task concerned with producing a shorter version of a given text while preserving the important information contained in such text """) OPTION_1 = "I want to input some text" OPTION_2 = "I want to upload a file" option = st.radio("How would you like to start? Choose an option below", [OPTION_1, OPTION_2]) # greenlight to summarize text_is_given = False if option == OPTION_1: sample_text = "" text = st.text_area( "Input a text in English (10,000 characters max)", value=sample_text, max_chars=10_000, height=330) # toggle text is given greenlight if text != sample_text: text_is_given = not text_is_given elif option == OPTION_2: uploaded_file = st.file_uploader( "Upload a pdf, docx, or txt file (scanned documents not supported)", type=["pdf", "docx", "txt"], help="Scanned documents are not supported yet 🥲" ) if uploaded_file is not None: # parse the file using custom parsers and build a concatenation for the summarizer text = " ".join(file_to_doc(uploaded_file)) # toggle text is given greenlight text_is_given = not text_is_given if text_is_given: # minimal number of words in the summary min_length, max_length = 30, 200 user_max_length = max_length # user_max_lenght = st.slider( # label="Maximal number of tokens in the summary", # min_value=min_length, # max_value=max_length, # value=150, # step=10, # ) summarizer_downloaded = False # loading the tokenizer to split the input document into feasible chunks model_name = "facebook/bart-large-cnn" tokenizer = AutoTokenizer.from_pretrained(model_name) # the maximum number of tokens the model can handle depends on the model - accounting for tokens added by tokenizer chunk_size = int(0.88*tokenizer.model_max_length) # loading the summarization model considered with st.spinner(text="Loading summarization model..."): summarizer = summarization_model(model_name=model_name) summarizer_downloaded = True if summarizer_downloaded: button = st.button("Summarize!") if button: with st.spinner(text="Summarizing text..."): # summarizing each chunk of the input text to avoid exceeding the maximum number of tokens summary = "" chunks = split_string_into_token_chunks(text, tokenizer, chunk_size) for chunk in chunks: chunk_summary = summarizer(chunk, max_length=user_max_length, min_length=min_length) summary += "\n" + chunk_summary[0]["summary_text"] st.markdown("

                st.markdown(
                    "<h3 style='text-align: center;'>Summary</h3>",
                    unsafe_allow_html=True,
                )
                st.write(summary)