import cProfile
from os import environ

import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from extractor import extract, FewDocumentsError
from summarizer import summarize
from translation import translate
from utils.timing import Timer


@st.cache(allow_output_mutation=True)
def init():
    # Download required NLTK resources
    from nltk import download
    download('punkt')
    download('stopwords')

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model for semantic search
    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)

    # Model for abstraction
    summ_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    return search_model, summ_model, tokenizer


def main():
    search_model, summ_model, tokenizer = init()

    Timer.reset()

    _, col2, _ = st.columns([1, 1, 1])
    col2.image('AutoSumm.png', width=250)
    st.subheader("Lucas Antunes & Matheus Vieira")

    portuguese = st.checkbox('Traduzir para o português.')

    st.sidebar.markdown("""
# Processing steps

#### Translation
Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.

#### Corpus generation
Step where the system generates the complete corpus: query-related web pages and documents (PDFs and text files) from the query-related knowledge area. The corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.

#### Exhaustive search
Step where the system filters the texts of the corpus that contain keywords from the query.

#### Semantic search over documents
Step in which the system selects documents related to the query through semantic search.

#### Semantic search over paragraphs
Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.

#### Abstraction
Step in which the system generates an abstractive summary about the query from the best three paragraphs of the previous step.
""")
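    # The two "semantic search" steps described above rank candidates by
    # embedding similarity. A minimal sketch of that idea, assuming cosine
    # similarity over sentence-transformers embeddings (the actual logic
    # lives in extractor.extract; `paragraphs` is a hypothetical list of
    # candidate strings):
    #
    #   from sentence_transformers import util
    #   query_emb = search_model.encode(query, convert_to_tensor=True)
    #   para_embs = search_model.encode(paragraphs, convert_to_tensor=True)
    #   scores = util.cos_sim(query_emb, para_embs)[0]
    #   top_paragraphs = [paragraphs[i] for i in scores.topk(3).indices]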
""") if portuguese: environ['PORTUGUESE'] = 'true' # work around (gambiarra) query_pt = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo') #text is stored in this variable button = st.button('Gerar resumo') else: environ['PORTUGUESE'] = 'false' # work around (gambiarra) query = st.text_input('Type the desired topic to generate the summary') #text is stored in this variable button = st.button('Generate summary') result = st.container() if 'few_documents' not in st.session_state: st.session_state['few_documents'] = False few_documents = False else: few_documents = st.session_state['few_documents'] if button: query = translate(query_pt, 'pt', 'en') if portuguese else query try: text = extract(query, search_model=search_model) except FewDocumentsError as e: few_documents = True st.session_state['few_documents'] = True st.session_state['documents'] = e.documents st.session_state['msg'] = e.msg else: summary = summarize(text, summ_model, tokenizer) if portuguese: result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}') with result.expander(f'Parágrafos usados na geração do resumo'): st.markdown(translate(text, "en", "pt").replace('\n', '\n\n')) else: result.markdown(f'Your summary for "{query}":\n\n> {summary}') with result.expander(f'Paragraphs used in summarization'): st.markdown(text.replace('\n', '\n\n')) Timer.show_total() if few_documents: Timer.reset() error_msg = st.empty() error_msg.warning(st.session_state['msg']) proceed = st.empty() if portuguese: proceed_button = proceed.button('Prosseguir') else: proceed_button = proceed.button('Proceed') if proceed_button: error_msg.empty() proceed.empty() query = translate(query_pt, 'pt', 'en') if portuguese else query text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents']) summary = summarize(text, summ_model, tokenizer) if portuguese: result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}') with result.expander(f'Parágrafos usados na geração do resumo'): st.markdown(translate(text, "en", "pt").replace('\n', '\n\n')) else: result.markdown(f'Your summary for "{query}":\n\n> {summary}') with result.expander(f'Paragraphs used in summarization'): st.markdown(text.replace('\n', '\n\n')) st.session_state['few_documents'] = False few_documents = False if __name__ == '__main__': cProfile.run('main()', 'stats.txt')