# autosumm/app.py
import streamlit as st
from extractor import extract, FewDocumentsError
from summarizer import summarize
from translation import translate
from utils.timing import Timer
import cProfile
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from os import environ

@st.cache(allow_output_mutation=True)
def init():
    # Download required NLTK resources
    from nltk import download
    download('punkt')
    download('stopwords')

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Bi-encoder for semantic search
    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)

    # Seq2seq model and tokenizer for abstractive summarization
    summ_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    return search_model, summ_model, tokenizer
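
# Illustrative sketch (not called by the app): one way the semantic-search
# steps can rank candidate paragraphs against a query with the bi-encoder
# loaded in init(). The real selection logic lives in extractor.extract;
# this helper, its name, and the top_k default are assumptions made here
# for illustration only.
def _rank_paragraphs_sketch(query, paragraphs, search_model, top_k=3):
    from sentence_transformers import util
    query_emb = search_model.encode(query, convert_to_tensor=True)
    par_embs = search_model.encode(paragraphs, convert_to_tensor=True)
    # Cosine similarity between the query and every paragraph embedding
    scores = util.cos_sim(query_emb, par_embs)[0]
    top = scores.topk(min(top_k, len(paragraphs)))
    return [paragraphs[i] for i in top.indices.tolist()]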

def main():
    search_model, summ_model, tokenizer = init()

    Timer.reset()

    _, col2, _ = st.columns([1, 1, 1])
    col2.image('AutoSumm.png', width=250)
    st.subheader("Lucas Antunes & Matheus Vieira")

    portuguese = st.checkbox('Traduzir para o português.')
    st.sidebar.markdown("""
# Processing steps

#### Translation
Step in which the system translates the user's query from Portuguese to English and the final summary from English back to Portuguese.

#### Corpus generation
Step in which the system builds the full corpus: web pages related to the query plus documents (PDFs and text files) from the query's knowledge area. The corpus for this app gathers documents about the Blue Amazon, Brazil's maritime zone in the Atlantic Ocean.

#### Exhaustive search
Step in which the system filters the corpus down to the texts that contain keywords from the query.

#### Semantic search over documents
Step in which the system selects the documents related to the query through semantic search.

#### Semantic search over paragraphs
Step in which the system splits the selected documents into paragraphs and picks those related to the query through semantic search.

#### Abstraction
Step in which the system generates an abstractive summary of the query topic from the three best paragraphs of the previous step.
""")

    if portuguese:
        environ['PORTUGUESE'] = 'true'  # workaround ("gambiarra"): expose the language flag to other modules via an env var
        query_pt = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo')  # the query text is stored in this variable
        button = st.button('Gerar resumo')
    else:
        environ['PORTUGUESE'] = 'false'  # workaround ("gambiarra"): expose the language flag to other modules via an env var
        query = st.text_input('Type the desired topic to generate the summary')  # the query text is stored in this variable
        button = st.button('Generate summary')

    result = st.container()
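
    # Streamlit reruns this script top to bottom on every interaction, so
    # st.session_state is what carries the few-documents flag (and the
    # documents recovered from FewDocumentsError) across reruns.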
    if 'few_documents' not in st.session_state:
        st.session_state['few_documents'] = False
        few_documents = False
    else:
        few_documents = st.session_state['few_documents']

    if button:
        query = translate(query_pt, 'pt', 'en') if portuguese else query

        try:
            text = extract(query, search_model=search_model)
        except FewDocumentsError as e:
            few_documents = True
            st.session_state['few_documents'] = True
            st.session_state['documents'] = e.documents
            st.session_state['msg'] = e.msg
        else:
            summary = summarize(text, summ_model, tokenizer)

            if portuguese:
                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
                with result.expander('Parágrafos usados na geração do resumo'):
                    st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
            else:
                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
                with result.expander('Paragraphs used in summarization'):
                    st.markdown(text.replace('\n', '\n\n'))

        Timer.show_total()

    if few_documents:
        Timer.reset()

        error_msg = st.empty()
        error_msg.warning(st.session_state['msg'])
        proceed = st.empty()

        if portuguese:
            proceed_button = proceed.button('Prosseguir')
        else:
            proceed_button = proceed.button('Proceed')

        if proceed_button:
            error_msg.empty()
            proceed.empty()

            query = translate(query_pt, 'pt', 'en') if portuguese else query
            text = extract(query, search_model=search_model,
                           extracted_documents=st.session_state['documents'])
            summary = summarize(text, summ_model, tokenizer)

            if portuguese:
                result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
                with result.expander('Parágrafos usados na geração do resumo'):
                    st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
            else:
                result.markdown(f'Your summary for "{query}":\n\n> {summary}')
                with result.expander('Paragraphs used in summarization'):
                    st.markdown(text.replace('\n', '\n\n'))

            st.session_state['few_documents'] = False
            few_documents = False


if __name__ == '__main__':
    cProfile.run('main()', 'stats.txt')
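    # Note: cProfile.run writes binary pstats data to 'stats.txt' (the .txt
    # extension is misleading); inspect it with `python -m pstats stats.txt`.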