autosumm / app.py
mhsvieira's picture
Update app.py
0b0fe08
raw history blame
No virus
5.34 kB
import streamlit as st
from extractor import extract, FewDocumentsError
from summarizer import summarize
from translation import translate
from utils.timing import Timer
import cProfile
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from os import environ
@st.cache(allow_output_mutation=True)
def init():
    """Fetch the NLTK data and load the models the app needs.

    The function is wrapped in ``st.cache`` so Streamlit can reuse its
    result instead of reloading the models on every script rerun.

    Returns:
        A ``(search_model, summ_model, tokenizer)`` tuple: the
        sentence-transformer used for semantic search, the T5 seq2seq
        model used for abstraction, and the tokenizer for that T5 model.
    """
    # Download the required NLTK resources (imported here, not at file level).
    from nltk import download

    for resource in ('punkt', 'stopwords'):
        download(resource)

    # Only the semantic-search model is given an explicit device.
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Model for semantic searches.
    retriever = SentenceTransformer('msmarco-distilbert-base-v4', device=device)

    # Model and tokenizer for abstraction share the same checkpoint.
    checkpoint = 't5-base'
    abstractor = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    t5_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return retriever, abstractor, t5_tokenizer
# Markdown shown in the sidebar: one section per stage of the pipeline.
_SIDEBAR_HELP = """
# Processing steps
#### Translation
Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.
#### Corpus generation
Step where the system generates the complete corpus: query-related web pages and documents (PDFs and text files) on query-related knowledge area. The Corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.
#### Exhaustive search
Step where the system filters the texts of the corpus that contain keywords from the query.
#### Semantic search over documents
Step in which the system selects documents related to the query through semantic search.
#### Semantic search over paragraphs
Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.
#### Abstraction
Step in which the system generates an abstractive summary about the query from the best three paragraphs of the previous step.
"""


def _render_summary(result, shown_query, summary, text, portuguese):
    """Write the summary and the paragraphs it was built from into *result*.

    Args:
        result: Streamlit container that receives the output.
        shown_query: The query exactly as the user typed it (echoed back).
        summary: The English summary returned by ``summarize``.
        text: The English paragraphs returned by ``extract``.
        portuguese: When true, the labels are in Portuguese and both the
            summary and the paragraphs are translated from English first.
    """
    if portuguese:
        result.markdown(f'Seu resumo para "{shown_query}":\n\n> {translate(summary, "en", "pt")}')
        with result.expander('Parágrafos usados na geração do resumo'):
            # Double the newlines so Markdown renders separate paragraphs.
            st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
    else:
        result.markdown(f'Your summary for "{shown_query}":\n\n> {summary}')
        with result.expander('Paragraphs used in summarization'):
            st.markdown(text.replace('\n', '\n\n'))


def main():
    """Render the AutoSumm page and run the summarization pipeline.

    Flow of one script run:

    1. Load the models via ``init()`` and draw the static page elements.
    2. Read the query (Portuguese or English UI, chosen by a checkbox).
    3. When the main button is pressed, call ``extract`` and ``summarize``
       and display the result. If ``extract`` raises ``FewDocumentsError``,
       its ``documents`` and ``msg`` are stored in ``st.session_state`` and
       a warning with a "Proceed" button is shown instead.
    4. When "Proceed" is pressed on a later rerun, ``extract`` is called
       again with the stored documents and the summary is displayed.
    """
    search_model, summ_model, tokenizer = init()
    Timer.reset()

    # Logo in the middle one of three equal columns.
    _, col2, _ = st.columns([1, 1, 1])
    col2.image('AutoSumm.png', width=250)
    st.subheader("Lucas Antunes & Matheus Vieira")

    portuguese = st.checkbox('Traduzir para o português.')

    st.sidebar.markdown(_SIDEBAR_HELP)

    # HACK: the chosen language is published through an environment variable
    # (presumably read by other modules -- confirm against its consumers).
    if portuguese:
        environ['PORTUGUESE'] = 'true'
        user_query = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo')
        button = st.button('Gerar resumo')
    else:
        environ['PORTUGUESE'] = 'false'
        user_query = st.text_input('Type the desired topic to generate the summary')
        button = st.button('Generate summary')

    result = st.container()

    # The "few documents" flag must survive reruns, so it lives in the
    # session state; initialise it on the first run of a session.
    if 'few_documents' not in st.session_state:
        st.session_state['few_documents'] = False
    few_documents = st.session_state['few_documents']

    if button:
        # The pipeline works on English text.
        query = translate(user_query, 'pt', 'en') if portuguese else user_query

        # A new search supersedes any pending "few documents" prompt left by
        # a previous one. Without this reset, a successful search would still
        # show the old warning and let "Proceed" combine this query with the
        # documents stored for the previous query.
        few_documents = False
        st.session_state['few_documents'] = False

        try:
            text = extract(query, search_model=search_model)
        except FewDocumentsError as e:
            # Remember what was found so the user can choose to proceed.
            few_documents = True
            st.session_state['few_documents'] = True
            st.session_state['documents'] = e.documents
            st.session_state['msg'] = e.msg
        else:
            summary = summarize(text, summ_model, tokenizer)
            _render_summary(result, user_query, summary, text, portuguese)
            Timer.show_total()

    if few_documents:
        Timer.reset()

        # Placeholders, so both widgets can be removed once the user proceeds.
        error_msg = st.empty()
        error_msg.warning(st.session_state['msg'])
        proceed = st.empty()
        proceed_button = proceed.button('Prosseguir' if portuguese else 'Proceed')

        if proceed_button:
            error_msg.empty()
            proceed.empty()

            query = translate(user_query, 'pt', 'en') if portuguese else user_query
            text = extract(
                query,
                search_model=search_model,
                extracted_documents=st.session_state['documents'],
            )
            summary = summarize(text, summ_model, tokenizer)
            _render_summary(result, user_query, summary, text, portuguese)

            st.session_state['few_documents'] = False
if __name__ == '__main__':
    # Run the app under the profiler; the collected statistics are saved
    # to 'stats.txt' instead of being printed.
    cProfile.run(statement='main()', filename='stats.txt')