# NOTE: "Spaces: / Runtime error / Runtime error" — hosting-page (Hugging Face
# Spaces) banner text captured during extraction; not part of the program.
import cProfile
from os import environ

import streamlit as st
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from extractor import FewDocumentsError, extract
from summarizer import summarize
from translation import translate
from utils.timing import Timer
def init():
    """Download NLTK resources and load the NLP models used by the app.

    Returns:
        tuple: ``(search_model, summ_model, tokenizer)`` where
            ``search_model`` is a SentenceTransformer used for semantic search,
            ``summ_model`` is a T5 seq2seq model used for abstractive
            summarization, and ``tokenizer`` is its matching tokenizer.
    """
    # Download required NLTK resources. quiet=True keeps the downloader from
    # re-printing its log on every Streamlit rerun (the script re-executes on
    # each user interaction).
    from nltk import download
    download('punkt', quiet=True)
    download('stopwords', quiet=True)

    # Use the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Model for semantic searches
    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)

    # Model for abstraction.
    # NOTE(review): unlike search_model, this model is left on the CPU even
    # when CUDA is available — confirm whether summarize() expects a CPU model
    # before moving it to `device`.
    summ_model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
    tokenizer = AutoTokenizer.from_pretrained('t5-base')

    return search_model, summ_model, tokenizer
def _render_summary(result, portuguese, query, query_pt, text, summary):
    """Render the summary and its source paragraphs into the *result* container.

    When *portuguese* is True, the summary and paragraphs (produced in
    English) are translated back to Portuguese before display.
    """
    if portuguese:
        result.markdown(f'Seu resumo para "{query_pt}":\n\n> {translate(summary, "en", "pt")}')
        with result.expander('Parágrafos usados na geração do resumo'):
            st.markdown(translate(text, "en", "pt").replace('\n', '\n\n'))
    else:
        result.markdown(f'Your summary for "{query}":\n\n> {summary}')
        with result.expander('Paragraphs used in summarization'):
            st.markdown(text.replace('\n', '\n\n'))


def main():
    """Streamlit entry point: query input, corpus extraction, summarization UI."""
    search_model, summ_model, tokenizer = init()

    Timer.reset()

    # Centered logo + authors header.
    _, col2, _ = st.columns([1, 1, 1])
    col2.image('AutoSumm.png', width=250)
    st.subheader("Lucas Antunes & Matheus Vieira")

    portuguese = st.checkbox('Traduzir para o português.')

    st.sidebar.markdown("""
# Processing steps
#### Translation
Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.
#### Corpus generation
Step where the system generates the complete corpus: query-related web pages and documents (PDFs and text files) on query-related knowledge area. The Corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.
#### Exhaustive search
Step where the system filters the texts of the corpus that contain keywords from the query.
#### Semantic search over documents
Step in which the system selects documents related to the query through semantic search.
#### Semantic search over paragraphs
Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.
#### Abstraction
Step in which the system generates an abstractive summary about the query from the best three paragraphs of the previous step.
""")

    # Pre-initialize both query variables: only one widget is shown per run,
    # but both names are passed to _render_summary below.
    query = ''
    query_pt = ''
    if portuguese:
        environ['PORTUGUESE'] = 'true'  # work around (gambiarra)
        query_pt = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo')
        button = st.button('Gerar resumo')
    else:
        environ['PORTUGUESE'] = 'false'  # work around (gambiarra)
        query = st.text_input('Type the desired topic to generate the summary')
        button = st.button('Generate summary')

    result = st.container()

    # few_documents persists across Streamlit reruns via session_state so the
    # "proceed anyway" flow below survives the rerun triggered by its button.
    if 'few_documents' not in st.session_state:
        st.session_state['few_documents'] = False
        few_documents = False
    else:
        few_documents = st.session_state['few_documents']

    if button:
        # Extraction/summarization runs on the English query.
        query = translate(query_pt, 'pt', 'en') if portuguese else query
        try:
            text = extract(query, search_model=search_model)
        except FewDocumentsError as e:
            # Too few documents matched: stash the partial results and ask the
            # user whether to proceed anyway (handled on the next rerun).
            few_documents = True
            st.session_state['few_documents'] = True
            st.session_state['documents'] = e.documents
            st.session_state['msg'] = e.msg
        else:
            summary = summarize(text, summ_model, tokenizer)
            _render_summary(result, portuguese, query, query_pt, text, summary)
            Timer.show_total()

    if few_documents:
        Timer.reset()
        error_msg = st.empty()
        error_msg.warning(st.session_state['msg'])
        proceed = st.empty()
        if portuguese:
            proceed_button = proceed.button('Prosseguir')
        else:
            proceed_button = proceed.button('Proceed')

        if proceed_button:
            error_msg.empty()
            proceed.empty()

            query = translate(query_pt, 'pt', 'en') if portuguese else query
            # Retry extraction with the documents saved from the failed attempt.
            text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
            summary = summarize(text, summ_model, tokenizer)
            _render_summary(result, portuguese, query, query_pt, text, summary)

            st.session_state['few_documents'] = False
            few_documents = False
if __name__ == '__main__':
    # Profile the whole app run. Despite the .txt extension, the dump is a
    # binary pstats file — load it with pstats.Stats('stats.txt') to inspect.
    cProfile.run('main()', filename='stats.txt')