Spaces:

autosummproject
/

autosumm

Runtime error

App Files Files Community

autosumm / app.py

mhsvieira

Update app.py

0b0fe08 almost 3 years ago

raw

history blame

No virus

5.34 kB

	import streamlit as st
	from extractor import extract, FewDocumentsError
	from summarizer import summarize
	from translation import translate
	from utils.timing import Timer
	import cProfile
	from sentence_transformers import SentenceTransformer
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
	import torch
	from os import environ

	@st.cache(allow_output_mutation=True)
	def init():
	    """Fetch the NLTK resources and load the models the app works with.

	    The function is wrapped in ``st.cache(allow_output_mutation=True)``.

	    Returns:
	        A ``(search_model, summ_model, tokenizer)`` tuple: the
	        ``msmarco-distilbert-base-v4`` SentenceTransformer used for semantic
	        search, followed by the ``t5-base`` seq2seq model and its tokenizer
	        used for abstraction.
	    """
	    from nltk import download

	    # Download the required NLTK resources.
	    for resource in ('punkt', 'stopwords'):
	        download(resource)

	    # Prefer CUDA when torch reports it as available, otherwise use the CPU.
	    device = "cpu"
	    if torch.cuda.is_available():
	        device = "cuda"

	    # Model for semantic searches, placed on the selected device.
	    search_model = SentenceTransformer('msmarco-distilbert-base-v4', device=device)

	    # Model and tokenizer for abstraction; both come from the same checkpoint
	    # and no device is passed for them here.
	    checkpoint = 't5-base'
	    summ_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

	    return search_model, summ_model, tokenizer

	def _render_summary(result, topic, summary, text, portuguese):
	    """Write the summary and the paragraphs it was built from into *result*.

	    Args:
	        result: Streamlit container that receives the output.
	        topic: The query exactly as the user typed it (shown in the heading).
	        summary: Summary text as returned by ``summarize``.
	        text: Paragraph text as returned by ``extract``.
	        portuguese: When true, the summary and paragraphs are passed through
	            ``translate(..., "en", "pt")`` and Portuguese labels are used.
	    """
	    if portuguese:
	        result.markdown(f'Seu resumo para "{topic}":\n\n> {translate(summary, "en", "pt")}')
	        expander_label = 'Parágrafos usados na geração do resumo'
	    else:
	        result.markdown(f'Your summary for "{topic}":\n\n> {summary}')
	        expander_label = 'Paragraphs used in summarization'

	    with result.expander(expander_label):
	        paragraphs = translate(text, "en", "pt") if portuguese else text
	        # Single newlines are doubled before the text is handed to Markdown.
	        st.markdown(paragraphs.replace('\n', '\n\n'))


	def main():
	    """Render the AutoSumm page and process the current Streamlit run.

	    Draws the header, language checkbox, sidebar help and query input, then:

	    * on a click of the generate button, extracts text for the query and
	      shows its summary; if ``extract`` raises ``FewDocumentsError`` the
	      error's ``documents`` and ``msg`` are stored in ``st.session_state``;
	    * while the ``few_documents`` flag is set, shows the stored message with
	      a proceed button that re-runs extraction on the stored documents.

	    NOTE(review): the nesting of this function was reconstructed from a copy
	    of the file whose indentation had been lost. In particular
	    ``Timer.show_total()`` is assumed to belong to the successful-extraction
	    branch -- confirm against the original file.
	    """
	    search_model, summ_model, tokenizer = init()
	    Timer.reset()

	    _, col2, _ = st.columns([1, 1, 1])
	    col2.image('AutoSumm.png', width=250)
	    st.subheader("Lucas Antunes & Matheus Vieira")

	    portuguese = st.checkbox('Traduzir para o português.')

	    st.sidebar.markdown("""
	# Processing steps
	#### Translation
	Step where the system translates the user's query from Portuguese to English and the summary from English to Portuguese.

	#### Corpus generation
	Step where the system generates the complete corpus: query-related web pages and documents (PDFs and text files) on query-related knowledge area. The Corpus for this model was built to gather documents related to the Blue Amazon, a maritime region in South America.

	#### Exhaustive search
	Step where the system filters the texts of the corpus that contain keywords from the query.

	#### Semantic search over documents
	Step in which the system selects documents related to the query through semantic search.

	#### Semantic search over paragraphs
	Step in which the system breaks documents into paragraphs and selects those related to the query through semantic search.

	#### Abstraction
	Step in which the system generates an abstractive summary about the query from the best three paragraphs of the previous step.
	""")

	    # The language choice is also published through the PORTUGUESE environment
	    # variable (a workaround kept from the original code; whatever reads it is
	    # not visible in this file).
	    if portuguese:
	        environ['PORTUGUESE'] = 'true'
	        user_query = st.text_input('Digite o tópico sobre o qual você deseja gerar um resumo')
	        button = st.button('Gerar resumo')
	    else:
	        environ['PORTUGUESE'] = 'false'
	        user_query = st.text_input('Type the desired topic to generate the summary')
	        button = st.button('Generate summary')

	    result = st.container()

	    if 'few_documents' not in st.session_state:
	        st.session_state['few_documents'] = False
	        few_documents = False
	    else:
	        few_documents = st.session_state['few_documents']

	    if button:
	        # A new request supersedes any pending "few documents" warning.
	        # Without this reset, a warning left over from an earlier query (and
	        # its proceed button, bound to the earlier documents) would still be
	        # rendered below the summary of a later, successful query.
	        few_documents = False
	        st.session_state['few_documents'] = False

	        query = translate(user_query, 'pt', 'en') if portuguese else user_query
	        try:
	            text = extract(query, search_model=search_model)
	        except FewDocumentsError as e:
	            # Keep what the error carries so the user can choose to proceed
	            # on a later run of the script.
	            few_documents = True
	            st.session_state['few_documents'] = True
	            st.session_state['documents'] = e.documents
	            st.session_state['msg'] = e.msg
	        else:
	            summary = summarize(text, summ_model, tokenizer)
	            _render_summary(result, user_query, summary, text, portuguese)
	            Timer.show_total()

	    if few_documents:
	        Timer.reset()
	        # Placeholders, so the warning and the button can be cleared once the
	        # user decides to proceed.
	        error_msg = st.empty()
	        error_msg.warning(st.session_state['msg'])
	        proceed = st.empty()
	        proceed_button = proceed.button('Prosseguir' if portuguese else 'Proceed')
	        if proceed_button:
	            error_msg.empty()
	            proceed.empty()
	            query = translate(user_query, 'pt', 'en') if portuguese else user_query
	            text = extract(query, search_model=search_model, extracted_documents=st.session_state['documents'])
	            summary = summarize(text, summ_model, tokenizer)
	            _render_summary(result, user_query, summary, text, portuguese)

	            # The warning has been handled; do not show it again on the next run.
	            st.session_state['few_documents'] = False

	if __name__ == '__main__':
	    # Run main() under cProfile and write the collected stats to 'stats.txt'.
	    cProfile.run(statement='main()', filename='stats.txt')