JCRios's picture
Update app.py
9c4b947 verified
raw
history blame
3.04 kB
import torch
import re
import PyPDF2
import utils
import streamlit as st
from transformers import BertTokenizerFast, EncoderDecoderModel
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face Hub checkpoint used for both the tokenizer and the model.
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource(show_spinner=False)
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and the encoder-decoder model exactly once.

    Streamlit re-executes this script from the top on every user interaction.
    Without caching, each rerun would call ``from_pretrained`` again and move
    the weights to the device again. ``st.cache_resource`` keeps the loaded
    objects alive across reruns, keyed by the arguments.

    Args:
        checkpoint: Model identifier passed to ``from_pretrained``.
        target_device: Device string the model is moved to.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


# Module-level names kept as before so the rest of the file keeps working.
tokenizer, modelo = _load_summarizer(model_id, device)
def generate_summary(text):
    """Summarize a single piece of text with the module-level model.

    The text is tokenized as a batch of one, truncated to at most 512 tokens
    and padded up to that same length before being passed to
    ``modelo.generate``.

    Args:
        text: The text to summarize.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    ids = encoded.input_ids.to(device)
    # The attention mask tells the model which positions hold real tokens
    # and which ones are only padding.
    mask = encoded.attention_mask.to(device)
    generated = modelo.generate(ids, attention_mask=mask)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def summarize_pdf(pdf_file):
    """Summarize an uploaded PDF page by page and store the result.

    Each page's text is extracted, cleaned with the ``utils`` helpers and then
    either kept as-is (fewer than 50 characters) or passed through
    ``generate_summary``. The document title, when available, is placed first.
    The final text is written to ``st.session_state["summary"]``; nothing is
    returned.

    Args:
        pdf_file: The uploaded PDF (anything ``PyPDF2.PdfReader`` accepts), or
            ``None`` when no file has been uploaded, in which case the
            function does nothing and the stored summary is left untouched.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)

        # ``metadata`` is None for PDFs without a document information
        # dictionary, so it must be checked before reading ``title``.
        metadata = reader.metadata
        title = ''
        if metadata is not None and metadata.title is not None:
            title = metadata.title

        page_summaries = []
        for page in reader.pages:
            # The helpers are assumed to yield an iterable of strings, since
            # the result is joined with spaces - TODO confirm against utils.
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Too short to be worth summarizing: keep the text verbatim.
                page_summaries.append(page_text + '\n')
            else:
                page_summaries.append(generate_summary(page_text) + ' \n')

        results = [title + ' \n'] + page_summaries
        st.session_state["summary"] = ' '.join(results)
## Graphical interface
def output(pdf_file):
    """Store the uploaded PDF's title as the current summary.

    Writes the document title to ``st.session_state["summary"]``. An empty
    string is stored when the PDF has no metadata or no title, so the summary
    slot always holds text.

    Args:
        pdf_file: The uploaded PDF (anything ``PyPDF2.PdfReader`` accepts), or
            ``None``, in which case nothing happens.
    """
    if pdf_file is None:
        return

    # ``metadata`` is None for PDFs without a document information dictionary.
    metadata = PyPDF2.PdfReader(pdf_file).metadata
    title = metadata.title if metadata is not None else None
    st.session_state["summary"] = title if title is not None else ''
# Page layout. Streamlit executes these statements top to bottom on every
# rerun, so the order of the calls below is the order of the widgets.
# NOTE(review): the indentation of this section was lost in the copy under
# review; the nesting below is a best-effort reconstruction - confirm against
# the deployed app.

# Make sure the summary slot exists before the results panel reads it.
if 'summary' not in st.session_state:
    st.session_state['summary'] = ''
# Disabled earlier experiment, kept for reference:
#output = summarize_pdf(pdf_file)
#reader = PyPDF2.PdfReader(pdf_file)
# title = reader.metadata.title
# output = title
# st.write(output)
st.caption('Demo para la generación de resumenes en español')
with st.sidebar:
    # Bordered box holding the app title and the model credits.
    with st.container(border = True):
        st.title('PDF-Summarizer para español')
        st.caption('Este demo está basado en el modelo: \n mrm8488/bert2bert_shared-spanish-finetuned-summarization \n creado por Manuel Romero/@mrm8488 con el soporte de Narrativa. \n Importante: Recomendado para PDFs cortos.')
    # Only PDF uploads are accepted; the value is None until a file is chosen.
    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
    # NOTE(review): this spinner only wraps the creation of the button;
    # summarize_pdf opens its own spinner while it works - confirm this outer
    # one is still needed.
    with st.spinner('Estamos generando tu resumen, espera un poco...'):
        # The summary is produced by the on_click callback, which receives
        # the uploaded file and writes into st.session_state["summary"].
        corre_button = st.button('Genera resumen',
                                 on_click=summarize_pdf,
                                 args = (pdf_file, ),
                                 help = 'Presiona para generar resumen')
# Fixed-height results panel showing whatever summary is currently stored.
container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])