Spaces:
Running
Running
import torch | |
import re | |
import PyPDF2 | |
import utils | |
import streamlit as st | |
from transformers import BertTokenizerFast, EncoderDecoderModel | |
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Checkpoint identifier passed to ``from_pretrained`` for both tokenizer and model.
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'


@st.cache_resource
def _load_summarizer(checkpoint, target_device):
    """Load the tokenizer and encoder-decoder model for ``checkpoint``.

    Streamlit re-executes the whole script on every user interaction; caching
    the loaded objects as a resource avoids re-instantiating the model on each
    rerun.

    Args:
        checkpoint: Model identifier handed to ``from_pretrained``.
        target_device: Device string the model is moved to (``'cuda'`` or ``'cpu'``).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
    loaded_model = EncoderDecoderModel.from_pretrained(checkpoint).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_summarizer(model_id, device)
def generate_summary(text):
    """Return the model-generated summary of ``text``.

    The text is tokenized as a single-item batch, padded/truncated to exactly
    512 tokens, run through ``modelo.generate`` on ``device`` and decoded back
    to a string with special tokens removed.

    Args:
        text: The text to summarize.

    Returns:
        The decoded summary string.
    """
    encoded = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    token_ids = encoded.input_ids.to(device)
    # The attention mask marks which positions hold real tokens rather than padding.
    mask = encoded.attention_mask.to(device)
    generated = modelo.generate(token_ids, attention_mask=mask)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
def summarize_pdf(pdf_file):
    """Summarize an uploaded PDF page by page and store the result in session state.

    Each page's text is extracted and cleaned with the ``utils`` helpers. Pages
    whose cleaned text is shorter than 50 characters are kept as they are;
    longer pages are condensed with ``generate_summary``. The document title
    (empty when the PDF has none) followed by the per-page results is written
    to ``st.session_state["summary"]``.

    Args:
        pdf_file: The PDF to read (anything ``PyPDF2.PdfReader`` accepts), or
            ``None`` when nothing has been uploaded, in which case the function
            does nothing.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)

        # ``reader.metadata`` is None for PDFs that carry no document
        # information, and the title itself may be missing as well; both
        # cases fall back to an empty title instead of raising.
        metadata = reader.metadata
        title = (metadata.title if metadata is not None else None) or ''

        page_results = []
        for page in reader.pages:
            # NOTE(review): assumes drop_non_relevant_text returns an iterable
            # of strings (it is joined with spaces below) — confirm in utils.
            cleaned = utils.drop_non_relevant_text(
                utils.preprocess_text(page.extract_text())
            )
            page_text = ' '.join(cleaned)
            if len(page_text) < 50:
                # Pages under 50 characters are passed through unchanged.
                page_results.append(page_text + '\n')
            else:
                page_results.append(generate_summary(page_text) + ' \n')

        st.session_state["summary"] = ' '.join([title + ' \n'] + page_results)
## Graphical interface
def output(pdf_file):
    """Store the title of ``pdf_file`` in ``st.session_state["summary"]``.

    Args:
        pdf_file: The PDF to read (anything ``PyPDF2.PdfReader`` accepts), or
            ``None``, in which case the function does nothing.
    """
    if pdf_file is None:
        return

    # ``metadata`` is None for PDFs without document information and the
    # title may be absent; store an empty string in both cases so the summary
    # slot always holds text.
    metadata = PyPDF2.PdfReader(pdf_file).metadata
    title = metadata.title if metadata is not None else None
    st.session_state["summary"] = title or ''
# Streamlit re-executes this script on every interaction, so make sure the
# summary slot exists before anything below reads it.
if 'summary' not in st.session_state:
    st.session_state['summary'] = ''

st.caption('Demo para la generación de resumenes en español')

# Sidebar: app description, PDF upload and the button that triggers summarization.
with st.sidebar:
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
        st.caption('Este demo está basado en el modelo: \n mrm8488/bert2bert_shared-spanish-finetuned-summarization \n creado por Manuel Romero/@mrm8488 con el soporte de Narrativa. \n Importante: Recomendado para PDFs cortos.')
    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
    # No spinner around the button: creating the widget returns immediately
    # and the on_click callback does not run inside this block.
    # summarize_pdf shows its own spinner while it works.
    corre_button = st.button(
        'Genera resumen',
        on_click=summarize_pdf,
        args=(pdf_file,),
        help='Presiona para generar resumen',
    )

# Main area: fixed-height box showing the summary currently held in session state.
container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])