"""Streamlit demo that summarizes Spanish PDFs page by page.

Each page of the uploaded PDF is extracted with PyPDF2, cleaned with the
project-local ``utils`` helpers and, when long enough, summarized with the
``mrm8488/bert2bert_shared-spanish-finetuned-summarization`` model.
The result is kept in ``st.session_state["summary"]`` so that it survives
Streamlit reruns.
"""
import re

import PyPDF2
import streamlit as st
import torch
from transformers import BertTokenizerFast, EncoderDecoderModel

import utils

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = 'mrm8488/bert2bert_shared-spanish-finetuned-summarization'

# Maximum number of tokens fed to the encoder; longer inputs are truncated.
MAX_INPUT_TOKENS = 512
# Pages whose cleaned text is shorter than this are copied verbatim instead
# of being summarized.
MIN_CHARS_TO_SUMMARIZE = 50


@st.cache_resource(show_spinner=False)
def _load_model(name, target_device):
    """Load the tokenizer and the model once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the weights on each rerun.
    """
    loaded_tokenizer = BertTokenizerFast.from_pretrained(name)
    loaded_model = EncoderDecoderModel.from_pretrained(name).to(target_device)
    return loaded_tokenizer, loaded_model


tokenizer, modelo = _load_model(model_id, device)


def generate_summary(text):
    """Return the model-generated summary of ``text``.

    The text is tokenized as a batch of one, padded/truncated to
    ``MAX_INPUT_TOKENS`` tokens, and the first generated sequence is decoded
    with special tokens stripped.
    """
    inputs = tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(device)
    # The attention mask tells the model which positions hold real tokens
    # and which are padding.
    attention_mask = inputs.attention_mask.to(device)
    generated = modelo.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def _pdf_title(reader):
    """Return the PDF title, or ``''`` when metadata or title is absent."""
    metadata = reader.metadata
    if metadata is None or metadata.title is None:
        return ''
    return metadata.title


def _summarize_page(page):
    """Clean one page's text and summarize it unless it is very short."""
    raw_text = page.extract_text()
    cleaned = utils.drop_non_relevant_text(utils.preprocess_text(raw_text))
    page_text = ' '.join(cleaned)
    if len(page_text) < MIN_CHARS_TO_SUMMARIZE:
        return page_text + '\n'
    return generate_summary(page_text) + ' \n'


def summarize_pdf(pdf_file):
    """Summarize ``pdf_file`` and store the text in the session state.

    Used as the ``on_click`` callback of the "Genera resumen" button.
    Does nothing when ``pdf_file`` is ``None`` (no file uploaded). Otherwise
    writes the title followed by one entry per page to
    ``st.session_state["summary"]``.
    """
    if pdf_file is None:
        return

    with st.spinner('Generando resumen, espera un poco...'):
        reader = PyPDF2.PdfReader(pdf_file)
        title = _pdf_title(reader)
        page_summaries = [_summarize_page(page) for page in reader.pages]
        results = [title + ' \n'] + page_summaries
        st.session_state["summary"] = ' '.join(results)


def output(pdf_file):
    """Store only the PDF title in ``st.session_state["summary"]``.

    Does nothing when ``pdf_file`` is ``None``. A missing title is stored as
    an empty string. NOTE(review): no caller is visible in this file; this
    looks like a debugging helper - confirm before removing.
    """
    if pdf_file is None:
        return

    reader = PyPDF2.PdfReader(pdf_file)
    st.session_state["summary"] = _pdf_title(reader)


# --- Graphical interface ---

if 'summary' not in st.session_state:
    st.session_state['summary'] = ''

st.caption('Demo para la generación de resumenes en español')

# NOTE(review): the widget nesting below was reconstructed from a flattened
# source - confirm that the uploader and the button belong in the sidebar.
with st.sidebar:
    with st.container(border=True):
        st.title('PDF-Summarizer para español')
        st.caption(
            'Este demo está basado en el modelo: \n '
            'mrm8488/bert2bert_shared-spanish-finetuned-summarization \n '
            'creado por Manuel Romero/@mrm8488 con el soporte de Narrativa. \n '
            'Importante: Recomendado para PDFs cortos.'
        )
    pdf_file = st.file_uploader('Carga tu archivo PDF', type="pdf")
    # This spinner only wraps the creation of the button; the callback
    # displays its own spinner while the summary is being generated.
    with st.spinner('Estamos generando tu resumen, espera un poco...'):
        corre_button = st.button(
            'Genera resumen',
            on_click=summarize_pdf,
            args=(pdf_file,),
            help='Presiona para generar resumen',
        )

container = st.container(height=300)
container.write('Resumen:')
container.write(st.session_state["summary"])