# Hugging Face Space page header (not part of the program):
# summarizer / app.py
# igna7's picture
# add summarizer files
# b4c7cb7 verified
"""
Espacio de Hugging Face: Resumidor de Texto (BERT2BERT)
========================================================
Modelo: mrm8488/bert2bert_shared-spanish-finetuned-summarization
Entrada: Texto largo en español
Salida: Texto resumido
"""
import gradio as gr
import torch
from transformers import BertTokenizerFast, EncoderDecoderModel
class SummarizationService:
    """Spanish text summarizer built on a BERT2BERT encoder-decoder checkpoint.

    Long inputs are split into sentence-aligned chunks with a bounded token
    count; each chunk is summarized independently and the partial summaries
    are joined with single spaces.
    """

    # Endings that already terminate a sentence; no "." is appended after them.
    _SENTENCE_END = (".", "!", "?", "…")

    def __init__(self):
        """Load the tokenizer and the model on CPU and put the model in eval mode."""
        ckpt = "mrm8488/bert2bert_shared-spanish-finetuned-summarization"
        self.device = torch.device("cpu")
        print(f"Cargando modelo BERT2BERT: {ckpt}...")
        self.tokenizer = BertTokenizerFast.from_pretrained(ckpt)
        self.model = EncoderDecoderModel.from_pretrained(
            ckpt,
            low_cpu_mem_usage=False,
            use_safetensors=False,
            torch_dtype=torch.float32,
        )
        # Keep the model on the same device the inputs are moved to in summarize().
        self.model.to(self.device)
        self.model.eval()
        print("Modelo cargado correctamente.")

    def summarize(self, text: str) -> str:
        """Summarize *text* chunk by chunk and return the joined summaries.

        Newlines are flattened to spaces before chunking. Returns an empty
        string when the text produces no chunks or every partial summary is
        empty.
        """
        text = text.replace("\n", " ").strip()
        gen_params = {
            "min_length": 25,
            "max_length": 100,
            "num_beams": 4,
            "length_penalty": 2.0,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }

        summaries = []
        for chunk in self._chunk_text(text, max_tokens=200):
            inputs = self.tokenizer(
                [chunk],
                # A batch of one needs no padding; padding to 512 would only
                # add masked positions the model still has to process.
                padding=False,
                # A single sentence longer than the chunk budget is cut here.
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].to(self.device)
            attention_mask = inputs["attention_mask"].to(self.device)
            with torch.no_grad():
                output_ids = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    **gen_params,
                )
            piece = self.tokenizer.decode(
                output_ids[0], skip_special_tokens=True
            ).strip()
            if piece:
                summaries.append(piece)
        return " ".join(summaries)

    @classmethod
    def _split_sentences(cls, text: str) -> list:
        """Split *text* on ". " into stripped, non-empty, terminated sentences.

        Every piece but the last gets back the "." that the split consumed.
        The last piece only receives a "." if it does not already end with
        sentence-final punctuation, so "Hola." never becomes "Hola..".
        """
        pieces = text.split(". ")
        last_index = len(pieces) - 1
        sentences = []
        for index, piece in enumerate(pieces):
            piece = piece.strip()
            if not piece:
                continue
            if index < last_index or not piece.endswith(cls._SENTENCE_END):
                piece += "."
            sentences.append(piece)
        return sentences

    def _chunk_text(self, text: str, max_tokens: int) -> list:
        """Group the sentences of *text* into chunks of at most *max_tokens*.

        Token counts come from ``self.tokenizer.tokenize``. Sentences are
        never split: one that exceeds *max_tokens* on its own becomes a
        single-sentence chunk. Returns a list of strings, empty when *text*
        contains no sentences.
        """
        chunks = []
        current = []
        current_length = 0
        for sentence in self._split_sentences(text):
            sent_len = len(self.tokenizer.tokenize(sentence))
            # Close the open chunk when this sentence would overflow it. An
            # oversized sentence therefore ends up alone: it starts a fresh
            # chunk and forces a flush before the next sentence is added.
            if current and current_length + sent_len > max_tokens:
                chunks.append(" ".join(current))
                current = []
                current_length = 0
            current.append(sentence)
            current_length += sent_len
        if current:
            chunks.append(" ".join(current))
        return chunks
# Initialize the service once at import time: the model is loaded a single
# time here and the resulting `service` object is reused by `resumir_texto`.
print("Inicializando servicio de resumen...")
service = SummarizationService()
print("Servicio listo.")
def resumir_texto(texto: str) -> str:
    """Gradio callback: summarize *texto* or return a user-facing message.

    Empty or whitespace-only input yields a prompt asking for text. Any
    exception raised while summarizing is turned into an error string
    instead of propagating to the caller.
    """
    if not (texto and texto.strip()):
        return "Por favor, introduce un texto para resumir."

    try:
        return service.summarize(texto)
    except Exception as exc:
        return f"Error al resumir: {exc!s}"
# Gradio interface: one multi-line text input wired to `resumir_texto`,
# one text output for the summary, and a single pre-filled example.
iface = gr.Interface(
    fn=resumir_texto,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Pega aquí tu texto largo en español...",
        label="Texto a Resumir"
    ),
    outputs=gr.Textbox(label="Resumen"),
    title="📝 Resumidor de Texto (BERT2BERT)",
    description="Resume textos largos en español usando el modelo BERT2BERT con técnica de micro-chunking.",
    # Each inner list is one example row; there is one value per input component.
    examples=[
        ["La inteligencia artificial es un campo de la informática que se centra en crear sistemas inteligentes. Estos sistemas pueden aprender de la experiencia y realizar tareas como reconocimiento de voz y toma de decisiones. El aprendizaje automático permite a las computadoras mejorar su rendimiento a través de la experiencia."]
    ],
    flagging_mode="never",
)
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script;
    # importing the module builds `iface` without launching it.
    iface.launch()