Spaces:

igna7
/

summarizer

Sleeping

App Files Files Community

summarizer / app.py

igna7

add summarizer files

b4c7cb7 verified 3 months ago

raw

history blame contribute delete

4.9 kB

	"""
	Espacio de Hugging Face: Resumidor de Texto (BERT2BERT)
	========================================================
	Modelo: mrm8488/bert2bert_shared-spanish-finetuned-summarization

	Entrada: Texto largo en español
	Salida: Texto resumido
	"""

	import gradio as gr
	import torch
	from transformers import BertTokenizerFast, EncoderDecoderModel


	class SummarizationService:
	    """Spanish abstractive summarizer built on a BERT2BERT encoder-decoder.

	    Long inputs are split into sentence-aligned chunks ("micro-chunking"),
	    every chunk is summarized independently on CPU, and the partial
	    summaries are joined with single spaces.
	    """

	    DEFAULT_CKPT = "mrm8488/bert2bert_shared-spanish-finetuned-summarization"

	    # Generation settings passed verbatim to ``model.generate`` for each chunk.
	    GEN_PARAMS = {
	        "min_length": 25,
	        "max_length": 100,
	        "num_beams": 4,
	        "length_penalty": 2.0,
	        "no_repeat_ngram_size": 3,
	        "early_stopping": True,
	    }

	    # A fragment ending in one of these already closes its sentence, so no
	    # extra period is appended to it ("\u2026" is the single-char ellipsis).
	    _SENTENCE_END = (".", "!", "?", "\u2026")

	    def __init__(self, ckpt: str = DEFAULT_CKPT):
	        """Load the tokenizer and the model in eval mode.

	        Args:
	            ckpt: Hugging Face checkpoint id or local path. Defaults to the
	                Spanish BERT2BERT summarization checkpoint.
	        """
	        self.device = torch.device("cpu")

	        print(f"Cargando modelo BERT2BERT: {ckpt}...")
	        self.tokenizer = BertTokenizerFast.from_pretrained(ckpt)
	        # NOTE(review): the loading flags below are kept exactly as the
	        # original pinned them; presumably they work around loading issues
	        # of this shared-weights checkpoint -- confirm before changing.
	        self.model = EncoderDecoderModel.from_pretrained(
	            ckpt,
	            low_cpu_mem_usage=False,
	            use_safetensors=False,
	            torch_dtype=torch.float32,
	        )
	        self.model.eval()
	        print("Modelo cargado correctamente.")

	    def summarize(self, text: str) -> str:
	        """Summarize ``text``, chunking it first so long inputs are handled.

	        Args:
	            text: Input text; newlines are treated as spaces.

	        Returns:
	            The per-chunk summaries joined by single spaces, or an empty
	            string when ``text`` is blank.
	        """
	        text = text.replace("\n", " ").strip()
	        if not text:
	            return ""

	        summaries = []
	        for chunk in self._chunk_text(text, max_tokens=200):
	            # One sequence per call, so no padding is needed; padding every
	            # chunk to 512 tokens only added masked positions for the
	            # encoder to process. Truncation stays as a hard safety net.
	            inputs = self.tokenizer(
	                [chunk],
	                truncation=True,
	                max_length=512,
	                return_tensors="pt",
	            )
	            input_ids = inputs["input_ids"].to(self.device)
	            attention_mask = inputs["attention_mask"].to(self.device)

	            with torch.no_grad():
	                output_ids = self.model.generate(
	                    input_ids=input_ids,
	                    attention_mask=attention_mask,
	                    **self.GEN_PARAMS
	                )

	            piece = self.tokenizer.decode(
	                output_ids[0], skip_special_tokens=True
	            ).strip()
	            if piece:
	                summaries.append(piece)

	        return " ".join(summaries)

	    def _chunk_text(self, text: str, max_tokens: int) -> list:
	        """Split ``text`` into sentence-aligned chunks of bounded token count.

	        Sentences are delimited by ``". "``. Consecutive sentences are packed
	        into one chunk while their token counts sum to at most
	        ``max_tokens``. A single sentence above the budget is broken into
	        word-aligned pieces instead of being emitted whole.

	        Args:
	            text: Text to split.
	            max_tokens: Token budget per chunk, measured with
	                ``self.tokenizer.tokenize``.

	        Returns:
	            A list of chunk strings in original order; empty for blank text.
	        """
	        parts = text.split(". ")
	        last_index = len(parts) - 1

	        chunks = []
	        current = []
	        current_len = 0

	        for index, part in enumerate(parts):
	            sentence = part.strip()
	            if not sentence:
	                continue

	            # ``split`` consumed the period of every fragment but the last,
	            # so restore it there. The last fragment keeps whatever
	            # terminator the text had and only gets a period if it has none;
	            # appending unconditionally produced ".." and "?.".
	            if index < last_index or not sentence.endswith(self._SENTENCE_END):
	                sentence += "."

	            sent_len = len(self.tokenizer.tokenize(sentence))

	            if sent_len > max_tokens:
	                # Flush what is pending so chunk order follows the text.
	                if current:
	                    chunks.append(" ".join(current))
	                    current = []
	                    current_len = 0
	                chunks.extend(self._split_long_sentence(sentence, max_tokens))
	                continue

	            if current_len + sent_len > max_tokens:
	                chunks.append(" ".join(current))
	                current = [sentence]
	                current_len = sent_len
	            else:
	                current.append(sentence)
	                current_len += sent_len

	        if current:
	            chunks.append(" ".join(current))

	        return chunks

	    def _split_long_sentence(self, sentence: str, max_tokens: int) -> list:
	        """Break one oversized sentence into word-aligned pieces.

	        Words are packed greedily using their individual token counts as the
	        budget. A single word that alone exceeds ``max_tokens`` is emitted as
	        its own piece; the tokenizer truncation in ``summarize`` still bounds
	        it.

	        Args:
	            sentence: Sentence whose token count exceeds ``max_tokens``.
	            max_tokens: Token budget per piece.

	        Returns:
	            A list of pieces whose words, in order, are those of ``sentence``.
	        """
	        pieces = []
	        words = []
	        length = 0

	        for word in sentence.split():
	            word_len = len(self.tokenizer.tokenize(word))
	            if words and length + word_len > max_tokens:
	                pieces.append(" ".join(words))
	                words = []
	                length = 0
	            words.append(word)
	            length += word_len

	        if words:
	            pieces.append(" ".join(words))

	        return pieces


	# Build the summarization service once at import time: the constructor loads
	# the tokenizer and model, and the resulting module-level `service` is the
	# object the Gradio callback below calls for every request. The two prints
	# bracket the load so start-up progress is visible in the process output.
	print("Inicializando servicio de resumen...")
	service = SummarizationService()
	print("Servicio listo.")


	def resumir_texto(texto: str) -> str:
	    """Gradio callback: summarize ``texto`` and return the text to display.

	    Args:
	        texto: Raw content of the input textbox; may be empty or ``None``.

	    Returns:
	        The summary on success, a prompt asking for input when ``texto`` is
	        blank, or an ``"Error al resumir: ..."`` message when summarization
	        raises. This function never raises, so the UI always gets a string.
	    """
	    # Function-scope import keeps this block independent of the file's
	    # import header.
	    import logging

	    if not texto or not texto.strip():
	        return "Por favor, introduce un texto para resumir."

	    try:
	        return service.summarize(texto)
	    except Exception as e:
	        # UI boundary: keep the app responsive, but record the traceback so
	        # the failure can be diagnosed from the logs instead of being lost.
	        logging.getLogger(__name__).exception("Summarization failed")
	        return f"Error al resumir: {e}"


	# Gradio UI: a multi-line input box, a single output box and one canned
	# example. The components are built first (input before output, matching the
	# order in which they were originally created) and then wired together.
	_texto_ejemplo = "La inteligencia artificial es un campo de la informática que se centra en crear sistemas inteligentes. Estos sistemas pueden aprender de la experiencia y realizar tareas como reconocimiento de voz y toma de decisiones. El aprendizaje automático permite a las computadoras mejorar su rendimiento a través de la experiencia."

	_caja_entrada = gr.Textbox(
	    lines=10,
	    placeholder="Pega aquí tu texto largo en español...",
	    label="Texto a Resumir",
	)
	_caja_salida = gr.Textbox(label="Resumen")

	iface = gr.Interface(
	    fn=resumir_texto,
	    inputs=_caja_entrada,
	    outputs=_caja_salida,
	    title="📝 Resumidor de Texto (BERT2BERT)",
	    description="Resume textos largos en español usando el modelo BERT2BERT con técnica de micro-chunking.",
	    examples=[[_texto_ejemplo]],
	    flagging_mode="never",
	)

	# Start the web server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    iface.launch()