Spaces:
Sleeping
Sleeping
File size: 3,589 Bytes
d697844 e639253 f22332c 4c5eb78 4d7fa61 d697844 f22332c d697844 f22332c d697844 4c5eb78 d697844 f22332c d697844 f22332c d697844 e639253 d697844 e639253 d697844 4c5eb78 d697844 4c5eb78 e639253 4c5eb78 f22332c d697844 e639253 f22332c 4c5eb78 f22332c e639253 f22332c e639253 4c5eb78 e639253 4c5eb78 e639253 f22332c e639253 d697844 e639253 d697844 4c5eb78 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import gradio as gr
from docx import Document
import os
import shutil
import zipfile
import tempfile
def split_by_headers(file_path, headers_per_chunk=1):
    """Split a DOCX file into chunks holding a fixed number of headings each.

    Every paragraph is copied as plain text plus its style name. A new chunk
    is started at the heading that would push the current chunk past
    ``headers_per_chunk`` headings. Paragraphs that appear before the first
    heading end up in the first chunk.

    Args:
        file_path: Location of the DOCX file, handed to ``Document``.
        headers_per_chunk: Maximum number of heading paragraphs per chunk.

    Returns:
        A list of ``Document`` objects in source order. Chunks without any
        paragraph are never included.
    """
    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    header_count = 0

    for paragraph in source.paragraphs:
        style_name = paragraph.style.name
        if style_name.startswith('Heading'):
            header_count += 1
            if header_count > headers_per_chunk:
                # Only flush chunks that actually collected something, so a
                # non-positive headers_per_chunk cannot emit an empty part.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                current_chunk = Document()
                header_count = 1

        # Add the text first and assign the style separately: the style is
        # looked up by name in the *new* document, which only knows the
        # styles of the default template. A custom style from the source
        # file would otherwise abort the whole split with a KeyError.
        copied = current_chunk.add_paragraph(paragraph.text)
        try:
            copied.style = style_name
        except KeyError:
            # Unknown style name in the target document: keep the default
            # paragraph style rather than failing.
            pass

    if current_chunk.paragraphs:
        chunks.append(current_chunk)
    return chunks
def split_by_pages(file_path, pages_per_chunk=1, estimated_chars_per_page=3000):
    """Split a DOCX file into chunks of an estimated number of pages.

    Pages are not read from the file; they are approximated by counting the
    characters of the paragraph text. Each time the running count reaches
    ``estimated_chars_per_page`` one page is counted and the running count is
    reset. Once ``pages_per_chunk`` pages have been counted, the current chunk
    is closed and the paragraph that crossed the threshold opens the next one.

    Args:
        file_path: Location of the DOCX file, handed to ``Document``.
        pages_per_chunk: Number of estimated pages per chunk.
        estimated_chars_per_page: Characters assumed to fill one page.

    Returns:
        A list of ``Document`` objects in source order. Chunks without any
        paragraph are never included.
    """
    source = Document(file_path)
    chunks = []
    current_chunk = Document()
    page_count = 0
    char_count = 0

    for paragraph in source.paragraphs:
        text = paragraph.text
        char_count += len(text)
        if char_count >= estimated_chars_per_page:
            page_count += 1
            char_count = 0
            if page_count >= pages_per_chunk:
                # A single long paragraph can reach the threshold while the
                # current chunk is still empty; do not emit an empty part.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                current_chunk = Document()
                page_count = 0

        # Add the text first and assign the style separately: the style is
        # looked up by name in the *new* document, which only knows the
        # styles of the default template. A custom style from the source
        # file would otherwise abort the whole split with a KeyError.
        copied = current_chunk.add_paragraph(text)
        try:
            copied.style = paragraph.style.name
        except KeyError:
            # Unknown style name in the target document: keep the default
            # paragraph style rather than failing.
            pass

    if current_chunk.paragraphs:
        chunks.append(current_chunk)
    return chunks
def save_chunks(chunks, original_filename):
    """Write every chunk to its own ``.docx`` file and return the paths.

    Files are named ``<base>_part<i>.docx`` (``i`` starting at 1), where
    ``<base>`` is ``original_filename`` without directory and extension.

    Args:
        chunks: Iterable of objects exposing ``save(path)``.
        original_filename: Name of the uploaded file; only its base name is
            used to build the part names.

    Returns:
        List of the written file paths, in chunk order.
    """
    # basename() keeps directory components of a path-like argument from
    # leaking into the output location.
    base_name = os.path.splitext(os.path.basename(original_filename))[0]

    # A fresh directory per call: writing straight into the shared temp
    # directory lets two requests for equally named uploads overwrite each
    # other's parts. The directory is not removed here because the caller
    # still needs the files after this function returns.
    output_dir = tempfile.mkdtemp(prefix="docx_parts_")

    saved_files = []
    for index, chunk in enumerate(chunks, 1):
        part_path = os.path.join(output_dir, f"{base_name}_part{index}.docx")
        chunk.save(part_path)
        saved_files.append(part_path)
    return saved_files
def zip_files(files):
    """Bundle the given files into a ZIP archive and return its path.

    Each file is stored under its base name, so the archive has no
    directory structure.

    Args:
        files: Iterable of paths to existing files.

    Returns:
        Path of the created archive, always named ``document_parts.zip``.
    """
    # A fresh directory per call: a fixed path in the shared temp directory
    # means every request overwrites the archive produced by the previous
    # one. The directory is not removed here because the caller still needs
    # the archive after this function returns.
    archive_dir = tempfile.mkdtemp(prefix="docx_zip_")
    zip_path = os.path.join(archive_dir, "document_parts.zip")

    with zipfile.ZipFile(zip_path, 'w') as archive:
        for path in files:
            archive.write(path, os.path.basename(path))
    return zip_path
def process_document(file, split_type, headers_or_pages, download_type):
    """Gradio callback: split the uploaded DOCX and return the download(s).

    Args:
        file: Uploaded file as delivered by the ``gr.File`` input; either an
            object with a ``name`` attribute holding the path, or the path
            itself.
        split_type: ``"Encabezados"`` splits by headings; any other value
            splits by estimated pages.
        headers_or_pages: Headings or estimated pages per chunk (>= 1).
        download_type: ``"ZIP"`` returns one archive; any other value returns
            the individual part files.

    Returns:
        The path of the ZIP archive, or a list with the paths of the parts.

    Raises:
        gr.Error: On missing/invalid input or when processing fails, so the
            message is shown in the UI.
    """
    # The output component is a File: a returned string is interpreted as a
    # file path, so user-facing messages must be raised instead of returned.
    if file is None:
        raise gr.Error("Por favor, seleccione un archivo DOCX.")
    # The number box can be cleared, which delivers None; comparing None
    # with an int would raise a TypeError.
    if headers_or_pages is None or headers_or_pages < 1:
        raise gr.Error("Por favor, especifique un número positivo de encabezados o páginas por fragmento.")

    # Accept both an upload object exposing ``.name`` and a plain path.
    source_path = getattr(file, "name", file)

    try:
        if split_type == "Encabezados":
            chunks = split_by_headers(source_path, headers_or_pages)
        else:
            chunks = split_by_pages(source_path, headers_or_pages)
        saved_files = save_chunks(chunks, os.path.basename(source_path))
        if download_type == "ZIP":
            return zip_files(saved_files)
        # Plain paths rather than a list of gr.File components: a File
        # output expects a path or a list of paths as its value.
        return saved_files
    except Exception as e:
        # Top-level UI boundary: surface any failure as a visible error.
        raise gr.Error(f"Error al procesar el documento: {str(e)}") from e
# Gradio UI: one DOCX upload, the split method, the chunk size and the
# download format go in; the resulting file(s) come out.
iface = gr.Interface(
    fn=process_document,
    inputs=[
        gr.File(label="Seleccione el archivo DOCX"),
        # Explicit defaults so the callback never receives None for the
        # radios. They mirror the branches process_document falls into when
        # nothing is selected (page split, separate files).
        gr.Radio(["Encabezados", "Páginas"], value="Páginas", label="Método de división"),
        # precision=0 makes the component deliver an integer, not a float.
        gr.Number(value=1, label="Número de encabezados/páginas por fragmento", minimum=1, precision=0),
        gr.Radio(["Separados", "ZIP"], value="Separados", label="Tipo de descarga"),
    ],
    outputs=gr.File(label="Descargar archivo(s)"),
    title="Divisor de Documentos DOCX",
    description="Divida documentos DOCX por encabezados o páginas estimadas y descárguelos como archivos separados o en un archivo ZIP.",
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()