File size: 3,589 Bytes
d697844
 
 
e639253
f22332c
4c5eb78
4d7fa61
d697844
 
 
 
 
 
f22332c
 
 
 
 
 
 
 
d697844
f22332c
d697844
 
 
 
 
 
 
 
 
4c5eb78
d697844
 
f22332c
 
 
 
 
 
 
 
 
 
 
 
d697844
f22332c
d697844
 
 
 
e639253
d697844
e639253
d697844
 
4c5eb78
 
 
d697844
 
 
4c5eb78
 
 
e639253
 
4c5eb78
f22332c
 
d697844
e639253
f22332c
 
 
 
4c5eb78
f22332c
 
e639253
f22332c
 
e639253
4c5eb78
e639253
4c5eb78
e639253
f22332c
e639253
d697844
e639253
 
 
 
 
 
 
 
 
 
 
 
d697844
 
4c5eb78
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from docx import Document
import os
import shutil
import zipfile
import tempfile

def split_by_headers(file_path, headers_per_chunk=1):
    """Split a DOCX file into chunks of at most ``headers_per_chunk`` headings.

    Paragraphs are copied one by one into fresh documents. A new chunk is
    started each time a heading would push the current chunk past
    ``headers_per_chunk`` headings. Only the paragraph text and the paragraph
    style name are carried over; run-level formatting, tables and images are
    not copied.

    Args:
        file_path: Path of the DOCX file to read.
        headers_per_chunk: Maximum number of heading paragraphs per chunk.

    Returns:
        A list of ``Document`` objects, one per chunk, in reading order.
        Chunks without any paragraph are never included.
    """
    doc = Document(file_path)
    chunks = []
    current_chunk = Document()
    header_count = 0

    for paragraph in doc.paragraphs:
        style_name = paragraph.style.name
        if style_name.startswith('Heading'):
            header_count += 1
            if header_count > headers_per_chunk:
                # Close the running chunk, but never emit an empty document.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                current_chunk = Document()
                # The heading that triggered the split opens the new chunk.
                header_count = 1

        # A fresh Document() only knows the styles of the default template.
        # Asking for a style it lacks (e.g. a custom one defined in the
        # source file) raises KeyError, so fall back to the default
        # paragraph style instead of aborting the whole split.
        if style_name in current_chunk.styles:
            current_chunk.add_paragraph(paragraph.text, style=style_name)
        else:
            current_chunk.add_paragraph(paragraph.text)

    if current_chunk.paragraphs:
        chunks.append(current_chunk)

    return chunks

def split_by_pages(file_path, pages_per_chunk=1, estimated_chars_per_page=3000):
    """Split a DOCX file into chunks of roughly ``pages_per_chunk`` pages.

    Real page breaks are not inspected. A "page" is counted every time the
    accumulated paragraph text reaches ``estimated_chars_per_page``
    characters, after which the character counter restarts at zero. The
    paragraph that completes the last page of a chunk is written to the
    following chunk. Only the paragraph text and the paragraph style name
    are carried over.

    Args:
        file_path: Path of the DOCX file to read.
        pages_per_chunk: Number of estimated pages per chunk.
        estimated_chars_per_page: Characters treated as one page.

    Returns:
        A list of ``Document`` objects, one per chunk, in reading order.
        Chunks without any paragraph are never included.
    """
    doc = Document(file_path)
    chunks = []
    current_chunk = Document()
    page_count = 0
    char_count = 0

    for paragraph in doc.paragraphs:
        text = paragraph.text
        char_count += len(text)

        if char_count >= estimated_chars_per_page:
            page_count += 1
            char_count = 0
            if page_count >= pages_per_chunk:
                # A very long first paragraph can hit the threshold before
                # anything was written; do not emit an empty document then.
                if current_chunk.paragraphs:
                    chunks.append(current_chunk)
                current_chunk = Document()
                page_count = 0

        # A fresh Document() only knows the styles of the default template.
        # Asking for a style it lacks raises KeyError, so fall back to the
        # default paragraph style instead of aborting the whole split.
        style_name = paragraph.style.name
        if style_name in current_chunk.styles:
            current_chunk.add_paragraph(text, style=style_name)
        else:
            current_chunk.add_paragraph(text)

    if current_chunk.paragraphs:
        chunks.append(current_chunk)

    return chunks

def save_chunks(chunks, original_filename):
    """Write every chunk to its own ``.docx`` file and return the paths.

    Files are named ``<base>_part<N>.docx`` (N starting at 1), where
    ``<base>`` is ``original_filename`` without directory and extension.
    Each call writes into a newly created temporary directory, so repeated
    or concurrent calls with the same file name cannot overwrite each
    other's output.

    Args:
        chunks: Iterable of objects exposing ``save(path)``.
        original_filename: Name (or path) of the source document.

    Returns:
        List of the written file paths, in chunk order.
    """
    # Drop any directory part so the name is always a plain file name.
    base_name = os.path.splitext(os.path.basename(original_filename))[0]
    # mkdtemp creates a unique directory below the system temp dir.
    output_dir = tempfile.mkdtemp(prefix="docx_split_")

    saved_files = []
    for i, chunk in enumerate(chunks, 1):
        temp_path = os.path.join(output_dir, f"{base_name}_part{i}.docx")
        chunk.save(temp_path)
        saved_files.append(temp_path)

    return saved_files

def zip_files(files):
    """Bundle the given files into a ZIP archive and return its path.

    Every file is stored under its base name (directories are dropped). The
    archive is always called ``document_parts.zip`` but is created inside a
    new temporary directory on each call, so concurrent calls do not
    overwrite each other's archive.

    Args:
        files: Iterable of paths of the files to add.

    Returns:
        Path of the created ZIP archive.
    """
    # mkdtemp creates a unique directory below the system temp dir.
    zip_dir = tempfile.mkdtemp(prefix="docx_zip_")
    zip_path = os.path.join(zip_dir, "document_parts.zip")
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for path in files:
            zipf.write(path, os.path.basename(path))
    return zip_path

def process_document(file, split_type, headers_or_pages, download_type):
    """Gradio handler: split the uploaded DOCX and return the download(s).

    Args:
        file: The uploaded file, either an object exposing the path as
            ``.name`` or a plain path string.
        split_type: ``"Encabezados"`` splits by headings; any other value
            splits by estimated pages.
        headers_or_pages: Headings or pages per chunk; must be at least 1.
            Fractional values are truncated to an integer.
        download_type: ``"ZIP"`` returns one archive; any other value
            returns the individual files.

    Returns:
        The path of the ZIP archive, or a list with the paths of the
        individual DOCX files.

    Raises:
        gr.Error: If the input is invalid or the document cannot be
            processed. The output component is a file component, so an
            error message cannot be returned as a value; raising lets the
            UI show the message to the user.
    """
    # A cleared number field arrives as None; reject it with the bad values.
    if headers_or_pages is None or headers_or_pages < 1:
        raise gr.Error("Por favor, especifique un número positivo de encabezados o páginas por fragmento.")
    if file is None:
        raise gr.Error("Por favor, seleccione un archivo DOCX.")

    # Works for upload wrappers carrying the path in .name and for plain
    # path strings alike.
    file_path = getattr(file, "name", file)
    per_chunk = int(headers_or_pages)

    try:
        if split_type == "Encabezados":
            chunks = split_by_headers(file_path, per_chunk)
        else:
            chunks = split_by_pages(file_path, per_chunk)

        saved_files = save_chunks(chunks, os.path.basename(file_path))

        if download_type == "ZIP":
            return zip_files(saved_files)
        # Plain paths: the single file output accepts a list of them.
        return saved_files
    except Exception as e:
        raise gr.Error(f"Error al procesar el documento: {e}") from e

# Web UI definition. The radio defaults below make visible what
# process_document already does when nothing is selected (page-based split,
# separate files), so the form state always matches the actual behaviour.
iface = gr.Interface(
    fn=process_document,
    inputs=[
        # Only offer .docx files in the upload dialog.
        gr.File(label="Seleccione el archivo DOCX", file_types=[".docx"]),
        gr.Radio(["Encabezados", "Páginas"], value="Páginas", label="Método de división"),
        # precision=0 delivers the count as an integer instead of a float.
        gr.Number(value=1, label="Número de encabezados/páginas por fragmento", minimum=1, precision=0),
        gr.Radio(["Separados", "ZIP"], value="Separados", label="Tipo de descarga"),
    ],
    outputs=gr.File(label="Descargar archivo(s)"),
    title="Divisor de Documentos DOCX",
    description="Divida documentos DOCX por encabezados o páginas estimadas y descárguelos como archivos separados o en un archivo ZIP."
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()