File size: 3,540 Bytes
5ff6b14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from docx import Document
from docx.document import Document as _Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from typing import Union, List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os

from zipfile import ZipFile
from lxml import etree
from pathlib import Path
import io

def extract_docx(docx_input) -> str:
    """Extract plain text from a DOCX file, rendering tables as pipe-separated rows.

    The main document part (``word/document.xml``) is parsed directly. Output
    consists of every table first (in document order), followed by every
    paragraph that is not inside a table. Blocks are separated by a blank line.

    Each ``w:tbl`` is split into "sub-tables" whenever the effective column
    count of a row (the sum of the cells' ``w:gridSpan`` values) differs from
    the previous non-empty row. Every sub-table gets its own
    ``--- Table N ---`` header, numbered consecutively across the document.

    Args:
        docx_input: Anything ``zipfile.ZipFile`` accepts as its first
            argument (a filesystem path or a binary file-like object).

    Returns:
        The extracted text. An empty string if the document has no text.

    Raises:
        zipfile.BadZipFile: If the input is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Close the archive as soon as the XML bytes are in memory.
    with ZipFile(docx_input) as zipf:
        xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    text_blocks = []
    table_index = 0

    for tbl in tree.xpath("//w:tbl", namespaces=ns):
        sub_tables = []
        current_table = []
        prev_col_count = None

        for row in tbl.xpath("./w:tr", namespaces=ns):
            row_texts = []
            col_count = 0

            for cell in row.xpath("./w:tc", namespaces=ns):
                para_texts = (
                    _docx_paragraph_text(para, ns)
                    for para in cell.xpath(".//w:p", namespaces=ns)
                )
                # Blank paragraphs are dropped; the rest are joined by a space.
                cell_text = " ".join(t for t in para_texts if t.strip()).strip()
                row_texts.append(cell_text)
                col_count += _docx_grid_span(cell, ns)

            # Rows with no visible text are skipped entirely and do not
            # influence the column-count comparison.
            if not any(row_texts):
                continue

            if prev_col_count is not None and col_count != prev_col_count:
                # Column layout changed: close the current sub-table.
                if current_table:
                    sub_tables.append(current_table)
                    current_table = []

            current_table.append(row_texts)
            prev_col_count = col_count

        if current_table:
            sub_tables.append(current_table)

        for sub_table in sub_tables:
            table_index += 1
            lines = [f"\n--- Table {table_index} ---"]
            lines.extend(" | ".join(row) for row in sub_table)
            text_blocks.append("\n".join(lines) + "\n")

    # Paragraphs outside any table; the predicate lets the XPath engine do the
    # ancestor check instead of comparing every paragraph with every table.
    for para in tree.xpath("//w:p[not(ancestor::w:tbl)]", namespaces=ns):
        para_text = _docx_paragraph_text(para, ns)
        if para_text.strip():
            text_blocks.append(para_text.strip())

    return "\n\n".join(text_blocks)


def _docx_paragraph_text(para, ns) -> str:
    """Return the concatenated text of every ``w:t`` node under *para*."""
    return "".join(
        node.text for node in para.xpath(".//w:t", namespaces=ns) if node.text
    )


def _docx_grid_span(cell, ns) -> int:
    """Return the number of grid columns *cell* spans (``w:gridSpan``), default 1."""
    # Look only at the cell's own properties so a nested table's gridSpan
    # is never mistaken for this cell's span.
    spans = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns)
    if not spans:
        return 1
    # lxml addresses namespaced attributes in Clark notation: "{uri}local".
    raw = spans[0].get(f"{{{ns['w']}}}val", "1")
    try:
        return max(int(raw), 1)
    except ValueError:
        # Malformed span value: treat the cell as unmerged.
        return 1