Spaces:
Sleeping
Sleeping
from docx import Document | |
from docx.document import Document as _Document | |
from docx.table import Table | |
from docx.text.paragraph import Paragraph | |
from typing import Union, List, Dict, Any | |
from PIL import Image | |
from io import BytesIO | |
import pytesseract | |
import os | |
from zipfile import ZipFile | |
from lxml import etree | |
from pathlib import Path | |
import io | |
def extract_docx(docx_input) -> str:
    """Extract the text of a DOCX file, rendering tables as pipe-separated rows.

    The main document part (``word/document.xml``) is parsed directly.
    All tables are emitted first, each under a ``--- Table N ---`` header
    with one line per row and cells joined by ``" | "``. A table whose
    rows change grid width (taking ``w:gridSpan`` into account) is split
    into several numbered sub-tables at each change. Paragraphs that are
    not inside any table follow, in document order.

    Note: nested tables are matched by ``//w:tbl`` as tables of their own,
    and their text is also included in the enclosing cell's text.

    Args:
        docx_input: Whatever ``zipfile.ZipFile`` accepts: a path or a
            binary file-like object containing the DOCX archive.

    Returns:
        All extracted blocks joined by blank lines.

    Raises:
        zipfile.BadZipFile: If the input is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Read the XML inside a context manager so the archive handle is
    # released even if the member is missing.
    with ZipFile(docx_input) as zipf:
        xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    text_blocks = []

    # Tables first, numbered consecutively across all sub-tables.
    table_index = 0
    for tbl in tree.xpath("//w:tbl", namespaces=ns):
        for sub_table in _split_table_by_width(tbl, ns):
            header = f"\n--- Table {table_index + 1} ---\n"
            rows = "".join(" | ".join(row) + "\n" for row in sub_table)
            text_blocks.append(header + rows)
            table_index += 1

    # Then every paragraph that has no table ancestor.
    for para in tree.xpath("//w:p", namespaces=ns):
        if para.xpath("ancestor::w:tbl", namespaces=ns):
            continue
        para_text = _paragraph_text(para, ns).strip()
        if para_text:
            text_blocks.append(para_text)

    return "\n\n".join(text_blocks)


def _paragraph_text(para, ns) -> str:
    """Concatenate the text of every ``w:t`` run below a paragraph."""
    return "".join(
        node.text for node in para.xpath(".//w:t", namespaces=ns) if node.text
    )


def _cell_span(cell, ns) -> int:
    """Return how many grid columns a cell occupies (``w:gridSpan``, default 1)."""
    # Only look at the cell's own properties so that a gridSpan belonging
    # to a nested table's cell is not picked up by mistake.
    spans = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns)
    if not spans:
        return 1
    # lxml addresses namespaced attributes in Clark notation: "{uri}local".
    raw = spans[0].get("{%s}val" % ns["w"], "1")
    try:
        return int(raw)
    except ValueError:
        # Malformed value: treat the cell as unmerged rather than failing
        # the whole extraction.
        return 1


def _cell_text(cell, ns) -> str:
    """Join the non-blank paragraphs of a cell with single spaces."""
    parts = []
    for para in cell.xpath(".//w:p", namespaces=ns):
        para_text = _paragraph_text(para, ns)
        if para_text.strip():
            parts.append(para_text + " ")
    return "".join(parts).strip()


def _split_table_by_width(tbl, ns) -> list:
    """Split a table's non-empty rows into runs that share one grid width.

    Returns a list of sub-tables; each sub-table is a list of rows and
    each row is a list of cell strings. Rows whose cells are all blank
    are dropped and do not affect the width comparison.
    """
    sub_tables = []
    current_table = []
    prev_col_count = None
    for row in tbl.xpath("./w:tr", namespaces=ns):
        cells = row.xpath("./w:tc", namespaces=ns)
        row_texts = [_cell_text(cell, ns) for cell in cells]
        if not any(row_texts):
            continue
        col_count = sum(_cell_span(cell, ns) for cell in cells)
        if prev_col_count is not None and col_count != prev_col_count:
            # Grid width changed: close the current run and start a new one.
            if current_table:
                sub_tables.append(current_table)
            current_table = []
        current_table.append(row_texts)
        prev_col_count = col_count
    if current_table:
        sub_tables.append(current_table)
    return sub_tables