# Provenance (Hugging Face file view): uploaded by quantumbit
# Commit message: "Upload 41 files"
# Commit: 5ff6b14 (verified)
# File size: 3.54 kB
from docx import Document
from docx.document import Document as _Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from typing import Union, List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os
from zipfile import ZipFile
from lxml import etree
from pathlib import Path
import io
_WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_DOCX_NAMESPACES = {
    "w": _WORD_NS,
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
}
# lxml addresses namespaced attributes in Clark notation: "{namespace-uri}local".
# Concatenating the bare URI with "val" never matches, which made every
# gridSpan lookup silently fall back to 1.
_GRIDSPAN_VAL_ATTR = f"{{{_WORD_NS}}}val"


def _paragraph_text(para) -> str:
    """Concatenate the text of every w:t element found under *para*."""
    text_nodes = para.xpath(".//w:t", namespaces=_DOCX_NAMESPACES)
    return "".join(node.text for node in text_nodes if node.text)


def _cell_text(cell) -> str:
    """Return the non-blank paragraphs of *cell* joined by single spaces."""
    paragraph_texts = (
        _paragraph_text(para)
        for para in cell.xpath(".//w:p", namespaces=_DOCX_NAMESPACES)
    )
    return " ".join(text for text in paragraph_texts if text.strip()).strip()


def _cell_span(cell) -> int:
    """Return the number of grid columns *cell* covers (1 when no w:gridSpan)."""
    # Look only at the cell's own properties; ".//w:gridSpan" would also match
    # the gridSpan of cells belonging to a table nested inside this cell.
    spans = cell.xpath("./w:tcPr/w:gridSpan", namespaces=_DOCX_NAMESPACES)
    return int(spans[0].get(_GRIDSPAN_VAL_ATTR, "1")) if spans else 1


def _split_table_rows(tbl) -> list:
    """Group the non-empty rows of *tbl* into runs sharing one column count.

    Each row is a list of cell strings. A new group is started whenever the
    span-aware column count differs from that of the previous non-empty row.
    """
    sub_tables = []
    current_table = []
    prev_col_count = None
    for row in tbl.xpath("./w:tr", namespaces=_DOCX_NAMESPACES):
        row_texts = []
        col_count = 0
        for cell in row.xpath("./w:tc", namespaces=_DOCX_NAMESPACES):
            row_texts.append(_cell_text(cell))
            col_count += _cell_span(cell)
        # Cell texts are already stripped, so truthiness means "has content".
        if not any(row_texts):
            continue
        column_count_changed = (
            prev_col_count is not None and col_count != prev_col_count
        )
        if column_count_changed and current_table:
            sub_tables.append(current_table)
            current_table = []
        current_table.append(row_texts)
        prev_col_count = col_count
    if current_table:
        sub_tables.append(current_table)
    return sub_tables


def extract_docx(docx_input) -> str:
    """Extract text from a DOCX file, rendering tables as pipe-separated rows.

    The function reads ``word/document.xml`` from the archive and returns one
    string made of blocks separated by blank lines. All table blocks come
    first (in document order), followed by every paragraph that is not inside
    a table. A table whose rows change column count is split into several
    consecutively numbered blocks. Because every ``w:tbl`` element is
    processed, a nested table is emitted as its own block in addition to
    appearing in the text of the cell that contains it.

    Args:
        docx_input: Anything ``zipfile.ZipFile`` accepts as its first
            argument (a path or a binary file-like object).

    Returns:
        The extracted text; an empty string when the document has no text.

    Raises:
        zipfile.BadZipFile: If *docx_input* is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        lxml.etree.XMLSyntaxError: If the document XML is malformed.
    """
    # The context manager closes the archive even if reading the member fails.
    with ZipFile(docx_input) as archive:
        xml_content = archive.read("word/document.xml")

    # DOCX files are typically user-supplied, so do not expand XML entities.
    parser = etree.XMLParser(resolve_entities=False)
    tree = etree.fromstring(xml_content, parser)

    text_blocks = []

    # Tables first, numbered across the whole document.
    table_number = 0
    for tbl in tree.xpath("//w:tbl", namespaces=_DOCX_NAMESPACES):
        for sub_table in _split_table_rows(tbl):
            table_number += 1
            lines = [f"\n--- Table {table_number} ---"]
            lines.extend(" | ".join(row) for row in sub_table)
            text_blocks.append("\n".join(lines) + "\n")

    # Then every paragraph with no w:tbl ancestor; filtering inside XPath
    # avoids building an ancestor list per paragraph and scanning all tables.
    body_paragraphs = tree.xpath(
        "//w:p[not(ancestor::w:tbl)]", namespaces=_DOCX_NAMESPACES
    )
    for para in body_paragraphs:
        para_text = _paragraph_text(para).strip()
        if para_text:
            text_blocks.append(para_text)

    return "\n\n".join(text_blocks)