quantumbit's picture
Upload 39 files
e8051be verified
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as OpenPyXLImage
from typing import List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os
import pandas as pd
def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str:
"""Extract data from Excel files including text and images."""
if tesseract_cmd:
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
try:
wb = load_workbook(xlsx_path, data_only=True)
except Exception as e:
return f"Error loading Excel file: {str(e)}"
all_sheets_content: list[str] = []
preview_text: str | None = None
any_data_found = False
for sheet in wb.worksheets:
sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"]
# Extract table data
has_data = False
non_empty_rows = 0
for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
if row is None or all(cell is None for cell in row):
continue # skip completely empty rows
has_data = True
non_empty_rows += 1
any_data_found = True
row_data = [str(cell).strip() if cell is not None else "" for cell in row]
joined = " | ".join(row_data)
sheet_content.append(joined)
if preview_text is None and joined.strip():
preview_text = joined[:15]
if not has_data:
sheet_content.append("[No data in this sheet]")
print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)")
else:
print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}")
# Extract images from the sheet
if hasattr(sheet, '_images'):
image_count = 0
for img in sheet._images:
try:
if hasattr(img, '_data'): # if it's a real OpenPyXL Image
image_data = img._data()
elif hasattr(img, '_ref'):
continue # cell ref-only images; ignore
else:
continue
pil_img = Image.open(BytesIO(image_data))
try:
ocr_text = pytesseract.image_to_string(pil_img).strip()
if ocr_text:
sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}")
else:
sheet_content.append(f"[Image {image_count + 1}]: No text detected")
except Exception as ocr_e:
sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}")
image_count += 1
except Exception as e:
sheet_content.append(f"[Image extraction error: {str(e)}]")
if image_count == 0:
sheet_content.append("[No images found in this sheet]")
all_sheets_content.append("\n".join(sheet_content))
# If no data found using openpyxl, try pandas fallback (handles some edge cases better)
if not any_data_found:
print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…")
try:
xls = pd.ExcelFile(xlsx_path, engine="openpyxl")
pandas_parts = []
extracted_sheets = 0
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str)
if not df.empty:
any_data_found = True
header = f"\n=== Sheet: {sheet_name} ===\n"
csv_like = df.fillna("").astype(str).to_csv(index=False)
pandas_parts.append(header + csv_like)
extracted_sheets += 1
if preview_text is None:
flat = "".join(csv_like.splitlines())
if flat:
preview_text = flat[:15]
else:
pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]")
if pandas_parts:
all_sheets_content = pandas_parts
print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)")
except Exception as pe:
# If pandas also fails, keep whatever we had
all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]")
print(f"❌ XLSX: Pandas fallback failed: {pe}")
combined = "\n\n".join(all_sheets_content)
# Print a small preview for verification
if preview_text is None:
# fallback: take from combined text
flat_combined = "".join(combined.splitlines()).strip()
if flat_combined:
preview_text = flat_combined[:15]
if preview_text:
print(f"🔎 XLSX content preview: {preview_text}")
return combined