from openpyxl import load_workbook from openpyxl.drawing.image import Image as OpenPyXLImage from typing import List, Dict, Any from PIL import Image from io import BytesIO import pytesseract import os import pandas as pd def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str: """Extract data from Excel files including text and images.""" if tesseract_cmd: pytesseract.pytesseract.tesseract_cmd = tesseract_cmd try: wb = load_workbook(xlsx_path, data_only=True) except Exception as e: return f"Error loading Excel file: {str(e)}" all_sheets_content: list[str] = [] preview_text: str | None = None any_data_found = False for sheet in wb.worksheets: sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"] # Extract table data has_data = False non_empty_rows = 0 for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True): if row is None or all(cell is None for cell in row): continue # skip completely empty rows has_data = True non_empty_rows += 1 any_data_found = True row_data = [str(cell).strip() if cell is not None else "" for cell in row] joined = " | ".join(row_data) sheet_content.append(joined) if preview_text is None and joined.strip(): preview_text = joined[:15] if not has_data: sheet_content.append("[No data in this sheet]") print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)") else: print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}") # Extract images from the sheet if hasattr(sheet, '_images'): image_count = 0 for img in sheet._images: try: if hasattr(img, '_data'): # if it's a real OpenPyXL Image image_data = img._data() elif hasattr(img, '_ref'): continue # cell ref-only images; ignore else: continue pil_img = Image.open(BytesIO(image_data)) try: ocr_text = pytesseract.image_to_string(pil_img).strip() if ocr_text: sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}") else: sheet_content.append(f"[Image {image_count + 1}]: No text detected") except Exception as ocr_e: sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}") image_count += 1 except Exception as e: sheet_content.append(f"[Image extraction error: {str(e)}]") if image_count == 0: sheet_content.append("[No images found in this sheet]") all_sheets_content.append("\n".join(sheet_content)) # If no data found using openpyxl, try pandas fallback (handles some edge cases better) if not any_data_found: print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…") try: xls = pd.ExcelFile(xlsx_path, engine="openpyxl") pandas_parts = [] extracted_sheets = 0 for sheet_name in xls.sheet_names: df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str) if not df.empty: any_data_found = True header = f"\n=== Sheet: {sheet_name} ===\n" csv_like = df.fillna("").astype(str).to_csv(index=False) pandas_parts.append(header + csv_like) extracted_sheets += 1 if preview_text is None: flat = "".join(csv_like.splitlines()) if flat: preview_text = flat[:15] else: pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]") if pandas_parts: all_sheets_content = pandas_parts print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)") except Exception as pe: # If pandas also fails, keep whatever we had all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]") print(f"❌ XLSX: Pandas fallback failed: {pe}") combined = "\n\n".join(all_sheets_content) # Print a small preview for verification if preview_text is None: # fallback: take from combined text flat_combined = "".join(combined.splitlines()).strip() if flat_combined: preview_text = flat_combined[:15] if preview_text: print(f"🔎 XLSX content preview: {preview_text}") return combined