Spaces:
Sleeping
Sleeping
from openpyxl import load_workbook | |
from openpyxl.drawing.image import Image as OpenPyXLImage | |
from typing import List, Dict, Any | |
from PIL import Image | |
from io import BytesIO | |
import pytesseract | |
import os | |
import pandas as pd | |
def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str: | |
"""Extract data from Excel files including text and images.""" | |
if tesseract_cmd: | |
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd | |
try: | |
wb = load_workbook(xlsx_path, data_only=True) | |
except Exception as e: | |
return f"Error loading Excel file: {str(e)}" | |
all_sheets_content: list[str] = [] | |
preview_text: str | None = None | |
any_data_found = False | |
for sheet in wb.worksheets: | |
sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"] | |
# Extract table data | |
has_data = False | |
non_empty_rows = 0 | |
for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True): | |
if row is None or all(cell is None for cell in row): | |
continue # skip completely empty rows | |
has_data = True | |
non_empty_rows += 1 | |
any_data_found = True | |
row_data = [str(cell).strip() if cell is not None else "" for cell in row] | |
joined = " | ".join(row_data) | |
sheet_content.append(joined) | |
if preview_text is None and joined.strip(): | |
preview_text = joined[:15] | |
if not has_data: | |
sheet_content.append("[No data in this sheet]") | |
print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)") | |
else: | |
print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}") | |
# Extract images from the sheet | |
if hasattr(sheet, '_images'): | |
image_count = 0 | |
for img in sheet._images: | |
try: | |
if hasattr(img, '_data'): # if it's a real OpenPyXL Image | |
image_data = img._data() | |
elif hasattr(img, '_ref'): | |
continue # cell ref-only images; ignore | |
else: | |
continue | |
pil_img = Image.open(BytesIO(image_data)) | |
try: | |
ocr_text = pytesseract.image_to_string(pil_img).strip() | |
if ocr_text: | |
sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}") | |
else: | |
sheet_content.append(f"[Image {image_count + 1}]: No text detected") | |
except Exception as ocr_e: | |
sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}") | |
image_count += 1 | |
except Exception as e: | |
sheet_content.append(f"[Image extraction error: {str(e)}]") | |
if image_count == 0: | |
sheet_content.append("[No images found in this sheet]") | |
all_sheets_content.append("\n".join(sheet_content)) | |
# If no data found using openpyxl, try pandas fallback (handles some edge cases better) | |
if not any_data_found: | |
print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…") | |
try: | |
xls = pd.ExcelFile(xlsx_path, engine="openpyxl") | |
pandas_parts = [] | |
extracted_sheets = 0 | |
for sheet_name in xls.sheet_names: | |
df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str) | |
if not df.empty: | |
any_data_found = True | |
header = f"\n=== Sheet: {sheet_name} ===\n" | |
csv_like = df.fillna("").astype(str).to_csv(index=False) | |
pandas_parts.append(header + csv_like) | |
extracted_sheets += 1 | |
if preview_text is None: | |
flat = "".join(csv_like.splitlines()) | |
if flat: | |
preview_text = flat[:15] | |
else: | |
pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]") | |
if pandas_parts: | |
all_sheets_content = pandas_parts | |
print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)") | |
except Exception as pe: | |
# If pandas also fails, keep whatever we had | |
all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]") | |
print(f"❌ XLSX: Pandas fallback failed: {pe}") | |
combined = "\n\n".join(all_sheets_content) | |
# Print a small preview for verification | |
if preview_text is None: | |
# fallback: take from combined text | |
flat_combined = "".join(combined.splitlines()).strip() | |
if flat_combined: | |
preview_text = flat_combined[:15] | |
if preview_text: | |
print(f"🔎 XLSX content preview: {preview_text}") | |
return combined | |