Spaces:

quantumbit
/

rag-bajaj

Sleeping

App Files Files Community

rag-bajaj / preprocessing /preprocessing_modules /xlsx_extractor.py

quantumbit

Upload 39 files

e8051be verified 15 days ago

raw

history blame contribute delete

5.07 kB

	from openpyxl import load_workbook
	from openpyxl.drawing.image import Image as OpenPyXLImage
	from typing import List, Dict, Any
	from PIL import Image
	from io import BytesIO
	import pytesseract
	import os
	import pandas as pd

	def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str:
	"""Extract data from Excel files including text and images."""
	if tesseract_cmd:
	pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

	try:
	wb = load_workbook(xlsx_path, data_only=True)
	except Exception as e:
	return f"Error loading Excel file: {str(e)}"

	all_sheets_content: list[str] = []
	preview_text: str \| None = None
	any_data_found = False

	for sheet in wb.worksheets:
	sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"]

	# Extract table data
	has_data = False
	non_empty_rows = 0
	for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
	if row is None or all(cell is None for cell in row):
	continue # skip completely empty rows
	has_data = True
	non_empty_rows += 1
	any_data_found = True
	row_data = [str(cell).strip() if cell is not None else "" for cell in row]
	joined = " \| ".join(row_data)
	sheet_content.append(joined)
	if preview_text is None and joined.strip():
	preview_text = joined[:15]

	if not has_data:
	sheet_content.append("[No data in this sheet]")
	print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)")
	else:
	print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}")

	# Extract images from the sheet
	if hasattr(sheet, '_images'):
	image_count = 0
	for img in sheet._images:
	try:
	if hasattr(img, '_data'): # if it's a real OpenPyXL Image
	image_data = img._data()
	elif hasattr(img, '_ref'):
	continue # cell ref-only images; ignore
	else:
	continue

	pil_img = Image.open(BytesIO(image_data))
	try:
	ocr_text = pytesseract.image_to_string(pil_img).strip()
	if ocr_text:
	sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}")
	else:
	sheet_content.append(f"[Image {image_count + 1}]: No text detected")
	except Exception as ocr_e:
	sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}")

	image_count += 1
	except Exception as e:
	sheet_content.append(f"[Image extraction error: {str(e)}]")

	if image_count == 0:
	sheet_content.append("[No images found in this sheet]")

	all_sheets_content.append("\n".join(sheet_content))

	# If no data found using openpyxl, try pandas fallback (handles some edge cases better)
	if not any_data_found:
	print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…")
	try:
	xls = pd.ExcelFile(xlsx_path, engine="openpyxl")
	pandas_parts = []
	extracted_sheets = 0
	for sheet_name in xls.sheet_names:
	df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str)
	if not df.empty:
	any_data_found = True
	header = f"\n=== Sheet: {sheet_name} ===\n"
	csv_like = df.fillna("").astype(str).to_csv(index=False)
	pandas_parts.append(header + csv_like)
	extracted_sheets += 1
	if preview_text is None:
	flat = "".join(csv_like.splitlines())
	if flat:
	preview_text = flat[:15]
	else:
	pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]")
	if pandas_parts:
	all_sheets_content = pandas_parts
	print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)")
	except Exception as pe:
	# If pandas also fails, keep whatever we had
	all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]")
	print(f"❌ XLSX: Pandas fallback failed: {pe}")

	combined = "\n\n".join(all_sheets_content)

	# Print a small preview for verification
	if preview_text is None:
	# fallback: take from combined text
	flat_combined = "".join(combined.splitlines()).strip()
	if flat_combined:
	preview_text = flat_combined[:15]
	if preview_text:
	print(f"🔎 XLSX content preview: {preview_text}")

	return combined