import gradio as gr
import pdfplumber
import pytesseract
from pdf2image import convert_from_path
import pandas as pd
import shutil

# Probe PATH once at import time: OCR is optional and only used when the
# ``tesseract`` executable can be located.
TESSERACT_AVAILABLE = shutil.which("tesseract") is not None

# Report the outcome of the probe on stdout.
print(
    "Tesseract OCR found, OCR functionality is enabled."
    if TESSERACT_AVAILABLE
    else "Tesseract OCR not found, proceeding without OCR functionality."
)

# Run Tesseract on an image and return its output as a DataFrame
def extract_text_with_ocr(image):
    """Run Tesseract OCR on an image and return the recognised data.

    Args:
        image: Image handed unchanged to ``pytesseract.image_to_data``.

    Returns:
        A ``pandas.DataFrame`` built from the dict returned by
        ``pytesseract.image_to_data``, keeping only the rows whose ``text``
        value is not NA.

    Raises:
        RuntimeError: If no ``tesseract`` executable was found on PATH.
    """
    if TESSERACT_AVAILABLE:
        recognised = pd.DataFrame(pytesseract.image_to_data(image, output_type='dict'))
        # NOTE(review): notna() only rejects missing values (None/NaN); an
        # empty string is not NA and passes this filter, so callers that need
        # non-blank text must filter further.
        has_text = recognised['text'].notna()
        return recognised.loc[has_text]
    raise RuntimeError("Tesseract OCR is not available. Please install Tesseract OCR and ensure it is in the system PATH.")

# Build a rectangular DataFrame from OCR output (one row per OCR block)
def _ocr_rows_to_frame(ocr_data, columns):
    """Convert OCR word data into a DataFrame with one row per ``block_num``.

    The words of each block are joined and re-split on whitespace to form the
    cells of a row. Blocks without any non-blank text are dropped. The frame
    is as wide as the longest row, capped at ``len(columns)``; shorter rows
    are padded with ``None`` and the surplus tokens of longer rows are folded
    into the last cell so no recognised text is discarded.

    Args:
        ocr_data: DataFrame with at least ``block_num`` and ``text`` columns.
        columns: Sequence of column names; the first ``width`` are used.

    Returns:
        A DataFrame, or ``None`` when there is no usable text or no column
        name to assign.
    """
    if ocr_data.empty or not columns:
        return None

    block_texts = ocr_data.groupby('block_num')['text'].apply(
        lambda words: ' '.join(str(word) for word in words)
    )
    # split() discards blank cells; rows left with no tokens carry no data.
    rows = [tokens for tokens in (text.split() for text in block_texts) if tokens]
    if not rows:
        return None

    width = min(len(columns), max(len(tokens) for tokens in rows))
    rectangular = []
    for tokens in rows:
        if len(tokens) > width:
            # Keep the overflow by merging it into the final cell.
            tokens = tokens[:width - 1] + [' '.join(tokens[width - 1:])]
        rectangular.append(tokens + [None] * (width - len(tokens)))
    return pd.DataFrame(rectangular, columns=list(columns)[:width])


# Extract a table from each PDF page, using OCR for pages without one
def extract_table_from_pdf(pdf, columns):
    """Extract one table per page of a PDF and stack them into one DataFrame.

    For every page, ``page.extract_table()`` is tried first; its first row is
    used as the header. When every name in ``columns`` is present in that
    header, only those columns are kept; otherwise the whole table is kept.
    When no table is found and Tesseract is available, the page image is
    OCR'd and each OCR block becomes one row, labelled with the leading names
    from ``columns``.

    Args:
        pdf: Value passed to ``pdfplumber.open``.
        columns: List of column names to keep / assign.

    Returns:
        A DataFrame with the rows of all pages concatenated (index reset), or
        ``None`` if nothing was extracted from any page.
    """
    extracted_data = []

    # A distinct name keeps the ``pdf`` argument from being rebound.
    with pdfplumber.open(pdf) as document:
        for page in document.pages:
            table = page.extract_table()

            if table:
                # Structured table: first row is the header.
                df = pd.DataFrame(table[1:], columns=table[0])
                # Narrow to the requested columns only when all of them exist
                # (an empty request must not select zero columns).
                if columns and all(col in df.columns for col in columns):
                    df = df[columns]
                extracted_data.append(df)
            elif TESSERACT_AVAILABLE:
                # No table found: fall back to OCR on the rendered page.
                image = page.to_image().original
                ocr_df = _ocr_rows_to_frame(extract_text_with_ocr(image), columns)
                if ocr_df is not None:
                    extracted_data.append(ocr_df)

    # Combine all extracted tables into a single DataFrame, or None if empty
    if not extracted_data:
        return None
    return pd.concat(extracted_data, ignore_index=True)

# Process single or multiple PDFs into one workbook that always has a sheet
def process_pdfs(files, columns):
    """Extract tables from PDF files and save them to a single Excel file.

    Each file that yields a non-empty DataFrame gets its own sheet named
    ``PDF_<n>``, where ``n`` is the 1-based position of the file in ``files``.
    If no file yields data, a single ``Placeholder`` sheet containing the
    message "No data found" is written instead, because a workbook needs at
    least one sheet.

    Args:
        files: Iterable of values accepted by ``extract_table_from_pdf``;
            ``None`` is treated as no files.
        columns: Comma-separated column names; ``None`` is treated as "".

    Returns:
        The path of the written workbook ("extracted_tables_combined.xlsx").
    """
    columns = [col.strip() for col in (columns or "").split(",")]
    output_path = "extracted_tables_combined.xlsx"

    # Extract before opening the workbook so we know whether the placeholder
    # is needed. This avoids writing it and deleting it afterwards, which
    # requires sheet-removal calls that only some Excel engines provide.
    sheets = []
    for idx, file in enumerate(files or []):
        df = extract_table_from_pdf(file, columns)
        if df is not None and not df.empty:
            sheets.append((f"PDF_{idx + 1}", df))

    if not sheets:
        placeholder_df = pd.DataFrame([["No data found"]], columns=["Message"])
        sheets.append(("Placeholder", placeholder_df))

    with pd.ExcelWriter(output_path) as writer:
        for sheet_name, df in sheets:
            df.to_excel(writer, sheet_name=sheet_name, index=False)

    return output_path

# Gradio callback: run the extraction and surface failures in the UI
def gradio_interface(files, columns):
    """Run ``process_pdfs`` and map its outcome onto the two UI outputs.

    Args:
        files: Uploaded files, or ``None`` when nothing was uploaded.
        columns: Comma-separated column names entered by the user.

    Returns:
        ``(path, "")`` on success, or ``(None, message)`` when nothing was
        uploaded or processing failed.
    """
    if files is None:
        return None, "No PDF files were uploaded."
    try:
        file_path = process_pdfs(files, columns)
    except RuntimeError as e:
        return None, str(e)
    except Exception as e:
        # UI boundary: report any other failure in the error textbox instead
        # of letting it escape the callback. The exception type is included
        # because some messages (e.g. a bare KeyError key) are unclear alone.
        return None, f"{type(e).__name__}: {e}"
    return file_path, ""

# Gradio UI: PDF uploads and a column list go in; the generated workbook and
# an error message come out.
iface = gr.Interface(
    title="Advanced PDF Table Extractor",
    description="Upload PDFs, specify columns, and download tables in an Excel file. Extracts tables from both text and scanned images if Tesseract OCR is available.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDFs", file_count="multiple"),
        gr.Textbox(
            label="Enter Required Columns (comma-separated, e.g., 'Sl No, Material Description, Quantity')",
        ),
    ],
    outputs=[
        gr.File(label="Download Extracted Tables Excel"),
        gr.Textbox(
            placeholder="Any errors will appear here.",
            label="Error Message",
        ),
    ],
)

# Start the web server for the interface.
iface.launch()