extract-photos-from-pdf

Sleeping

File size: 7,616 Bytes

febd231
 
09bc970
febd231
24504c5
 
 
e8cd67b
febd231
24504c5
 
 
 
 
 
 
 
e8cd67b
24504c5
e8cd67b
 
 
febd231
24504c5
 
 
 
 
09bc970
24504c5
febd231
24504c5
febd231
24504c5
febd231
09bc970
24504c5
 
 
 
 
 
 
 
 
 
febd231
24504c5
 
 
 
e8cd67b
24504c5
 
 
 
 
 
 
e8cd67b
24504c5
e8cd67b
 
 
24504c5
 
0c76214
e8cd67b
0c76214
24504c5
e8cd67b
24504c5
0c76214
e8cd67b
 
 
 
0c76214
e8cd67b
 
 
0c76214
24504c5
febd231
e8cd67b
febd231
e8cd67b
 
 
 
 
 
0c76214
24504c5
 
 
 
 
 
 
e8cd67b
 
 
 
 
 
 
 
 
 
 
 
 
 
0c76214
24504c5
 
febd231
24504c5
0c76214
24504c5
09bc970
e8cd67b
 
 
 
 
 
24504c5
 
 
e8cd67b
 
 
0c76214
 
 
 
24504c5
 
0c76214
 
 
 
24504c5
 
09bc970
e8cd67b
0c76214
febd231
24504c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8cd67b
 
 
 
 
24504c5
 
 
93bf38c
 
 
 
 
24504c5
 
0c76214
24504c5
09bc970
24504c5
 
09bc970
 
e8cd67b
24504c5
 
 
0c76214
 
 
24504c5
0c76214
24504c5
e8cd67b
09bc970
febd231
0c76214
 
 
 
 
 
 
 
 
 
 
24504c5
93bf38c

import os
import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path
import zipfile
import shutil
import tempfile
from pathlib import Path
import traceback

def zip_folder(folder_path, output_path):
    """Create a deflate-compressed zip archive of everything under a folder.

    Files are stored under their path relative to ``folder_path``. Entries
    are added in sorted order so the archive layout is deterministic.

    Args:
        folder_path: Directory whose files (recursively) are archived.
        output_path: Destination of the zip file. It may live inside
            ``folder_path``; in that case the archive itself is skipped.

    Returns:
        ``(True, "")`` on success, or ``(False, error_message)`` if anything
        went wrong. The error message is also printed.
    """
    try:
        # Resolved once so the archive can be recognised during the walk.
        archive_real = os.path.realpath(output_path)
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder_path):
                # Sorting in place also fixes the order os.walk descends in.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    # The archive exists on disk as soon as it is opened;
                    # without this guard it would be written into itself.
                    if os.path.realpath(file_path) == archive_real:
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg

# Working directories live under the system temp dir:
#   DIRECTORY        - rendered page images
#   DIRECTORY_OUTPUT - the generated zip archive
BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
DIRECTORY = BASE_DIR.joinpath("image_reference")
DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Ensure both working directories exist at import time.
for directory in DIRECTORIES:
    directory.mkdir(exist_ok=True, parents=True)

# Lower-case file suffixes treated as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']

def get_image_files(directory):
    """List the image entries found directly inside a directory.

    Only the top level of ``directory`` is scanned. An entry qualifies when
    its lower-cased suffix is in ``ALLOWED_EXTENSIONS``.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to scan.

    Returns:
        A list of matching paths as strings; empty if ``directory`` does not
        exist or is not a directory.
    """
    folder = Path(directory)
    if not (folder.exists() and folder.is_dir()):
        return []

    return [
        str(entry)
        for entry in folder.glob('*')
        if entry.suffix.lower() in ALLOWED_EXTENSIONS
    ]

def clear_directory(directory):
    """Remove everything inside a directory while keeping the directory itself.

    Regular files and symlinks are unlinked; sub-directories are removed
    recursively.

    Args:
        directory: Path (``str`` or ``Path``) of the directory to empty.

    Returns:
        ``(True, "")`` when the directory is missing or was processed without
        error, otherwise ``(False, error_message)``. The error message is
        also printed.
    """
    target = Path(directory)
    if not target.exists():
        # Nothing to clear counts as success.
        return True, ""

    try:
        for entry in target.iterdir():
            if entry.is_file() or entry.is_symlink():
                entry.unlink()
                continue
            if entry.is_dir():
                shutil.rmtree(entry)
    except Exception as exc:
        error_msg = f"Failed to clear directory {target}. Reason: {str(exc)}"
        print(error_msg)
        return False, error_msg

    return True, ""

def extract_photos_from_pdf(file_pdf):
    """Render every page of a PDF to a PNG and bundle the PNGs into a zip.

    Both working directories (``DIRECTORY`` and ``DIRECTORY_OUTPUT``) are
    emptied first. Pages are rendered in batches of 10 at 150 DPI and saved
    to ``DIRECTORY`` as ``<page_number>.png``. All files in ``DIRECTORY`` are
    then zipped to ``DIRECTORY_OUTPUT / "all_photos.zip"``.

    Args:
        file_pdf: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path. ``None`` means no file was provided.

    Returns:
        A 3-tuple ``(gallery_items, zip_path, status_message)``:

        * ``gallery_items`` - list of ``(image_path, file_name)`` tuples
          sorted by page number (unsorted if a file name is not numeric);
          empty on failure.
        * ``zip_path`` - path of the zip archive as a string, or ``None`` if
          extraction or zipping failed.
        * ``status_message`` - human-readable success or error text.

        Errors are reported through ``status_message``; no exception is
        raised to the caller.
    """
    # Check if file is provided
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Start from empty working directories so results of a previous run
    # cannot end up in this run's gallery or archive.
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"

    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # The upload may be a plain path or a file wrapper exposing the
        # path through its ``name`` attribute; accept both.
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Rendering in small batches bounds how many page images are held
        # in memory at once.
        batch_size = 10

        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)

            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150  # Adjustable DPI for quality vs size
                )

                # Number files by absolute page number, not batch position.
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        # Get list of extracted images and sort them numerically
        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            # Numeric sort so that page 10 follows page 9, not page 1.
            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
        except Exception as e:
            # Fallback to unsorted if sorting fails
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        # Create zip file of all images
        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)

        if not zip_success:
            # The images are still usable in the gallery without the archive.
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}"
            )

        page_count = len(images_pdf_list)
        return (
            sorted_names,
            str(zip_path),
            f"Successfully extracted {page_count} page{'s' if page_count != 1 else ''} from PDF."
        )

    except Exception as e:
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"

# Create Gradio interface with improved layout and error handling
# Layout: a title row, then a single "PDF Extractor" tab holding the upload
# controls followed by the status / gallery / download outputs.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            # PDF Image Extractor
            Upload a PDF file to extract all pages as images.
            """)

    with gr.Tab("PDF Extractor"):
        # Input side: PDF upload plus the two action buttons.
        with gr.Row():
            with gr.Column(scale=1):
                file_pdf = gr.File(
                    file_types=['.pdf'], 
                    label="Upload PDF file"
                )

                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")

        # Output side. NOTE(review): this column is a child of the Tab, not
        # of the Row above, so it is declared after the input row rather
        # than next to it - confirm that this is the intended layout.
        with gr.Column():
            status = gr.Textbox(
                label="Status", 
                value="Upload a PDF and click 'Extract Images'",
                visible=True
            )
            gallery = gr.Gallery(
                label="Extracted Pages",
                show_label=True,
                elem_id="gallery",
                columns=3,
                object_fit="contain",
                height="auto"
            )
            download_btn = gr.File(
                label="Download All Images (ZIP)",
                visible=True
            )

    # Event handlers
    # The three values returned by extract_photos_from_pdf map, in order,
    # onto gallery, download_btn and status.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[gallery, download_btn, status],
        api_name="extract"
    )

    def clear_outputs():
        """Return reset values for gallery, download file and status text."""
        return [], None, "Cleared. Upload a PDF to begin."

    # Only the three outputs are reset; file_pdf is not in the outputs list,
    # so the uploaded file itself is left in place.
    clear_btn.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[gallery, download_btn, status]
    )

    # Example for demonstration
    # Registered only when the sample file exists relative to the working
    # directory; results are not cached (cache_examples=False).
    example_path = "./examples/sample.pdf"
    if os.path.exists(example_path):
        gr.Examples(
            examples=[[example_path]],
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            cache_examples=False
        )

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()