File size: 7,616 Bytes
febd231
 
09bc970
febd231
24504c5
 
 
e8cd67b
febd231
24504c5
 
 
 
 
 
 
 
e8cd67b
24504c5
e8cd67b
 
 
febd231
24504c5
 
 
 
 
09bc970
24504c5
febd231
24504c5
febd231
24504c5
febd231
09bc970
24504c5
 
 
 
 
 
 
 
 
 
febd231
24504c5
 
 
 
e8cd67b
24504c5
 
 
 
 
 
 
e8cd67b
24504c5
e8cd67b
 
 
24504c5
 
0c76214
e8cd67b
0c76214
24504c5
e8cd67b
24504c5
0c76214
e8cd67b
 
 
 
0c76214
e8cd67b
 
 
0c76214
24504c5
febd231
e8cd67b
febd231
e8cd67b
 
 
 
 
 
0c76214
24504c5
 
 
 
 
 
 
e8cd67b
 
 
 
 
 
 
 
 
 
 
 
 
 
0c76214
24504c5
 
febd231
24504c5
0c76214
24504c5
09bc970
e8cd67b
 
 
 
 
 
24504c5
 
 
e8cd67b
 
 
0c76214
 
 
 
24504c5
 
0c76214
 
 
 
24504c5
 
09bc970
e8cd67b
0c76214
febd231
24504c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8cd67b
 
 
 
 
24504c5
 
 
93bf38c
 
 
 
 
24504c5
 
0c76214
24504c5
09bc970
24504c5
 
09bc970
 
e8cd67b
24504c5
 
 
0c76214
 
 
24504c5
0c76214
24504c5
e8cd67b
09bc970
febd231
0c76214
 
 
 
 
 
 
 
 
 
 
24504c5
93bf38c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import os
import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path
import zipfile
import shutil
import tempfile
from pathlib import Path
import traceback

def zip_folder(folder_path, output_path):
    """Create a deflate-compressed zip archive of every file under a folder.

    Files are stored under their path relative to ``folder_path``. Only files
    are written, so empty sub-directories do not appear in the archive.

    Args:
        folder_path: Directory to archive; it is walked recursively.
        output_path: Path of the zip file to create. It may be located inside
            ``folder_path``; the archive is then excluded from its own
            contents.

    Returns:
        A ``(success, error_message)`` tuple. ``error_message`` is ``""`` on
        success; on failure it describes the problem and is also printed.
    """
    if not os.path.isdir(folder_path):
        # os.walk() yields nothing for a missing path, which would otherwise
        # produce an empty archive and report success.
        error_msg = f"Error creating zip file: {folder_path} is not a directory"
        print(error_msg)
        return False, error_msg

    try:
        # Resolve once so the archive can be recognised while walking.
        archive_real_path = os.path.realpath(output_path)

        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    if os.path.realpath(file_path) == archive_real_path:
                        # Never add the half-written archive to itself.
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg

# Working directories live under the system temp directory (pathlib paths)
BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
DIRECTORY = BASE_DIR.joinpath("image_reference")
DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Make sure each working directory exists at import time
for directory in DIRECTORIES:
    directory.mkdir(exist_ok=True, parents=True)

# File suffixes recognised as images
ALLOWED_EXTENSIONS = [".jpg", ".jpeg", ".png", ".gif"]

def get_image_files(directory):
    """List the image files located directly inside a directory.

    Entries are matched by comparing their lower-cased suffix against
    ``ALLOWED_EXTENSIONS``; sub-directories are not searched.

    Args:
        directory: Path (string or ``Path``) of the directory to scan.

    Returns:
        A list of matching paths as strings, in the order ``glob`` yields
        them. The list is empty when ``directory`` does not exist or is not
        a directory.
    """
    folder = Path(directory)
    if not (folder.exists() and folder.is_dir()):
        return []

    return [
        str(entry)
        for entry in folder.glob('*')
        if entry.suffix.lower() in ALLOWED_EXTENSIONS
    ]

def clear_directory(directory):
    """Delete everything inside a directory while keeping the directory.

    Files and symlinks are unlinked; real sub-directories are removed
    recursively. A directory that does not exist counts as already cleared.

    Args:
        directory: Path (string or ``Path``) of the directory to empty.

    Returns:
        A ``(success, error_message)`` tuple. ``error_message`` is ``""`` on
        success; on failure it describes the problem and is also printed.
    """
    target = Path(directory)
    if not target.exists():
        return True, ""

    try:
        for entry in target.iterdir():
            if entry.is_file() or entry.is_symlink():
                entry.unlink()
                continue
            if entry.is_dir():
                shutil.rmtree(entry)
    except Exception as e:
        error_msg = f"Failed to clear directory {target}. Reason: {str(e)}"
        print(error_msg)
        return False, error_msg

    return True, ""

def extract_photos_from_pdf(file_pdf):
    """Render every page of an uploaded PDF to a PNG and bundle them in a zip.

    Both working directories are emptied first, so the results of a previous
    extraction are discarded. Pages are rendered in batches and saved to
    ``DIRECTORY`` as ``<page number>.png``; the archive is written to
    ``DIRECTORY_OUTPUT / "all_photos.zip"``.

    Args:
        file_pdf: The uploaded file. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(gallery_items, zip_path, status)`` tuple. ``gallery_items`` is a
        list of ``(image_path, file_name)`` pairs sorted by page number
        (empty on failure). ``zip_path`` is the archive path as a string, or
        ``None`` when no archive is available. ``status`` is a human-readable
        message; failures are reported through it rather than raised.
    """
    # Check if file is provided
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Clear directories for new extraction
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"

    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # Accept a plain path as well as an object carrying the path in `.name`
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Render a few pages at a time so only one batch of images is held
        # in memory at once
        batch_size = 10

        # Process PDF in batches (page numbers are 1-based and inclusive)
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)

            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150  # Adjustable DPI for quality vs size
                )

                # Name each image after its absolute page number
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        # Get list of extracted images and sort them numerically
        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            # Numeric sort so that page 10 follows page 9 instead of page 1
            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
        except Exception as e:
            # Fallback to unsorted if sorting fails
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        # Create zip file of all images
        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)

        if not zip_success:
            # The images are still usable in the gallery without the archive
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}"
            )

        page_count = len(images_pdf_list)
        return (
            sorted_names,
            str(zip_path),
            f"Successfully extracted {page_count} page{'s' if page_count != 1 else ''} from PDF."
        )

    except Exception as e:
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"

# Create Gradio interface with improved layout and error handling
# Builds the module-level `demo` app: a header, an upload/controls area,
# and a results area (status text, page gallery, zip download).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            # PDF Image Extractor
            Upload a PDF file to extract all pages as images.
            """)
    
    with gr.Tab("PDF Extractor"):
        # Input area: PDF upload plus the two action buttons
        with gr.Row():
            with gr.Column(scale=1):
                file_pdf = gr.File(
                    file_types=['.pdf'], 
                    label="Upload PDF file"
                )
                
                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")
        
        # Results area.
        # NOTE(review): this Column is a sibling of the Row above (nested
        # directly under the Tab), not a second column inside that Row —
        # confirm this is the intended layout.
        with gr.Column():
            status = gr.Textbox(
                label="Status", 
                value="Upload a PDF and click 'Extract Images'",
                visible=True
            )
            gallery = gr.Gallery(
                label="Extracted Pages",
                show_label=True,
                elem_id="gallery",
                columns=3,
                object_fit="contain",
                height="auto"
            )
            download_btn = gr.File(
                label="Download All Images (ZIP)",
                visible=True
            )
    
    # Event handlers
    # The three values returned by extract_photos_from_pdf map, in order,
    # onto the gallery, the download file and the status box.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[gallery, download_btn, status],
        api_name="extract"
    )
    
    def clear_outputs():
        """Return reset values for the gallery, the download file and the status box.

        Only these three output values are produced; the uploaded file input
        and any files already written to disk are not touched here.
        """
        return [], None, "Cleared. Upload a PDF to begin."
    
    clear_btn.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[gallery, download_btn, status]
    )

    # Example for demonstration
    # Registered only when the sample file exists; the relative path is
    # resolved against the current working directory.
    example_path = "./examples/sample.pdf"
    if os.path.exists(example_path):
        gr.Examples(
            examples=[[example_path]],
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            cache_examples=False
        )

# Launch the Gradio app only when this file is run as a script, not on import
if __name__ == "__main__":
    demo.launch()