File size: 3,423 Bytes
05e6f93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pypdf
from pdf2image import convert_from_path
import os
import tempfile
import shutil


class PDFOptimizer(object):
    """Split a PDF into one file per page, as single-page PDFs or JPEG images."""

    def __init__(self):
        pass

    def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False, dpi=300):
        """Split the PDF at ``file_path`` into one output file per page.

        The per-page files are written to a freshly created temporary
        directory. The caller owns that directory and is responsible for
        removing it (e.g. with ``shutil.rmtree``) once done with the files.

        Args:
            file_path: Path of the PDF to split.
            debug_dir: Optional directory that receives an extra copy of
                every page. It is created if it does not exist yet.
            convert_to_images: When false (default) each page is written as a
                single-page PDF named ``page_<n>.pdf``. When true each page is
                rendered to ``<basename>_page_<n>.jpg``.
            dpi: Rendering resolution passed to ``convert_from_path``; only
                used when ``convert_to_images`` is true.

        Returns:
            A tuple ``(number_of_pages, output_files, temp_dir)`` where
            ``output_files`` lists the per-page paths in page order.

        Raises:
            Whatever opening, parsing, rendering or writing raises (for
            example ``FileNotFoundError`` for a missing input). In that case
            the temporary directory is removed before the error propagates.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            if convert_to_images:
                output_files = self._render_pages_as_images(file_path, temp_dir, debug_dir, dpi)
            else:
                output_files = self._split_into_single_page_pdfs(file_path, temp_dir, debug_dir)
        except BaseException:
            # The caller never receives temp_dir on failure, so nobody else
            # could clean it up; remove it here and let the error propagate.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        return len(output_files), output_files, temp_dir

    def _split_into_single_page_pdfs(self, file_path, temp_dir, debug_dir):
        """Write every page of the PDF as its own PDF file; return the paths."""
        output_files = []

        with open(file_path, 'rb') as pdf_file:
            reader = pypdf.PdfReader(pdf_file)

            if debug_dir:
                # Create the debug folder only once the input is known to be readable.
                os.makedirs(debug_dir, exist_ok=True)

            for page_number, page in enumerate(reader.pages, start=1):
                writer = pypdf.PdfWriter()
                writer.add_page(page)

                page_name = f'page_{page_number}.pdf'
                output_filename = os.path.join(temp_dir, page_name)
                with open(output_filename, 'wb') as output_file:
                    writer.write(output_file)
                output_files.append(output_filename)

                if debug_dir:
                    # Copy the file just written instead of serialising the page twice.
                    shutil.copyfile(output_filename, os.path.join(debug_dir, page_name))

        return output_files

    def _render_pages_as_images(self, file_path, temp_dir, debug_dir, dpi):
        """Render every page of the PDF to a JPEG file; return the paths."""
        # NOTE: convert_from_path returns all rendered pages at once, so the
        # whole document is held in memory while the files are written.
        images = convert_from_path(file_path, dpi=dpi)
        base_name = os.path.splitext(os.path.basename(file_path))[0]
        output_files = []

        if debug_dir:
            os.makedirs(debug_dir, exist_ok=True)

        for page_number, image in enumerate(images, start=1):
            output_filename = os.path.join(temp_dir, f'{base_name}_page_{page_number}.jpg')
            image.save(output_filename, 'JPEG')
            output_files.append(output_filename)

            if debug_dir:
                debug_output_filename = os.path.join(
                    debug_dir, f'{base_name}_page_{page_number}_debug.jpg'
                )
                image.save(debug_output_filename, 'JPEG')
                print(f"Debug image saved to: {debug_output_filename}")

        return output_files


if __name__ == "__main__":
    # Script entry point. Only the optimizer instance is created here; the
    # manual example below is commented out and uses machine-specific paths.
    pdf_optimizer = PDFOptimizer()

    # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
    # # Ensure the debug directory exists
    # os.makedirs(debug_dir, exist_ok=True)
    #
    # # Split the PDF into separate pages, rendered as images
    # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
    #                                                                      debug_dir,
    #                                                                      True)
    #
    # print(f"Number of pages: {num_pages}")
    # print(f"Output files: {output_files}")
    # print(f"Temporary directory: {temp_dir}")
    #
    # # The caller owns the temporary directory, so remove it when done
    # shutil.rmtree(temp_dir, ignore_errors=True)