import pypdf from pdf2image import convert_from_path import os import tempfile import shutil class PDFOptimizer(object): def __init__(self): pass def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False): # Create a temporary directory temp_dir = tempfile.mkdtemp() output_files = [] if not convert_to_images: # Open the PDF file with open(file_path, 'rb') as pdf_file: reader = pypdf.PdfReader(pdf_file) number_of_pages = len(reader.pages) # Split the PDF into separate files per page for page_num in range(number_of_pages): writer = pypdf.PdfWriter() writer.add_page(reader.pages[page_num]) output_filename = os.path.join(temp_dir, f'page_{page_num + 1}.pdf') with open(output_filename, 'wb') as output_file: writer.write(output_file) output_files.append(output_filename) if debug_dir: # Save each page to the debug folder debug_output_filename = os.path.join(debug_dir, f'page_{page_num + 1}.pdf') with open(debug_output_filename, 'wb') as output_file: writer.write(output_file) # Return the number of pages, the list of file paths, and the temporary directory return number_of_pages, output_files, temp_dir else: # Convert the PDF to images images = convert_from_path(file_path, dpi=300) base_name = os.path.splitext(os.path.basename(file_path))[0] # Save the images to the temporary directory for i, image in enumerate(images): output_filename = os.path.join(temp_dir, f'{base_name}_page_{i + 1}.jpg') image.save(output_filename, 'JPEG') output_files.append(output_filename) if debug_dir: # Save each image to the debug folder os.makedirs(debug_dir, exist_ok=True) debug_output_filename = os.path.join(debug_dir, f'{base_name}_page_{i + 1}_debug.jpg') image.save(debug_output_filename, 'JPEG') print(f"Debug image saved to: {debug_output_filename}") # Return the number of pages, the list of file paths, and the temporary directory return len(images), output_files, temp_dir if __name__ == "__main__": pdf_optimizer = PDFOptimizer() # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/" # # Ensure the output directory exists # os.makedirs(output_directory, exist_ok=True) # # # Split the optimized PDF into separate pages # num_pages, output_files, temp_dir = pdf_optimizer.split_pdf_to_pages("/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf", # debug_dir, # True) # # print(f"Number of pages: {num_pages}") # print(f"Output files: {output_files}") # print(f"Temporary directory: {temp_dir}") # # shutil.rmtree(temp_dir, ignore_errors=True)