# Source: Sparrow / sparrow_parse / helpers / pdf_optimizer.py
# Uploaded by: Zana897465
# Commit message: Upload 24 files
# Commit: 05e6f93 (verified)
# File size: 3.42 kB
import pypdf
from pdf2image import convert_from_path
import os
import tempfile
import shutil
class PDFOptimizer(object):
    """Split a PDF into one file per page, as single-page PDFs or JPEG images."""

    def __init__(self):
        pass

    def split_pdf_to_pages(self, file_path, debug_dir=None, convert_to_images=False):
        """Split the PDF at ``file_path`` into per-page files in a new temp directory.

        Args:
            file_path: Path of the PDF to split.
            debug_dir: Optional directory that receives an extra copy of every
                generated page. It is created if it does not exist yet.
            convert_to_images: When false (default), each page is written as a
                single-page PDF named ``page_<n>.pdf``. When true, the PDF is
                rendered at 300 DPI and each page is saved as
                ``<input stem>_page_<n>.jpg``.

        Returns:
            A tuple ``(number_of_pages, output_files, temp_dir)`` where
            ``output_files`` lists the generated paths in page order and
            ``temp_dir`` is the directory that contains them. The caller owns
            ``temp_dir`` and is responsible for deleting it.

        Raises:
            Whatever opening, parsing, rendering or writing raises (for example
            ``FileNotFoundError`` for a missing input). In that case the
            temporary directory is removed before the exception propagates.
        """
        temp_dir = tempfile.mkdtemp()
        try:
            if convert_to_images:
                output_files = self._render_pages_as_images(file_path, temp_dir, debug_dir)
            else:
                output_files = self._write_pages_as_pdfs(file_path, temp_dir, debug_dir)
        except BaseException:
            # The caller never receives temp_dir on failure, so nobody else
            # could clean it up; remove it here and re-raise unchanged.
            shutil.rmtree(temp_dir, ignore_errors=True)
            raise

        # Return the number of pages, the list of file paths, and the temporary directory
        return len(output_files), output_files, temp_dir

    @staticmethod
    def _write_pages_as_pdfs(file_path, temp_dir, debug_dir):
        """Write every page of the PDF as its own single-page PDF; return the paths."""
        output_files = []
        with open(file_path, 'rb') as pdf_file:
            reader = pypdf.PdfReader(pdf_file)

            if debug_dir:
                # Mirror the image mode, which also creates the debug folder.
                os.makedirs(debug_dir, exist_ok=True)

            for page_number, page in enumerate(reader.pages, start=1):
                writer = pypdf.PdfWriter()
                writer.add_page(page)

                output_filename = os.path.join(temp_dir, f'page_{page_number}.pdf')
                with open(output_filename, 'wb') as output_file:
                    writer.write(output_file)
                output_files.append(output_filename)

                if debug_dir:
                    # Save each page to the debug folder
                    debug_output_filename = os.path.join(debug_dir, f'page_{page_number}.pdf')
                    with open(debug_output_filename, 'wb') as debug_file:
                        writer.write(debug_file)
        return output_files

    @staticmethod
    def _render_pages_as_images(file_path, temp_dir, debug_dir):
        """Render every page of the PDF to a JPEG file; return the paths."""
        images = convert_from_path(file_path, dpi=300)
        base_name = os.path.splitext(os.path.basename(file_path))[0]

        if debug_dir:
            # Created once up front instead of once per page.
            os.makedirs(debug_dir, exist_ok=True)

        output_files = []
        for page_number, image in enumerate(images, start=1):
            output_filename = os.path.join(temp_dir, f'{base_name}_page_{page_number}.jpg')
            image.save(output_filename, 'JPEG')
            output_files.append(output_filename)

            if debug_dir:
                # Save each image to the debug folder
                debug_output_filename = os.path.join(debug_dir, f'{base_name}_page_{page_number}_debug.jpg')
                image.save(debug_output_filename, 'JPEG')
                print(f"Debug image saved to: {debug_output_filename}")
        return output_files
if __name__ == "__main__":
    # Script entry point: only builds an optimizer instance. The manual
    # example below is intentionally left commented out.
    optimizer = PDFOptimizer()

    # Example: render a PDF to per-page images, keeping debug copies.
    #
    # debug_dir = "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/"
    # os.makedirs(debug_dir, exist_ok=True)
    #
    # num_pages, output_files, temp_dir = optimizer.split_pdf_to_pages(
    #     "/Users/andrejb/infra/shared/katana-git/sparrow/sparrow-ml/llm/data/oracle_10k_2014_q1_small.pdf",
    #     debug_dir,
    #     True)
    #
    # print(f"Number of pages: {num_pages}")
    # print(f"Output files: {output_files}")
    # print(f"Temporary directory: {temp_dir}")
    #
    # # The caller owns the temporary directory and removes it when done.
    # shutil.rmtree(temp_dir, ignore_errors=True)