# Solar-Eyes-Dockerized / render_pdfs.py
# 3martini's picture
# Upload folder using huggingface_hub
# 786d4da verified
import pypdfium2 as pdfium
import os
# Directory containing PDFs
pdf_dir = './pdf_downloads'
# Directory to store images
image_dir = './pdf_images'
os.makedirs(image_dir, exist_ok=True)

# Render the first page of every PDF in pdf_dir to a JPEG in image_dir.
# A file that cannot be opened, rendered, or saved is reported and skipped
# so that one bad PDF does not abort the rest of the batch.
for pdf_file in os.listdir(pdf_dir):
    # Guard clause: ignore anything that is not a PDF.
    if not pdf_file.endswith('.pdf'):
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    # Swap only the trailing extension; str.replace would also rewrite a
    # '.pdf' that happens to occur in the middle of the file name.
    image_name = os.path.splitext(pdf_file)[0] + '.jpg'
    image_path = os.path.join(image_dir, image_name)

    # Open the PDF
    print(f"Attempting to convert {pdf_path}")
    try:
        pdf = pdfium.PdfDocument(pdf_path)
    except Exception as e:
        print(f"Unable to convert {pdf_path} due to {e}")
        # Without this, `pdf` would be undefined (first failure) or still
        # refer to the previous document, which would then be rendered and
        # saved under this file's name.
        continue

    try:
        # Get just the first page (raises for a document with no pages)
        page = pdf[0]
        image = page.render(scale=4).to_pil()
        # Save the rendered page as a JPEG image
        image.save(image_path, 'JPEG')
    except Exception as e:
        print(f"Unable to convert {pdf_path} due to {e}")
        continue
    finally:
        # Release the native document handle now rather than waiting for
        # garbage collection; the image has already been written (or the
        # attempt has failed) by the time this runs.
        pdf.close()

    print(f"Converted {pdf_path} to {image_path}")