import os
from pathlib import Path

import PyPDF2
from pdf2image import convert_from_path


def get_pg_x(infile, outfile, x=0):
    """Write a single page of ``infile`` to ``outfile`` as a one-page PDF.

    Args:
        infile: Path (or file object) of the PDF to read.
        outfile: Destination the extracted page is written to.
        x: Index into ``reader.pages`` of the page to extract (default 0,
            i.e. the first page).

    Raises:
        IndexError: If ``x`` is outside the range of pages in ``infile``.
    """
    reader = PyPDF2.PdfReader(infile)
    page = reader.pages[x]
    # The writer is created without arguments, like the other helpers in this
    # file; the destination is only needed by write().
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    writer.write(outfile)


def get_mult_pgs(infile, out, pg_ind=None, first_case=5):
    """Write a selection of pages from ``infile`` as separate one-page PDFs.

    Each selected page is saved to ``out`` as ``case_<n>.pdf`` where ``n``
    starts at ``first_case`` and increases by one per page.

    Args:
        infile: Path (or file object) of the PDF to read.
        out: Directory the one-page PDFs are written to; created if missing.
        pg_ind: Iterable of page indices to extract. Defaults to
            ``range(99, 123, 2)``: every other page in that span (a very
            naive way of splitting the document up).
        first_case: Number used in the file name of the first extracted page.

    Raises:
        IndexError: If any index in ``pg_ind`` is not a page of ``infile``.
    """
    if pg_ind is None:
        pg_ind = range(99, 123, 2)

    reader = PyPDF2.PdfReader(infile)
    # Resolve all pages up front so a bad index fails before anything is
    # written to disk.
    pages = [reader.pages[i] for i in pg_ind]

    os.makedirs(out, exist_ok=True)
    for case_no, page in enumerate(pages, start=first_case):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(os.path.join(out, f'case_{case_no}.pdf'))


def splitall(infile, out):
    """Split every page of ``infile`` into its own PDF under ``out``.

    Pages at even indices (0, 2, 4, ...) are written to
    ``out/cases/case<k>.pdf`` and pages at odd indices (1, 3, 5, ...) to
    ``out/other/other<k>.pdf``, where ``k`` is the page index divided by two
    (integer division), so page 0 and page 1 both get ``k == 0``.

    Args:
        infile: Path (or file object) of the PDF to split.
        out: Root output directory; created (including parents) if missing.
    """
    root = Path(out)
    # parents/exist_ok make this safe for nested or already-existing targets.
    root.mkdir(parents=True, exist_ok=True)

    reader = PyPDF2.PdfReader(infile)
    for i, page in enumerate(reader.pages):
        # Odd-indexed pages go to 'other', even-indexed pages to 'cases'.
        if i % 2 == 1:
            subdir, stem = 'other', 'other'
        else:
            subdir, stem = 'cases', 'case'

        # Sub-folders are created lazily, only once a page needs them.
        dest = root / subdir
        dest.mkdir(exist_ok=True)

        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(dest / f'{stem}{i // 2}.pdf')


def folder_to_img(folder):
    """Convert the first page of every PDF under ``folder`` to a JPEG.

    The folder is walked recursively. For each file whose name ends in
    ``.pdf`` (case-insensitive) the first page is rendered and saved next to
    the PDF with the same base name and a ``.jpg`` extension. The source PDF
    is deleted only after the JPEG has been saved successfully; if conversion
    fails, the error is printed and the PDF is left in place.

    Args:
        folder: Root directory to search for PDF files.
    """
    for root, _, filelist in os.walk(folder):
        for file in filelist:
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)
            outfile = os.path.join(root, f'{os.path.splitext(file)[0]}.jpg')
            try:
                # Only the first page is used, so only the first page is
                # rendered.
                images = convert_from_path(pdf_path, first_page=1, last_page=1)
                images[0].save(outfile, 'JPEG')
            except Exception as e:
                # Best-effort batch job: report the failure and move on.
                print(f"Error converting {pdf_path}: {e}")
            else:
                # Remove the source only when the image was actually written,
                # so a failed conversion never loses the original PDF.
                Path(pdf_path).unlink()


if __name__ == '__main__':
    # Run once beforehand to produce the per-page PDFs:
    # splitall('report_batch.pdf', 'data')
    folder_to_img('data/cases')
    folder_to_img('data/other')