Spaces:
Sleeping
Sleeping
import os | |
import PyPDF2 | |
from pathlib import Path | |
from pdf2image import convert_from_path | |
def get_pg_x(infile, outfile, x=0):
    """Copy a single page of a PDF into a new one-page PDF.

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        outfile: Destination passed to ``PdfWriter.write``.
        x: Zero-based index of the page to extract (defaults to the first
            page).

    An out-of-range ``x`` is not handled here; whatever the page lookup
    raises propagates to the caller.
    """
    reader = PyPDF2.PdfReader(infile)
    page = reader.pages[x]
    # Build the writer without arguments, as the other helpers in this file
    # do; the destination is only needed by write().
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    writer.write(outfile)
def get_mult_pgs(infile, out, pg_ind=None, first_case=5):
    """Write selected pages of a PDF as individual ``case_<n>.pdf`` files.

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        out: Output directory; created if it does not exist.
        pg_ind: Iterable of zero-based page indices to extract. When omitted,
            every other page from index 99 up to 121 is used (a very naive
            way of splitting the document up).
        first_case: Number used in the file name of the first extracted
            page; subsequent pages count up from it.

    Page lookups happen before anything is written, so an index that the
    reader rejects aborts the call without creating partial output.
    """
    if pg_ind is None:
        pg_ind = range(99, 123, 2)

    reader = PyPDF2.PdfReader(infile)
    pages = [reader.pages[i] for i in pg_ind]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out, exist_ok=True)

    for case_no, page in enumerate(pages, start=first_case):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(os.path.join(out, f'case_{case_no}.pdf'))
def splitall(infile, out):
    """Split a PDF into one-page files, separating even and odd page indices.

    Pages at even zero-based indices are written to ``<out>/cases/case<k>.pdf``
    and pages at odd indices to ``<out>/other/other<k>.pdf``, where ``k`` is
    the page index divided by two (integer division).

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        out: Root output directory; created (including missing parents) if
            it does not exist.
    """
    path = Path(out)
    # parents=True lets callers pass a nested output location.
    path.mkdir(parents=True, exist_ok=True)

    # (sub-folder, file-name prefix), indexed by page-index parity.
    targets = (('cases', 'case'), ('other', 'other'))

    reader = PyPDF2.PdfReader(infile)
    for i, page in enumerate(reader.pages):
        folder_name, prefix = targets[i % 2]
        dest = path / folder_name
        # Created on demand so a sub-folder only appears when it gets a page.
        dest.mkdir(exist_ok=True)
        fpath = dest.joinpath(f'{prefix}{i//2}.pdf')

        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(fpath)
def folder_to_img(folder):
    """Replace every PDF under ``folder`` with a JPEG of its first page.

    The directory tree is walked recursively. For each ``*.pdf`` file (case
    insensitive) the first page is rendered and saved next to the PDF under
    the same base name with a ``.jpg`` extension. The PDF is deleted only
    after the JPEG was written successfully; if conversion or saving fails,
    the error is printed and the PDF is left in place so no data is lost.

    Args:
        folder: Root directory to scan.
    """
    for root, _, filelist in os.walk(folder):
        for file in filelist:
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)
            outfile = os.path.join(root, f'{os.path.splitext(file)[0]}.jpg')
            try:
                # Only the first page is used, so only that page is rendered.
                im = convert_from_path(pdf_path, first_page=1, last_page=1)
                im[0].save(outfile, 'JPEG')
            except Exception as e:
                # Best effort: report the failure and carry on with the rest.
                print(f"Error converting {pdf_path}: {e}")
            else:
                # Remove the source only once its image exists on disk.
                Path(pdf_path).unlink()
if __name__ == '__main__':
    # The splitting step, splitall('report_batch.pdf', 'data'), is left
    # disabled here; presumably the folders below were produced by an
    # earlier run of it.
    for folder in ('data/cases', 'data/other'):
        folder_to_img(folder)