Spaces:
Sleeping
Sleeping
File size: 2,165 Bytes
57a8d2e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import PyPDF2
from pathlib import Path
from pdf2image import convert_from_path
def get_pg_x(infile, outfile, x=0):
    """Copy one page of a PDF into a new single-page PDF.

    Args:
        infile: Source PDF, handed to ``PyPDF2.PdfReader``.
        outfile: Destination handed to ``PdfWriter.write``.
        x: Zero-based index of the page to extract (default: first page).

    Note:
        An out-of-range ``x`` fails when indexing ``reader.pages``
        (presumably with ``IndexError``; confirm against the installed
        PyPDF2 version). Nothing is written in that case.
    """
    reader = PyPDF2.PdfReader(infile)
    page = reader.pages[x]
    # Create the writer empty, as the other helpers in this file do; the
    # destination is only needed by write() below.
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    writer.write(outfile)
def get_mult_pgs(infile, out, page_indices=None, first_case=5):
    """Write selected pages of a PDF as individual ``case_<n>.pdf`` files.

    Args:
        infile: Source PDF, handed to ``PyPDF2.PdfReader``.
        out: Output directory; created (including parents) if missing.
        page_indices: Iterable of zero-based page indices to extract.
            Defaults to ``range(99, 123, 2)``, i.e. every other page of
            that span (a deliberately naive way of splitting it up).
        first_case: Number used in the first output file name; subsequent
            files count up from it. Defaults to 5.
    """
    if page_indices is None:
        page_indices = range(99, 123, 2)

    reader = PyPDF2.PdfReader(infile)
    # Resolve every page before touching the filesystem so that a bad index
    # fails without leaving a half-written output folder behind.
    pages = [reader.pages[i] for i in page_indices]

    os.makedirs(out, exist_ok=True)
    for case_no, page in enumerate(pages, start=first_case):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(os.path.join(out, f'case_{case_no}.pdf'))
def splitall(infile, out):
    """Split a PDF into one-page files, alternating between two subfolders.

    Pages at even zero-based indices are written to
    ``<out>/cases/case<k>.pdf`` and pages at odd indices to
    ``<out>/other/other<k>.pdf``, where ``k`` is ``index // 2``.

    Args:
        infile: Source PDF, handed to ``PyPDF2.PdfReader``.
        out: Root output directory; created (including parents) if missing.
    """
    out_dir = Path(out)
    # parents=True lets nested output paths work; exist_ok avoids the
    # check-then-create race.
    out_dir.mkdir(parents=True, exist_ok=True)

    reader = PyPDF2.PdfReader(infile)
    for index, page in enumerate(reader.pages):
        if index % 2 == 1:
            folder, prefix = 'other', 'other'
        else:
            folder, prefix = 'cases', 'case'

        # Subfolders are created lazily so a folder only appears once a page
        # actually lands in it.
        dest = out_dir / folder
        dest.mkdir(exist_ok=True)

        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(dest / f'{prefix}{index // 2}.pdf')
def folder_to_img(folder):
    """Replace every PDF under ``folder`` with a JPEG of its first page.

    Walks ``folder`` recursively. For each ``*.pdf`` file (case-insensitive
    extension) the first page is rendered and saved next to it as
    ``<name>.jpg``; the PDF is then deleted. If rendering or saving fails,
    the error is printed and the PDF is left in place so no data is lost.

    Args:
        folder: Root directory to scan.
    """
    for root, _, filelist in os.walk(folder):
        for file in filelist:
            if not file.lower().endswith('.pdf'):
                continue

            pdf_path = os.path.join(root, file)
            outfile = os.path.join(root, f'{os.path.splitext(file)[0]}.jpg')
            try:
                # Only the first page is kept, so only render that page.
                im = convert_from_path(pdf_path, first_page=1, last_page=1)
                im[0].save(outfile, 'JPEG')
            except Exception as e:
                # Batch boundary: report and move on to the next file.
                print(f"Error converting {pdf_path}: {e}")
                continue

            # Reached only after the JPEG was written successfully.
            Path(pdf_path).unlink()
if __name__ == '__main__':
    # One-off preparation step, kept for reference:
    # splitall('report_batch.pdf', 'data')
    # Convert both split output folders to images, cases first.
    for target in ('data/cases', 'data/other'):
        folder_to_img(target)
|