File size: 2,165 Bytes
57a8d2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import PyPDF2
from pathlib import Path
from pdf2image import convert_from_path

def get_pg_x(infile, outfile, x=0):
    """Copy one page of a PDF into a new single-page PDF.

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        outfile: Destination passed to ``PdfWriter.write``.
        x: Zero-based index of the page to extract (defaults to the first
            page).

    Note:
        An out-of-range ``x`` fails at the ``reader.pages[x]`` lookup, before
        anything is written.
    """
    reader = PyPDF2.PdfReader(infile)
    page = reader.pages[x]
    # Start from an empty writer, like the other helpers in this file; the
    # destination is only needed by write() below.
    writer = PyPDF2.PdfWriter()
    writer.add_page(page)
    writer.write(outfile)



def get_mult_pgs(infile, out, start=99, stop=123, step=2, first_case=5):
    """Write a selection of pages from a PDF as one-page ``case_<n>.pdf`` files.

    The pages at indices ``range(start, stop, step)`` are extracted and
    written into ``out`` as ``case_<first_case>.pdf``,
    ``case_<first_case + 1>.pdf``, and so on. With the defaults this takes
    every other page from index 99 up to (not including) 123 and numbers the
    output from 5 -- a deliberately naive way of splitting the document.

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        out: Output directory; created (with parents) if missing.
        start: Zero-based index of the first page to extract.
        stop: Index at which to stop (exclusive).
        step: Stride between extracted pages.
        first_case: Number used in the first output file name.
    """
    reader = PyPDF2.PdfReader(infile)
    # Look up every page first so a bad index fails before any output is
    # created.
    pages = [reader.pages[i] for i in range(start, stop, step)]

    os.makedirs(out, exist_ok=True)
    for case_no, page in enumerate(pages, start=first_case):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(os.path.join(out, f'case_{case_no}.pdf'))


def splitall(infile, out):
    """Split a PDF into one-page files, alternating between two folders.

    Pages at even zero-based indices are written to
    ``<out>/cases/case<k>.pdf`` and pages at odd indices to
    ``<out>/other/other<k>.pdf``, where ``k`` is the page index floor-divided
    by two.

    Args:
        infile: Source PDF, passed straight to ``PyPDF2.PdfReader``.
        out: Output root directory; created together with any missing
            parents.
    """
    path = Path(out)
    # parents=True lets a nested output path work; exist_ok=True avoids the
    # check-then-create race.
    path.mkdir(parents=True, exist_ok=True)
    reader = PyPDF2.PdfReader(infile)

    for i, page in enumerate(reader.pages):
        # Odd indices go to "other", even indices to "cases".
        folder, stem = ('other', 'other') if i % 2 == 1 else ('cases', 'case')
        # Created on demand so a folder only appears if a page lands in it.
        dest = path / folder
        dest.mkdir(exist_ok=True)

        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(dest / f'{stem}{i//2}.pdf')

def folder_to_img(folder):
    """Replace each PDF under ``folder`` with a JPEG of its first page.

    Walks ``folder`` recursively. For every file whose name ends in ``.pdf``
    (case-insensitive), the first page is rendered and saved next to the PDF
    under the same base name with a ``.jpg`` extension. The PDF is deleted
    only after the image has been saved; if conversion or saving fails, the
    error is printed and the PDF is kept.

    Args:
        folder: Root directory to scan.
    """
    for root, _, filelist in os.walk(folder):
        for file in filelist:
            if not file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file)
            outfile = os.path.join(root, f'{os.path.splitext(file)[0]}.jpg')
            try:
                # Only the first page is saved, so only render that page.
                im = convert_from_path(pdf_path, first_page=1, last_page=1)
                im[0].save(outfile, 'JPEG')
            except Exception as e:
                # Best effort per file: report the failure and keep going.
                print(f"Error converting {pdf_path}: {e}")
            else:
                # Remove the source only once the image exists, so a failed
                # conversion never loses the original PDF.
                Path(pdf_path).unlink()



if __name__ == '__main__':
    # Split step, currently disabled:
    # splitall('report_batch.pdf', 'data')

    # Convert the PDFs in both output folders to JPEGs.
    for subfolder in ('data/cases', 'data/other'):
        folder_to_img(subfolder)