Sina-Haz committed on
Commit
57a8d2e
1 Parent(s): 5cd4d16

uploading source code

Files changed (8)
  1. .gitignore +3 -0
  2. app.py +17 -0
  3. fields.txt +1 -0
  4. fns.py +227 -0
  5. pr_classifier.pkl +3 -0
  6. quantum-spring-421822-5b13d9d18bde.json +13 -0
  7. requirements.txt +9 -0
  8. split.py +74 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ .ipynb_checkpoints
+ out.xlsx
app.py ADDED
@@ -0,0 +1,17 @@
+ import gradio as gr
+ from fns import *
+
+
+
+ # Define the Gradio interface
+ iface = gr.Interface(
+     fn=process_file,
+     inputs=gr.File(type='binary', label='Upload Report'),
+     outputs=gr.File(type='filepath', label='Processed Excel File'),
+     title='Report Processor',
+     description='Upload a report to process and download the resulting Excel file'
+ )
+
+ # Launch the interface
+ iface.launch()
+
fields.txt ADDED
@@ -0,0 +1 @@
+ 'box_26_name1', 'box_27_street_address1', 'box_28_city1', 'box_56_name1', 'box_57_street_address2', 'box_58_city2', 'box_118a', 'box_118b', 'box_119a', 'box_119b', 'Police_Department_City'
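Note: fields.txt stores the column names as a single comma-separated line of quoted names. A minimal parsing sketch (the same cleanup that script() in fns.py performs; not part of the commit):

    with open('fields.txt', 'r') as f:
        fields = f.read().strip().replace("'", "").split(',')
    fields = [name.replace(' ', '') for name in fields]
    # -> ['box_26_name1', 'box_27_street_address1', ..., 'Police_Department_City']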
fns.py ADDED
@@ -0,0 +1,227 @@
+ #/export
+ import os
+ import PyPDF2
+ from pathlib import Path
+ from pdf2image import convert_from_path
+ import numpy as np
+ from split import *
+ import fastai
+ from fastai.learner import load_learner
+ from fastai.vision.core import PILImage
+ import pandas as pd
+ from collections import OrderedDict
+ import re
+ from google.api_core.exceptions import InternalServerError
+ import shutil
+ from typing import Optional
+ from google.api_core.client_options import ClientOptions
+ from google.cloud import documentai  # type: ignore
+
+
+
+ # Make a mini report batch for testing
+ def make_mini_batch(infile, outfile, bs=15):
+     reader = PyPDF2.PdfReader(infile)
+     rand_pgs = list(np.random.choice(len(reader.pages), bs, replace=False))
+     writer = PyPDF2.PdfWriter()
+     for pg in rand_pgs:
+         page = reader.pages[int(pg)]
+         writer.add_page(page)
+     writer.write(outfile)
+
+
+ # Now define a function that outputs a folder of individual .jpgs for a batch report
+ def report_to_jpegs(filename, outfolder):
+     reader = PyPDF2.PdfReader(filename)
+     path = Path(outfolder)
+     if not path.exists():
+         path.mkdir()
+     for i, page in enumerate(reader.pages):
+         writer = PyPDF2.PdfWriter()
+         dest = (path/f'file{i}.pdf')
+         writer.add_page(page)
+         writer.write(dest)
+     folder_to_img(outfolder)
+
+
+ def define_others(folder, classifier):
+     other_files = []  # A list of files to unlink
+     for root, _, filelist in os.walk(folder):
+         if '.ipynb_checkpoints' in root:
+             continue
+         for file in filelist:
+             path = os.path.join(root, file)
+             img = PILImage.create(path)
+             _, idx, _ = classifier.predict(img)
+             if idx.item() == 1:
+                 other_files.append(path)
+     return other_files
+
+
+
+ # Importing boilerplate Document AI code to process a file
+
+ # [START documentai_process_document]
+ # [START documentai_process_document_processor_version]
+
+
+ # TODO(developer): Uncomment these variables before running the sample.
+ # project_id = "YOUR_PROJECT_ID"
+ # location = "YOUR_PROCESSOR_LOCATION"  # Format is "us" or "eu"
+ # processor_id = "YOUR_PROCESSOR_ID"  # Create processor before running sample
+ # file_path = "/path/to/local/pdf"
+ # mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
+ # field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
+ # processor_version_id = "YOUR_PROCESSOR_VERSION_ID"  # Optional. Processor version to use
+
+
+ def process_document_sample(
+     project_id: str,
+     location: str,
+     processor_id: str,
+     file_path: str,
+     mime_type: str,
+     field_mask: Optional[str] = None,
+     processor_version_id: Optional[str] = None,
+ ) -> documentai.Document:
+     # You must set the `api_endpoint` if you use a location other than "us".
+     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
+
+     client = documentai.DocumentProcessorServiceClient(client_options=opts)
+
+     if processor_version_id:
+         # The full resource name of the processor version, e.g.:
+         # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
+         name = client.processor_version_path(
+             project_id, location, processor_id, processor_version_id
+         )
+     else:
+         # The full resource name of the processor, e.g.:
+         # `projects/{project_id}/locations/{location}/processors/{processor_id}`
+         name = client.processor_path(project_id, location, processor_id)
+
+     # Read the file into memory
+     with open(file_path, "rb") as image:
+         image_content = image.read()
+
+     # Load binary data
+     raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
+
+     # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
+     # Optional: Additional configurations for processing.
+     process_options = documentai.ProcessOptions(
+         # Process only specific pages
+         individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
+             pages=[1]
+         )
+     )
+
+     # Configure the process request
+     request = documentai.ProcessRequest(
+         name=name,
+         raw_document=raw_document,
+         field_mask=field_mask,
+         process_options=process_options,
+     )
+
+     result = client.process_document(request=request)
+
+     # For a full list of `Document` object attributes, reference this page:
+     # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
+     document = result.document
+
+     # Read the text recognition output from the processor
+     # print("The document contains the following text:")
+     # print(document.text)
+     return document
+
+
+ # [END documentai_process_document_processor_version]
+ # [END documentai_process_document]
+
+
+ # Function that takes in a list of filenames, runs each through Google OCR and returns a pandas DataFrame of the data
+ def extract_fields(files, fields=[]):
+     # Initialize an empty DataFrame with the specified fields as columns
+     df = pd.DataFrame(columns=fields)
+
+     for file in files:
+         try:
+             doc = process_document_sample(
+                 project_id="573919539759",
+                 location="us",
+                 processor_id="7b2493d94a089d26",
+                 file_path=file,
+                 mime_type="image/jpeg"
+             )
+             # Initialize a dictionary to hold the entity mentions for the current document
+             row_data = {f: None for f in fields}
+
+             for entity in doc.entities:
+                 if entity.type in row_data:
+                     row_data[entity.type] = entity.mention_text
+
+             # Convert the row data to a DataFrame and concatenate it
+             df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)
+
+         except InternalServerError as e:
+             page_num = re.search(r'\d+', file).group()
+             print(f'There was an internal error processing page {page_num}')
+
+     return df
+
+ def dataframe_from_reports(folder, columns):
+     files = []
+     for root, _, filelist in os.walk(folder):
+         if '.ipynb_checkpoints' in root:
+             continue
+         for file in filelist:
+             path = os.path.join(root, file)
+             files.append(path)
+     return extract_fields(files, columns)
+
+
+
+ # Script
+ def script(report, jpeg_foldername='images'):
+     # First transform report to a folder of individual images
+     report_to_jpegs(report, jpeg_foldername)
+
+     # Load in our classifier and use it to define and delete irrelevant files
+     classifier = load_learner('pr_classifier.pkl')
+     others = define_others(jpeg_foldername, classifier)
+     for o in others:
+         Path(o).unlink()
+
+     # Set credentials for using documentai
+     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'quantum-spring-421822-5b13d9d18bde.json'
+
+     # Reading in file to get fields variable
+     with open('fields.txt', 'r') as file:
+         fields = file.read().strip().replace("'", "").split(',')
+
+     fields = [f.replace(' ', '') for f in fields]
+
+     df = dataframe_from_reports(jpeg_foldername, fields)
+     excel_file = 'out.xlsx'
+     df.to_excel(excel_file, index=False)
+     shutil.rmtree(jpeg_foldername)
+
+     return excel_file
+
+
+ def process_file(file):
+     # Save the uploaded file to a temporary location
+     temp_file_path = 'temp_report.pdf'
+     with open(temp_file_path, 'wb') as temp_file:
+         temp_file.write(file)
+
+     # Run the script and get the path to the Excel file
+     excel_file_path = script(temp_file_path)
+
+     # Clean up the temporary file
+     os.remove(temp_file_path)
+
+     return excel_file_path
+
+
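For reference, a minimal end-to-end usage sketch of fns.py outside the Gradio interface (assumes a local 'report.pdf', the bundled pr_classifier.pkl, and valid Document AI credentials; not part of the commit):

    from fns import process_file

    with open('report.pdf', 'rb') as f:
        excel_path = process_file(f.read())  # splits pages, filters with the classifier, runs OCR, writes out.xlsx
    print(excel_path)  # 'out.xlsx'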
pr_classifier.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcaa83e9029fac519d6704ce9fd986417ca81a7f30e40cc9bc289eef932604b2
+ size 46964734
quantum-spring-421822-5b13d9d18bde.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "type": "service_account",
+ "project_id": "quantum-spring-421822",
+ "private_key_id": "5b13d9d18bde80758227fc7479a602cdeb2dfd48",
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQClpcyrADEgAoyj\nbqROddNs7tKbOiDR9hTrHCMJLV9xIVKPOlwU51dK4OiM87V/LD99Ia9zzUUaGKEv\nh3YKfmyz3RcXUI5xAMlr/tilkLQXusqNK8SR8LTxAO3kf9q3aKnjaxVFeilWqcZe\nrXqjzqAU2pWXOvOSdW/pYGVA0vhY31QiiwCIenfYqicZk/WfR6A0Cp/brEn725g/\nnzyD11hNuaKci2vnUEz3sMHKFcBwPd2XV7FZTY/bSSFFoHcjbRpuJdF7d6r7hzz0\naHH8MxoMuv6fxZtteNezSQMCy7mKFccrcnhLvSCA9UvsLuJbOIqUz9G9Llx+NmSi\nXyxH1YsJAgMBAAECggEACzSKQgYHDLCHdXXCURlo1i1Ym04FEtbn5tiWKSSD2IoF\nxPEQcgyNK+BHcuN4g1QJrmwFK5r24fNJEgdCjaasMUWTLo1ROWUxeuf4HUkK2bEV\nyxNrHmkvQD9NGGAqui3vEhZ4D6VaHwXoiJaJHYQYcZiwtlRzThS6H4oZB1fOG2rS\nLB9f1Pl2j0r6JZmq5qSjH0nlcp/5nVWIypMgqGXyESv+LBO+rMrXbw0cOpGsufR6\nWkO8/6nLo0LTnFpXiFvt2A+ScjO34R/PTw2YN5g3R+1nsKWwYi/pGo6kYoqQjNum\n2MzQLsULyDYtuU6LABZ9rUgA+/XNddyhHCvIJAPL1QKBgQDVcM/RwvfGLcZL6qG3\naZuJSDjRKSozY2yKrCZfmZ2UfBatcVRK0we45FlWGHk4KwNHVSzGPXyM/h46xaGq\nuhpXoTj3znJgUR+WQVWRCa5O2Ff83cgC3X58ZwmAuNlWn5+xHsize8wt8blyWxcD\nYuDbnyv3tIAXtVVv5CiRD9LibQKBgQDGrV5jbf6khALElgSmOFrta8DnOTQwtRiH\nVylkTDMCvX6O5cf/e9H0ynqHNq1OAyCc09NGQ/nR4MsM4+0xVr7zIk6DCre0uWhk\nK5Ri2Wc/7cSL7Ng/Tz4lvaVcXk4UtEFpE2VMUlTm4UrWqniti/+FDKuH7AW+QdX4\nuXwDRf0JjQKBgQCO2lVNaDdbXZGlh+E6l6Uw6bSVUP7crAOegH+iZncQRUkMRpiL\nB9jYR4T/WZvwGuGz4W75M3fkCcSDw3hjgvFUiyRMvtDQZBtG/m1diGQwgx3IsUtH\nl6Urb+7tR0q3blh8PEVJ02ahrXJX9l1dt9UVSOAthpJgy83l+IeqJzQWtQKBgAhP\n/KEAWyUa+NqsxCkINpW0am/CLvLSwOH4vZyaq4+zyfXEm2ePEKvDRPkfSoT3EIg/\nPJm2Al/LnfM+HhBwZrDZgJtu+VywXTPDtwOjwPQlyJFqKWoMsLb3hlGlCHNJT2tM\nCnL1BuRaDL2KiF4Ke3hfnVWkhGD/dCph6FRiu0GJAoGANEa186PmEaEjtfDFck9j\nrBiADAXPSnu/SpHzPvO3EJ93yTpNjrWMzxOtzoobcK+5NMPtiR4EmR1p6j+PTMuP\nsTPu6+uQ1MPnfWA+m+Kkzqmk/Fa0HmiDsvXIINt/1weWSKHFBp27u5XAiY41xGBg\n/yO8vfCBKvsFWK5ciAxJAGM=\n-----END PRIVATE KEY-----\n",
+ "client_email": "sinas-acct@quantum-spring-421822.iam.gserviceaccount.com",
+ "client_id": "117896840412672049182",
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+ "token_uri": "https://oauth2.googleapis.com/token",
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/sinas-acct%40quantum-spring-421822.iam.gserviceaccount.com",
+ "universe_domain": "googleapis.com"
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ PyPDF2
+ pdf2image
+ numpy
+ fastai
+ pandas
+ typer>=0.12.3
+ google-api-core
+ google-cloud-documentai
+ openpyxl
split.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import PyPDF2
+ from pathlib import Path
+ from pdf2image import convert_from_path
+
+ def get_pg_x(infile, outfile, x=0):
+     reader = PyPDF2.PdfReader(infile)
+     first_pg = reader.pages[x]
+     writer = PyPDF2.PdfWriter()
+     writer.add_page(first_pg)
+     writer.write(outfile)
+
+
+
+ def get_mult_pgs(infile, out):
+     reader = PyPDF2.PdfReader(infile)
+     pg_ind = list(range(99, 123, 2))  # Basically get every other page (a very naive way of splitting it up)
+     pages = [reader.pages[i] for i in pg_ind]
+
+     if not os.path.exists(out):
+         os.makedirs(out)
+     for i, p in enumerate(pages):
+         writer = PyPDF2.PdfWriter()
+         filepath = os.path.join(out, f'case_{i+5}.pdf')
+         writer.add_page(p)
+         writer.write(filepath)
+
+
+ def splitall(infile, out):
+     path = Path(out)
+     if not path.exists():
+         path.mkdir()
+     reader = PyPDF2.PdfReader(infile)
+
+     for i in range(len(reader.pages)):
+         writer = PyPDF2.PdfWriter()
+         if i % 2 == 1:
+             dest = (path/'other')
+             dest.mkdir(exist_ok=True)
+             fpath = dest.joinpath(f'other{i//2}.pdf')
+             writer.add_page(reader.pages[i])
+             writer.write(fpath)
+         else:
+             dest = (path/'cases')
+             dest.mkdir(exist_ok=True)
+             fpath = dest.joinpath(f'case{i//2}.pdf')
+             writer.add_page(reader.pages[i])
+             writer.write(fpath)
+
+ def folder_to_img(folder):
+     for root, _, filelist in os.walk(folder):
+         for file in filelist:
+             if file.lower().endswith('.pdf'):
+                 pdf_path = os.path.join(root, file)
+                 outfolder = root
+                 try:
+                     im = convert_from_path(pdf_path)
+                     outfile = os.path.join(outfolder, f'{os.path.splitext(os.path.basename(pdf_path))[0]}.jpg')
+                     im[0].save(outfile, 'JPEG')
+                 except Exception as e:
+                     print(f"Error converting {pdf_path}: {e}")
+                 Path(pdf_path).unlink()
+
+
+
+ if __name__ == '__main__':
+     # splitall('report_batch.pdf', 'data')
+     folder_to_img('data/cases')
+     folder_to_img('data/other')
+
+
+
+
+
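For reference, a minimal usage sketch of split.py (assumes a local 'report_batch.pdf' in which case pages sit at even indices and supplementary pages at odd indices, as splitall() expects; not part of the commit):

    from split import splitall, folder_to_img

    splitall('report_batch.pdf', 'data')  # even pages -> data/cases, odd pages -> data/other
    folder_to_img('data/cases')           # convert each single-page PDF to a JPEG, then delete the PDF
    folder_to_img('data/other')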