Spaces:
Sleeping
Sleeping
File size: 13,457 Bytes
57a8d2e 448f661 57a8d2e 448f661 57a8d2e 3170f68 57a8d2e 926b9f9 57a8d2e 5eb3e32 57a8d2e 926b9f9 57a8d2e 3170f68 448f661 57a8d2e 448f661 57a8d2e 448f661 3170f68 448f661 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 |
#/export
import os
import openpyxl as opxl
import PyPDF2
from pathlib import Path
from pdf2image import convert_from_path
import numpy as np
from split import *
import fastai
from fastai.learner import load_learner
from fastai.vision.core import PILImage
import pandas as pd
from collections import OrderedDict
import re
from google.api_core.exceptions import InternalServerError
import shutil
from typing import Optional
from google.api_core.client_options import ClientOptions
from google.cloud import documentai # type: ignore
# Make a mini report batch for testing
def make_mini_batch(infile, outfile, bs=15):
    """Write a PDF of up to `bs` randomly sampled pages of `infile` to `outfile`.

    `bs` is clamped to the page count so sampling without replacement cannot
    raise ValueError on short documents.
    """
    reader = PyPDF2.PdfReader(infile)
    n_pages = len(reader.pages)
    sample_size = min(bs, n_pages)  # np.random.choice(replace=False) needs bs <= n_pages
    writer = PyPDF2.PdfWriter()
    for pg in np.random.choice(n_pages, sample_size, replace=False):
        # numpy integers must be coerced to int for PyPDF2 page indexing
        writer.add_page(reader.pages[int(pg)])
    writer.write(outfile)
# Now define a function that outputs a folder of individual .jpgs for a batch report
def report_to_jpegs(filename, outfolder):
    """Split a multi-page PDF into one-page PDFs inside `outfolder`, then
    convert them to images via folder_to_img (from split).

    Page i is written as file{i}.pdf.
    """
    reader = PyPDF2.PdfReader(filename)
    path = Path(outfolder)
    # parents/exist_ok: safe for nested paths and re-runs (the bare mkdir()
    # raised if parents were missing or the folder already existed)
    path.mkdir(parents=True, exist_ok=True)
    for i, page in enumerate(reader.pages):
        writer = PyPDF2.PdfWriter()
        writer.add_page(page)
        writer.write(path/f'file{i}.pdf')
    folder_to_img(outfolder)
def define_others(folder, classifier):
    """Walk `folder` and return the paths the classifier assigns class index 1.

    Class index 1 is treated by callers as 'other'/irrelevant; the returned
    paths are candidates for deletion.
    """
    flagged = []
    for root, _, filenames in os.walk(folder):
        # skip notebook checkpoint copies — they would duplicate real pages
        if '.ipynb_checkpoints' in root:
            continue
        for name in filenames:
            full_path = os.path.join(root, name)
            _, pred_idx, _ = classifier.predict(PILImage.create(full_path))
            if pred_idx.item() == 1:
                flagged.append(full_path)
    return flagged
#Importing Boilerplate Documentai code to process a file
# [START documentai_process_document]
# [START documentai_process_document_processor_version]
# TODO(developer): Uncomment these variables before running the sample.
# project_id = "YOUR_PROJECT_ID"
# location = "YOUR_PROCESSOR_LOCATION" # Format is "us" or "eu"
# processor_id = "YOUR_PROCESSOR_ID" # Create processor before running sample
# file_path = "/path/to/local/pdf"
# mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
# field_mask = "text,entities,pages.pageNumber" # Optional. The fields to return in the Document object.
# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Optional. Processor version to use
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> "documentai.Document":
    """Process one local file through a Document AI online processor.

    Returns the parsed ``documentai.Document`` (the original annotation said
    ``-> None``, but the function has always returned the document).
    NOTE: process_options below restricts OCR to page 1 of the input.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)
    if processor_version_id:
        # Pin a specific processor version, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # Otherwise the service uses the processor's default version:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}`
        name = client.processor_path(project_id, location, processor_id)
    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()
    # Load binary data
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
    # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    # Optional: process only the first page — callers feed one-page files
    process_options = documentai.ProcessOptions(
        individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
            pages=[1]
        )
    )
    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=raw_document,
        field_mask=field_mask,
        process_options=process_options,
    )
    result = client.process_document(request=request)
    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document
    return document
# [END documentai_process_document_processor_version]
# [END documentai_process_document]
# Function that takes in a list of filenames, runs each through google ocr and returns a pandas dataframe of the data
def extract_fields(files, fields=None):
    """Run each file through Google OCR and return a DataFrame of the data.

    One row per successfully processed file; columns are `fields`, filled with
    the matching entity mention text (None where an entity is absent). Files
    that fail with an InternalServerError are reported and skipped.
    """
    # None default instead of a mutable [] shared across calls
    fields = list(fields) if fields is not None else []
    rows = []
    for file in files:
        try:
            doc = process_document_sample(
                project_id="573919539759",
                location="us",
                processor_id="7b2493d94a089d26",
                processor_version_id="5e493494e810a1f3",
                file_path=file,
                mime_type="image/jpeg"
            )
            # Map each recognized entity onto the requested field columns
            row_data = {f: None for f in fields}
            for entity in doc.entities:
                if entity.type in row_data:
                    row_data[entity.type] = entity.mention_text
            rows.append(row_data)
        except InternalServerError:
            # filenames look like 'fileN.jpg' — fall back to the raw name
            # rather than crashing when no digits are present
            match = re.search(r'\d+', file)
            page_num = match.group() if match else file
            print(f'There was an internal error processing page {page_num}')
    # Build the frame once instead of concatenating per row (O(n^2) copies)
    return pd.DataFrame(rows, columns=fields)
def dataframe_from_reports(folder, columns):
    """OCR every file under `folder` (checkpoint dirs excluded) into a DataFrame."""
    paths = [
        os.path.join(root, name)
        for root, _, names in os.walk(folder)
        # notebook checkpoint copies must not be OCR'd twice
        if '.ipynb_checkpoints' not in root
        for name in names
    ]
    return extract_fields(paths, columns)
# a quick function that tells us if we are running in the huggingface space or not
def in_space():
    """Return True when running inside a Hugging Face Space (env-var check)."""
    space_markers = ('SPACE_ID', 'HUGGINGFACE_SPACE_REPOSITORY')
    return any(marker in os.environ for marker in space_markers)
# Script
def script(report, jpeg_foldername = 'images'):
    """End-to-end pipeline: PDF report -> page images -> OCR -> styled Excel.

    Returns the path of the written workbook ('out.xlsx'). Side effects:
    creates and removes `jpeg_foldername`, sets GOOGLE_APPLICATION_CREDENTIALS,
    and may write 'temp.json' when running inside a Space.
    """
    # First transform report to a folder of individual images
    report_to_jpegs(report, jpeg_foldername)
    # Load in our classifier and use it to define and delete irrelevant files
    classifier = load_learner('pr_classifier.pkl')
    others = define_others(jpeg_foldername, classifier)
    for o in others:
        Path(o).unlink()
    # Set credentials for using documentai
    if in_space():
        # get the secret (service-account JSON stored as a Space secret)
        creds = os.getenv('API_JSON_STR')
        # write to a file so we can use it as GOOGLE_APPLICATION_CREDENTIALS
        with open('temp.json', 'w') as file:
            file.write(creds)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'temp.json'
    else:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'quantum-spring-421822-9be9922d589f.json'
    # Reading in file to get fields variable; fields.txt is a comma-separated,
    # possibly single-quoted list of entity names the OCR processor emits
    with open('fields.txt', 'r') as file:
        fields = file.read().strip().replace("'", "").split(',')
    fields = [f.replace(' ', '') for f in fields]
    df = dataframe_from_reports(jpeg_foldername, fields)
    processed = unpivot_df(df_transform(df))
    excel_file = 'out.xlsx'
    processed.to_excel(excel_file, sheet_name='Processed Data')
    # append the raw OCR output as a second sheet of the same workbook
    with pd.ExcelWriter(excel_file, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name='Raw Data')
    overwrite_styles(excel_file)
    # the image folder is scratch space — remove it once the workbook exists
    shutil.rmtree(jpeg_foldername)
    return excel_file
def process_file(file):
    """Persist uploaded file bytes to a temporary PDF, run the pipeline, and
    return the path of the produced Excel workbook."""
    tmp_path = 'temp_report.pdf'
    with open(tmp_path, 'wb') as handle:
        handle.write(file)
    # Run the full extraction pipeline on the saved PDF
    result_path = script(tmp_path)
    # The temporary PDF is no longer needed once the workbook exists
    os.remove(tmp_path)
    return result_path
def split_names(name):
    """Split a full name into (first, last), dropping an optional middle initial.

    None/NaN/empty input yields ('', ''). A value that does not match
    'First [M ]Last' is returned unchanged as (name, '').
    """
    # Check for NaN BEFORE stripping: the original stripped first, so a float
    # NaN (which is not None) raised AttributeError on .strip()
    if name is None or (not isinstance(name, str) and pd.isna(name)):
        return '', ''
    name = str(name).strip()
    if name == '':
        return '', ''
    # first name, optional single middle initial, last name
    full_pattern = r'^([A-Za-z]+)\s([A-Za-z]\s)?([A-Za-z]+)'
    match = re.match(full_pattern, name)
    if match:
        return match.group(1), match.group(3)
    return name, ''
def extract_apt(address):
    """Return the apartment/unit designation found in `address`, or ''."""
    if pd.isna(address):
        return ''
    # keyword, whitespace, then an alphanumeric unit token
    unit_re = r'\b(?:Apt|Apartment|#|Unit|Suite)\s+(\w+)\b'
    found = re.search(unit_re, address.strip(), re.IGNORECASE)
    # group(0) keeps the keyword together with the unit number
    return found.group(0) if found else ''
def liable(val):
    """True when '25' appears in the string form of `val`.

    Covers values from either box 118a or 118b.
    """
    return '25' in str(val)
def df_transform(df):
    """Clean the raw OCR DataFrame: split names into first/last, extract
    apartment numbers, derive a Liable flag from boxes 118a/118b, and
    rename/reorder columns into the report layout.
    """
    # Blank out missing values and the literal string 'nan' left by OCR
    df = df.fillna('')
    df = df.replace(to_replace='nan', value='')
    # Force str so split_names never receives a non-string value
    df['box_26_name1'], df['box_56_name1'] = df['box_26_name1'].astype(str), df['box_56_name1'].astype(str)
    # Person 1 / person 2 name splits (first, last) as two-column frames
    names1_split = df['box_26_name1'].apply(lambda x: pd.Series(split_names(x)))
    names2_split = df['box_56_name1'].apply(lambda x: pd.Series(split_names(x)))
    names1_split.columns = ['First Name 1', 'Last Name 1']
    names2_split.columns = ['First Name 2', 'Last Name 2']
    names = pd.concat([names1_split, names2_split], axis=1)
    # Apartment/unit designators pulled from each street address
    apts1 = df['box_27_street_address1'].apply(lambda x: extract_apt(x))
    apts2 = df['box_57_street_address2'].apply(lambda x: extract_apt(x))
    apts = pd.concat([apts1, apts2], axis=1)
    apts.columns = ['Apts 1', 'Apts 2']
    # Liable if '25' appears in either box 118a or 118b
    liable_a, liable_b = df['box_118a'].apply(lambda x: liable(x)), df['box_118b'].apply(lambda x: liable(x))
    df['Liable'] = liable_a | liable_b
    df = df.join([names, apts])
    # Raw boxes are superseded by the derived columns above
    df = df.drop(columns = ['box_26_name1', 'box_56_name1', 'box_118a', 'box_118b', 'box_119a', 'box_119b'])
    df = df.rename(columns={'box_27_street_address1': 'Street Address 1', 'box_28_city1': 'City/Zip 1',
                            'box_57_street_address2': 'Street Address 2', 'box_58_city2': 'City/Zip 2',
                            'Police_Department_City':'Incident City'})
    # Fixed presentation order first, any leftover OCR columns appended after
    col_order = ['First Name 1', 'Last Name 1', 'Street Address 1', 'Apts 1','City/Zip 1',
                 'First Name 2', 'Last Name 2', 'Street Address 2', 'Apts 2', 'City/Zip 2',
                 'Incident City', 'Liable']
    remaining = [col for col in df.columns if col not in col_order]
    df = df.reindex(columns = col_order+remaining)
    return df
# expects dataframe returned from df_transform function
def unpivot_df(df):
    """Stack the person-1 and person-2 column groups of a df_transform frame
    into one row per person, sharing 'Incident City' and 'Liable'.

    NOTE(review): df_transform fills NaN with '' upstream, so the dropna
    calls below will not drop rows with empty-string names — confirm whether
    empty names should also be dropped.
    """
    # Drop a stray index column left by a CSV round-trip, if present
    if 'Unnamed: 0' in df.columns: df = df.drop(columns='Unnamed: 0')
    # first we separate both dataframes into 2
    df1 = (df[['First Name 1', 'Last Name 1', 'Street Address 1', 'Apts 1',
               'City/Zip 1', 'Incident City', 'Liable']]).copy()
    df2 = (df[['First Name 2', 'Last Name 2', 'Street Address 2',
               'Apts 2', 'City/Zip 2', 'Incident City', 'Liable']]).copy()
    # if the first person wasn't liable then this one is
    # (assumes exactly one of the two parties is liable per report — verify)
    df2['Liable'] = ~df2['Liable']
    # adding these lines to drop any rows where the person isn't captured
    df1.dropna(subset=['First Name 1', 'Last Name 1'], how='all', inplace=True)
    df2.dropna(subset=['First Name 2', 'Last Name 2'], how='all', inplace=True)
    # Align both halves to a common schema before stacking
    df1.rename(columns={'First Name 1': 'First Name', 'Last Name 1': 'Last Name', 'Street Address 1': 'Street Address',
                        'Apts 1': 'Apts','City/Zip 1': 'City/Zip'}, inplace=True)
    df2.rename(columns={'First Name 2': 'First Name', 'Last Name 2': 'Last Name', 'Street Address 2': 'Street Address',
                        'Apts 2': 'Apts','City/Zip 2': 'City/Zip'}, inplace=True)
    return pd.concat([df1, df2])
def fit_cols(filename):
    """Autosize every column of the workbook to its longest cell value.

    Empty (None) cells are ignored: the original took len(str(None)) == 4,
    which inflated blank columns to a minimum width.
    """
    wb = opxl.load_workbook(filename)
    for ws in wb:
        for col_cells in ws.columns:
            longest = max(
                (len(str(cell.value)) for cell in col_cells if cell.value is not None),
                default=0,
            )
            if longest > 0:
                col_letter = opxl.utils.get_column_letter(col_cells[0].column)
                # padding factor keeps wrapped/centered text from clipping
                ws.column_dimensions[col_letter].width = longest*1.25+0.2
    wb.save(filename)
def overwrite_styles(filename):
    """Apply thin borders and centered, wrapped alignment to every cell of the
    workbook, save it, then autosize the columns via fit_cols."""
    wb = opxl.load_workbook(filename)
    thin = opxl.styles.Side(style='thin')
    boxed = opxl.styles.Border(left=thin, right=thin, top=thin, bottom=thin)
    centered = opxl.styles.Alignment(horizontal='center', wrapText=True)
    for ws in wb:
        # row-wise iteration touches the same set of cells as column-wise
        for row in ws.iter_rows():
            for cell in row:
                cell.border = boxed
                cell.alignment = centered
    wb.save(filename)
    fit_cols(filename)
|