Sina-Haz committed on
Commit
57a8d2e
1 Parent(s): 5cd4d16

uploading source code

Files changed (8)
  1. .gitignore +3 -0
  2. app.py +17 -0
  3. fields.txt +1 -0
  4. fns.py +227 -0
  5. pr_classifier.pkl +3 -0
  6. quantum-spring-421822-5b13d9d18bde.json +13 -0
  7. requirements.txt +9 -0
  8. split.py +74 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__
+ .ipynb_checkpoints
+ out.xlsx
app.py ADDED
@@ -0,0 +1,17 @@
+ import gradio as gr
+ from fns import *
+
+
+
+ # Define the Gradio interface
+ iface = gr.Interface(
+     fn=process_file,
+     inputs=gr.File(type='binary', label='Upload Report'),
+     outputs=gr.File(type='filepath', label='Processed Excel File'),
+     title='Report Processor',
+     description='Upload a report to process and download the resulting Excel file'
+ )
+
+ # Launch the interface
+ iface.launch()
+
fields.txt ADDED
@@ -0,0 +1 @@
+ 'box_26_name1', 'box_27_street_address1', 'box_28_city1', 'box_56_name1', 'box_57_street_address2', 'box_58_city2', 'box_118a', 'box_118b', 'box_119a', 'box_119b', 'Police_Department_City'
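Note: fields.txt stores the column names as a single comma-separated line of quoted names. A minimal parsing sketch (the same cleanup that script() in fns.py performs; not part of the commit):

    with open('fields.txt', 'r') as f:
        fields = f.read().strip().replace("'", "").split(',')
    fields = [name.replace(' ', '') for name in fields]
    # -> ['box_26_name1', 'box_27_street_address1', ..., 'Police_Department_City']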
fns.py ADDED
@@ -0,0 +1,227 @@
+ #/export
+ import os
+ import PyPDF2
+ from pathlib import Path
+ from pdf2image import convert_from_path
+ import numpy as np
+ from split import *
+ import fastai
+ from fastai.learner import load_learner
+ from fastai.vision.core import PILImage
+ import pandas as pd
+ from collections import OrderedDict
+ import re
+ from google.api_core.exceptions import InternalServerError
+ import shutil
+ from typing import Optional
+ from google.api_core.client_options import ClientOptions
+ from google.cloud import documentai  # type: ignore
+
+
+
+ # Make a mini report batch for testing
+ def make_mini_batch(infile, outfile, bs=15):
+     reader = PyPDF2.PdfReader(infile)
+     rand_pgs = list(np.random.choice(len(reader.pages), bs, replace=False))
+     writer = PyPDF2.PdfWriter()
+     for pg in rand_pgs:
+         page = reader.pages[int(pg)]
+         writer.add_page(page)
+     writer.write(outfile)
+
+
+ # Now define a function that outputs a folder of individual .jpgs for a batch report
+ def report_to_jpegs(filename, outfolder):
+     reader = PyPDF2.PdfReader(filename)
+     path = Path(outfolder)
+     if not path.exists():
+         path.mkdir()
+     for i, page in enumerate(reader.pages):
+         writer = PyPDF2.PdfWriter()
+         dest = (path/f'file{i}.pdf')
+         writer.add_page(page)
+         writer.write(dest)
+     folder_to_img(outfolder)
+
+
+ def define_others(folder, classifier):
+     other_files = []  # A list of files to unlink
+     for root, _, filelist in os.walk(folder):
+         if '.ipynb_checkpoints' in root:
+             continue
+         for file in filelist:
+             path = os.path.join(root, file)
+             img = PILImage.create(path)
+             _, idx, _ = classifier.predict(img)
+             if idx.item() == 1:
+                 other_files.append(path)
+     return other_files
+
+
+
+ # Importing boilerplate Document AI code to process a file
+
+ # [START documentai_process_document]
+ # [START documentai_process_document_processor_version]
+
+
+ # TODO(developer): Uncomment these variables before running the sample.
+ # project_id = "YOUR_PROJECT_ID"
+ # location = "YOUR_PROCESSOR_LOCATION"  # Format is "us" or "eu"
+ # processor_id = "YOUR_PROCESSOR_ID"  # Create processor before running sample
+ # file_path = "/path/to/local/pdf"
+ # mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
+ # field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
+ # processor_version_id = "YOUR_PROCESSOR_VERSION_ID"  # Optional. Processor version to use
+
+
+ def process_document_sample(
+     project_id: str,
+     location: str,
+     processor_id: str,
+     file_path: str,
+     mime_type: str,
+     field_mask: Optional[str] = None,
+     processor_version_id: Optional[str] = None,
+ ) -> documentai.Document:
+     # You must set the `api_endpoint` if you use a location other than "us".
+     opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
+
+     client = documentai.DocumentProcessorServiceClient(client_options=opts)
+
+     if processor_version_id:
+         # The full resource name of the processor version, e.g.:
+         # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
+         name = client.processor_version_path(
+             project_id, location, processor_id, processor_version_id
+         )
+     else:
+         # The full resource name of the processor, e.g.:
+         # `projects/{project_id}/locations/{location}/processors/{processor_id}`
+         name = client.processor_path(project_id, location, processor_id)
+
+     # Read the file into memory
+     with open(file_path, "rb") as image:
+         image_content = image.read()
+
+     # Load binary data
+     raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)
+
+     # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
+     # Optional: Additional configurations for processing.
+     process_options = documentai.ProcessOptions(
+         # Process only specific pages
+         individual_page_selector=documentai.ProcessOptions.IndividualPageSelector(
+             pages=[1]
+         )
+     )
+
+     # Configure the process request
+     request = documentai.ProcessRequest(
+         name=name,
+         raw_document=raw_document,
+         field_mask=field_mask,
+         process_options=process_options,
+     )
+
+     result = client.process_document(request=request)
+
+     # For a full list of `Document` object attributes, reference this page:
+     # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
+     document = result.document
+
+     # Read the text recognition output from the processor
+     # print("The document contains the following text:")
+     # print(document.text)
+     return document
+
+
+ # [END documentai_process_document_processor_version]
+ # [END documentai_process_document]
+
+
+ # Function that takes in a list of filenames, runs each through Google OCR and returns a pandas DataFrame of the data
+ def extract_fields(files, fields=[]):
+     # Initialize an empty DataFrame with the specified fields as columns
+     df = pd.DataFrame(columns=fields)
+
+     for file in files:
+         try:
+             doc = process_document_sample(
+                 project_id="573919539759",
+                 location="us",
+                 processor_id="7b2493d94a089d26",
+                 file_path=file,
+                 mime_type="image/jpeg"
+             )
+             # Initialize a dictionary to hold the entity mentions for the current document
+             row_data = {f: None for f in fields}
+
+             for entity in doc.entities:
+                 if entity.type in row_data:
+                     row_data[entity.type] = entity.mention_text
+
+             # Convert the row data to a DataFrame and concatenate it
+             df = pd.concat([df, pd.DataFrame([row_data])], ignore_index=True)
+
+         except InternalServerError as e:
+             page_num = re.search(r'\d+', file).group()
+             print(f'There was an internal error processing page {page_num}')
+
+     return df
+
+ def dataframe_from_reports(folder, columns):
+     files = []
+     for root, _, filelist in os.walk(folder):
+         if '.ipynb_checkpoints' in root:
+             continue
+         for file in filelist:
+             path = os.path.join(root, file)
+             files.append(path)
+     return extract_fields(files, columns)
+
+
+
+ # Script
+ def script(report, jpeg_foldername='images'):
+     # First transform report to a folder of individual images
+     report_to_jpegs(report, jpeg_foldername)
+
+     # Load in our classifier and use it to define and delete irrelevant files
+     classifier = load_learner('pr_classifier.pkl')
+     others = define_others(jpeg_foldername, classifier)
+     for o in others:
+         Path(o).unlink()
+
+     # Set credentials for using documentai
+     os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'quantum-spring-421822-5b13d9d18bde.json'
+
+     # Reading in file to get fields variable
+     with open('fields.txt', 'r') as file:
+         fields = file.read().strip().replace("'", "").split(',')
+
+     fields = [f.replace(' ', '') for f in fields]
+
+     df = dataframe_from_reports(jpeg_foldername, fields)
+     excel_file = 'out.xlsx'
+     df.to_excel(excel_file, index=False)
+     shutil.rmtree(jpeg_foldername)
+
+     return excel_file
+
+
+ def process_file(file):
+     # Save the uploaded file to a temporary location
+     temp_file_path = 'temp_report.pdf'
+     with open(temp_file_path, 'wb') as temp_file:
+         temp_file.write(file)
+
+     # Run the script and get the path to the Excel file
+     excel_file_path = script(temp_file_path)
+
+     # Clean up the temporary file
+     os.remove(temp_file_path)
+
+     return excel_file_path
+
+
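For reference, a minimal end-to-end usage sketch of fns.py outside the Gradio interface (assumes a local 'report.pdf', the bundled pr_classifier.pkl, and valid Document AI credentials; not part of the commit):

    from fns import process_file

    with open('report.pdf', 'rb') as f:
        excel_path = process_file(f.read())  # splits pages, filters with the classifier, runs OCR, writes out.xlsx
    print(excel_path)  # 'out.xlsx'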
pr_classifier.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fcaa83e9029fac519d6704ce9fd986417ca81a7f30e40cc9bc289eef932604b2
+ size 46964734
quantum-spring-421822-5b13d9d18bde.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "type": "service_account",
+ "project_id": "quantum-spring-421822",
+ "private_key_id": "5b13d9d18bde80758227fc7479a602cdeb2dfd48",
+ "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQClpcyrADEgAoyj\nbqROddNs7tKbOiDR9hTrHCMJLV9xIVKPOlwU51dK4OiM87V/LD99Ia9zzUUaGKEv\nh3YKfmyz3RcXUI5xAMlr/tilkLQXusqNK8SR8LTxAO3kf9q3aKnjaxVFeilWqcZe\nrXqjzqAU2pWXOvOSdW/pYGVA0vhY31QiiwCIenfYqicZk/WfR6A0Cp/brEn725g/\nnzyD11hNuaKci2vnUEz3sMHKFcBwPd2XV7FZTY/bSSFFoHcjbRpuJdF7d6r7hzz0\naHH8MxoMuv6fxZtteNezSQMCy7mKFccrcnhLvSCA9UvsLuJbOIqUz9G9Llx+NmSi\nXyxH1YsJAgMBAAECggEACzSKQgYHDLCHdXXCURlo1i1Ym04FEtbn5tiWKSSD2IoF\nxPEQcgyNK+BHcuN4g1QJrmwFK5r24fNJEgdCjaasMUWTLo1ROWUxeuf4HUkK2bEV\nyxNrHmkvQD9NGGAqui3vEhZ4D6VaHwXoiJaJHYQYcZiwtlRzThS6H4oZB1fOG2rS\nLB9f1Pl2j0r6JZmq5qSjH0nlcp/5nVWIypMgqGXyESv+LBO+rMrXbw0cOpGsufR6\nWkO8/6nLo0LTnFpXiFvt2A+ScjO34R/PTw2YN5g3R+1nsKWwYi/pGo6kYoqQjNum\n2MzQLsULyDYtuU6LABZ9rUgA+/XNddyhHCvIJAPL1QKBgQDVcM/RwvfGLcZL6qG3\naZuJSDjRKSozY2yKrCZfmZ2UfBatcVRK0we45FlWGHk4KwNHVSzGPXyM/h46xaGq\nuhpXoTj3znJgUR+WQVWRCa5O2Ff83cgC3X58ZwmAuNlWn5+xHsize8wt8blyWxcD\nYuDbnyv3tIAXtVVv5CiRD9LibQKBgQDGrV5jbf6khALElgSmOFrta8DnOTQwtRiH\nVylkTDMCvX6O5cf/e9H0ynqHNq1OAyCc09NGQ/nR4MsM4+0xVr7zIk6DCre0uWhk\nK5Ri2Wc/7cSL7Ng/Tz4lvaVcXk4UtEFpE2VMUlTm4UrWqniti/+FDKuH7AW+QdX4\nuXwDRf0JjQKBgQCO2lVNaDdbXZGlh+E6l6Uw6bSVUP7crAOegH+iZncQRUkMRpiL\nB9jYR4T/WZvwGuGz4W75M3fkCcSDw3hjgvFUiyRMvtDQZBtG/m1diGQwgx3IsUtH\nl6Urb+7tR0q3blh8PEVJ02ahrXJX9l1dt9UVSOAthpJgy83l+IeqJzQWtQKBgAhP\n/KEAWyUa+NqsxCkINpW0am/CLvLSwOH4vZyaq4+zyfXEm2ePEKvDRPkfSoT3EIg/\nPJm2Al/LnfM+HhBwZrDZgJtu+VywXTPDtwOjwPQlyJFqKWoMsLb3hlGlCHNJT2tM\nCnL1BuRaDL2KiF4Ke3hfnVWkhGD/dCph6FRiu0GJAoGANEa186PmEaEjtfDFck9j\nrBiADAXPSnu/SpHzPvO3EJ93yTpNjrWMzxOtzoobcK+5NMPtiR4EmR1p6j+PTMuP\nsTPu6+uQ1MPnfWA+m+Kkzqmk/Fa0HmiDsvXIINt/1weWSKHFBp27u5XAiY41xGBg\n/yO8vfCBKvsFWK5ciAxJAGM=\n-----END PRIVATE KEY-----\n",
+ "client_email": "sinas-acct@quantum-spring-421822.iam.gserviceaccount.com",
+ "client_id": "117896840412672049182",
+ "auth_uri": "https://accounts.google.com/o/oauth2/auth",
+ "token_uri": "https://oauth2.googleapis.com/token",
+ "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
+ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/sinas-acct%40quantum-spring-421822.iam.gserviceaccount.com",
+ "universe_domain": "googleapis.com"
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ PyPDF2
+ pdf2image
+ numpy
+ fastai
+ pandas
+ typer>=0.12.3
+ google-api-core
+ google-cloud-documentai
+ openpyxl
split.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import PyPDF2
+ from pathlib import Path
+ from pdf2image import convert_from_path
+
+ def get_pg_x(infile, outfile, x=0):
+     reader = PyPDF2.PdfReader(infile)
+     first_pg = reader.pages[x]
+     writer = PyPDF2.PdfWriter()
+     writer.add_page(first_pg)
+     writer.write(outfile)
+
+
+
+ def get_mult_pgs(infile, out):
+     reader = PyPDF2.PdfReader(infile)
+     pg_ind = list(range(99, 123, 2))  # Basically get every other page (a very naive way of splitting it up)
+     pages = [reader.pages[i] for i in pg_ind]
+
+     if not os.path.exists(out):
+         os.makedirs(out)
+     for i, p in enumerate(pages):
+         writer = PyPDF2.PdfWriter()
+         filepath = os.path.join(out, f'case_{i+5}.pdf')
+         writer.add_page(p)
+         writer.write(filepath)
+
+
+ def splitall(infile, out):
+     path = Path(out)
+     if not path.exists():
+         path.mkdir()
+     reader = PyPDF2.PdfReader(infile)
+
+     for i in range(len(reader.pages)):
+         writer = PyPDF2.PdfWriter()
+         if i % 2 == 1:
+             dest = (path/'other')
+             dest.mkdir(exist_ok=True)
+             fpath = dest.joinpath(f'other{i//2}.pdf')
+             writer.add_page(reader.pages[i])
+             writer.write(fpath)
+         else:
+             dest = (path/'cases')
+             dest.mkdir(exist_ok=True)
+             fpath = dest.joinpath(f'case{i//2}.pdf')
+             writer.add_page(reader.pages[i])
+             writer.write(fpath)
+
+ def folder_to_img(folder):
+     for root, _, filelist in os.walk(folder):
+         for file in filelist:
+             if file.lower().endswith('.pdf'):
+                 pdf_path = os.path.join(root, file)
+                 outfolder = root
+                 try:
+                     im = convert_from_path(pdf_path)
+                     outfile = os.path.join(outfolder, f'{os.path.splitext(os.path.basename(pdf_path))[0]}.jpg')
+                     im[0].save(outfile, 'JPEG')
+                 except Exception as e:
+                     print(f"Error converting {pdf_path}: {e}")
+                 Path(pdf_path).unlink()
+
+
+
+ if __name__ == '__main__':
+     # splitall('report_batch.pdf', 'data')
+     folder_to_img('data/cases')
+     folder_to_img('data/other')
+
+
+
+
+
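For reference, a minimal usage sketch of split.py (assumes a local 'report_batch.pdf' in which case pages sit at even indices and supplementary pages at odd indices, as splitall() expects; not part of the commit):

    from split import splitall, folder_to_img

    splitall('report_batch.pdf', 'data')  # even pages -> data/cases, odd pages -> data/other
    folder_to_img('data/cases')           # convert each single-page PDF to a JPEG, then delete the PDF
    folder_to_img('data/other')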