Spaces:
Sleeping
Sleeping
File size: 9,827 Bytes
362362d 6e805b9 0e7b36e 6e805b9 985808a 6e805b9 985808a 6e805b9 9edba37 6e805b9 a598534 6e805b9 9edba37 df646cb 6e805b9 df646cb 6e805b9 df646cb 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 9edba37 6e805b9 df646cb 6e805b9 e5f4b97 df646cb 9edba37 bd3440d 9edba37 bd3440d 6e805b9 50c7387 a36aede 9edba37 c38db77 9edba37 df646cb 182218c 985808a 182218c 9edba37 df646cb 985808a 9edba37 df646cb 9edba37 df646cb 9edba37 df646cb 9edba37 df646cb 9edba37 df646cb 9edba37 df646cb 9edba37 bd3440d 9edba37 bd3440d 9edba37 df646cb bd3440d 9edba37 bd3440d e5f4b97 0e7b36e 9edba37 0e7b36e 9edba37 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
import gradio as gr #type: ignore
import pymupdf #type: ignore
from PIL import Image
import os
from functions import get_image_informations
from dataSchema import *
# import shutil
def Noc_timeSheet_pdf_to_img(pdf_path, dpi: int = 300, quality: int = 95):
    """Render page 1 of a PDF to ``output.jpg`` with a horizontal band removed.

    The first page is rasterised, the strip between 30% and 42% of the page
    height is discarded, and the remaining top and bottom strips are stacked
    vertically and saved as a JPEG named ``output.jpg`` (relative to the
    current working directory).

    Args:
        pdf_path: Path of the PDF handed to ``pymupdf.open``.
        dpi: Resolution passed to ``page.get_pixmap``.
        quality: JPEG quality passed to ``Image.save``.
    """
    output_path = "output.jpg"

    pdf_document = pymupdf.open(pdf_path)
    try:
        # Only the first page (index 0) is ever used.
        page = pdf_document.load_page(0)
        pix = page.get_pixmap(dpi=dpi)
        # Build the PIL image while the document is still open.
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Release the file handle even if rendering fails.
        pdf_document.close()

    width, height = image.size
    end_y_first_table = int(height * 0.30)
    start_y_total_table = int(height * 0.42)

    # Keep everything above 30% and everything below 42% of the page height.
    upper_part = image.crop((0, 0, width, end_y_first_table))
    lower_part = image.crop((0, start_y_total_table, width, height))
    upper_width, upper_height = upper_part.size
    _, lower_height = lower_part.size

    # Stack the two strips: upper at the top, lower directly beneath it.
    combined_image = Image.new("RGB", (upper_width, upper_height + lower_height))
    combined_image.paste(upper_part, (0, 0))
    combined_image.paste(lower_part, (0, upper_height))
    combined_image.save(output_path, "JPEG", quality=quality)
def Clauses_in_invoice(pdf_path: str) -> bool:
    """Tell whether the last page of a PDF contains the word "clauses".

    The comparison is case-insensitive. PDFs with fewer than two pages, and
    any failure while opening or reading the file, yield ``False`` (the
    reason is printed).
    """
    document = None
    try:
        document = pymupdf.open(pdf_path)
        page_count = document.page_count
        if page_count < 2:
            print("The PDF has fewer than 2 pages.")
            return False
        final_page_text = document.load_page(page_count - 1).get_text()
        return "clauses" in final_page_text.lower()
    except Exception as e:
        print(f"error :{e}")
        return False
    finally:
        # Close the document only if it was actually opened.
        if document is not None:
            document.close()
def Clauses_in_invoice_2nd_version(pdf_path: str) -> bool:
    """Tell whether the second-to-last page of a PDF mentions "clauses".

    Matching ignores case. Returns ``False`` when the PDF has fewer than two
    pages or when opening/reading it raises (the error is printed).
    """
    opened_pdf = None
    try:
        opened_pdf = pymupdf.open(pdf_path)
        n_pages = opened_pdf.page_count
        if n_pages < 2:
            print("The PDF has fewer than 2 pages.")
            return False
        penultimate_page = opened_pdf.load_page(n_pages - 2)
        lowered_text = penultimate_page.get_text().lower()
        found = "clauses" in lowered_text
        return found
    except Exception as e:
        print(f"error :{e}")
        return False
    finally:
        # Nothing to close if pymupdf.open itself failed.
        if opened_pdf is not None:
            opened_pdf.close()
def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95) -> list:
    """Render every page of a PDF to its own JPEG inside ``folder_path``.

    Files are named ``<pdf basename>_page_<n>.jpg`` with ``n`` starting at 1.
    The folder is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to render.
        folder_path: Destination directory (trailing separators are stripped).
        dpi: Resolution passed to ``page.get_pixmap``.
        quality: JPEG quality passed to ``Image.save``.

    Returns:
        The list of written image paths, in page order.
    """
    pdf_document = pymupdf.open(pdf_path)
    try:
        folder_path = folder_path.rstrip(os.sep)
        os.makedirs(folder_path, exist_ok=True)
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        image_paths = []
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
            image.save(output_path, "JPEG", quality=quality)
            image_paths.append(output_path)
        return image_paths
    finally:
        # Always release the document, including when a page fails to render.
        pdf_document.close()
def delete_images(image_paths):
    """Best-effort removal of each path in ``image_paths``.

    Every path is reported on stdout as deleted, not found, or failed;
    a failure on one path never stops the remaining ones.
    """
    for target in image_paths:
        try:
            # Missing files are reported and skipped rather than treated as errors.
            if not os.path.exists(target):
                print(f"File not found: {target}")
                continue
            os.remove(target)
            print(f"Deleted: {target}")
        except Exception as err:
            print(f"Error deleting {target}: {err}")
def noc_invoice_extraction(pdf_path: str, folder_path):
    """Extract purchase-order data from a multi-page PDF, page by page.

    Each page is rendered to a JPEG and sent to ``get_image_informations``
    with a prompt/parser chosen by its position:

    * page 1            -> ``invoice_first_page_prompt``
    * page 2            -> ``invoice_item_page1_prompt``
    * middle pages      -> ``invoice_item_pages_prompt`` (items are appended
      to ``data["items"]``)
    * total page        -> ``invoice_total_page_prompt``
    * last page, only when ``Clauses_in_invoice`` is true
                        -> ``invoice_clauses_page_prompt``

    When a clauses page is present the total page is the second-to-last
    page; otherwise it is the last page.

    Args:
        pdf_path: Path of the PDF to process.
        folder_path: Directory in which the temporary page images are written.

    Returns:
        The merged results of all calls.
        NOTE(review): assumes ``get_image_informations`` returns a dict and
        that the item parsers yield an ``"items"`` list — confirm in
        ``functions`` / ``dataSchema``.

    Raises:
        ValueError: If the PDF has fewer than two pages.
    """
    image_paths = Noc_invoice_pdf_to_img(pdf_path, folder_path)
    try:
        page_total = len(image_paths)
        if page_total < 2:
            # Fail before spending a model call on a document that cannot be parsed.
            raise ValueError(
                f"Expected at least 2 pages in {pdf_path!r}, got {page_total}."
            )

        data = {}
        data.update(
            get_image_informations(image_paths[0], invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)
        )
        data.update(
            get_image_informations(image_paths[1], invoice_item_page1_prompt, Noc_PurchaseOrder_item1_parser)
        )

        has_clauses = Clauses_in_invoice(pdf_path)
        # Pages after the item pages: total (+ clauses when present).
        trailing_pages = 2 if has_clauses else 1

        for image_path in image_paths[2:page_total - trailing_pages]:
            new_item = get_image_informations(image_path, invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
            data["items"].extend(new_item["items"])

        total_page = image_paths[-2] if has_clauses else image_paths[-1]
        data.update(
            get_image_informations(total_page, invoice_total_page_prompt, Noc_PurchaseOrder_total_parser)
        )

        if has_clauses:
            data.update(
                get_image_informations(image_paths[-1], invoice_clauses_page_prompt, Noc_PurchaseOrder_clauses_parser)
            )
        return data
    finally:
        # Remove the rendered pages on success and on failure alike.
        delete_images(image_paths)
def pdf_to_img(pdf_path, dpi: int = 300, quality: int = 95):
    """Render the first page of a PDF to ``output.jpg``.

    The JPEG is written relative to the current working directory.

    Args:
        pdf_path: Path of the PDF handed to ``pymupdf.open``.
        dpi: Resolution passed to ``page.get_pixmap``.
        quality: JPEG quality passed to ``Image.save``.
    """
    output_path = "output.jpg"

    pdf_document = pymupdf.open(pdf_path)
    try:
        page = pdf_document.load_page(0)  # first page only
        pix = page.get_pixmap(dpi=dpi)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Release the file handle even if rendering fails.
        pdf_document.close()

    image.save(output_path, "JPEG", quality=quality)
def process_file(file, option):
    """Gradio callback: route an uploaded PDF/image to the matching extractor.

    Args:
        file: The uploaded file. Either an object exposing the path as
            ``.name`` or a plain path string.
        option: One of ``"Noc_timesheet_new"``, ``"Noc_invoice"``,
            ``"Noc_timesheet_residential_old"``,
            ``"Noc_timesheet_rotational_old"`` or ``"Noc_PO"``.

    Returns:
        The extractor's result on success, otherwise a human-readable message
        (no file, unsupported extension, missing/unknown option, or the text
        of any exception raised during processing).
    """
    if file is None:
        return "Please upload a PDF or image file."
    try:
        save_dir = "uploaded_files"
        os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

        # Accept both file-like objects carrying ``.name`` and bare path strings.
        file_path = getattr(file, "name", file)
        file_extension = os.path.splitext(file_path)[1].lower()
        is_pdf = file_extension == ".pdf"
        if not is_pdf and file_extension not in (".jpg", ".jpeg", ".png"):
            return "Unsupported file type. Please upload a PDF or image file."

        if option == "Noc_PO":
            if is_pdf:
                return noc_invoice_extraction(file_path, save_dir)
            # For purchase-order images, we assume it's a single page.
            return get_image_informations(file_path, invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)

        # Every remaining option is "convert page 1 if PDF, then run one prompt".
        if option == "Noc_timesheet_residential_old":
            to_image, prompt, parser = Noc_timeSheet_pdf_to_img, Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser
        elif option == "Noc_timesheet_rotational_old":
            to_image, prompt, parser = Noc_timeSheet_pdf_to_img, Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser
        elif option == "Noc_timesheet_new":
            to_image, prompt, parser = pdf_to_img, Noc_timesheet_prompt, Noc_timesheet_parser_v1
        elif option == "Noc_invoice":
            to_image, prompt, parser = pdf_to_img, Noc_invoice_prompt, Noc_invoice_parser_v1
        else:
            # No radio button selected, or an option this function does not know.
            return "Please choose a valid processing option."

        image_path = file_path
        if is_pdf:
            # Both converters write their result to "output.jpg".
            to_image(file_path)
            image_path = "output.jpg"
        return get_image_informations(image_path, prompt, parser)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"An error occurred: {e}"
# Define the Gradio interface: one file upload plus a radio selecting the extractor.
demo = gr.Interface(
    fn=process_file,
    inputs=[
        gr.File(label="Upload PDF or Image"),  # File upload input
        gr.Radio(
            [
                "Noc_timesheet_new",
                "Noc_invoice",
                "Noc_timesheet_residential_old",
                "Noc_timesheet_rotational_old",
                "Noc_PO",
            ],
            label="Choose an option",
        ),  # Radio buttons for options
    ],
    outputs="text",  # Text output
    title="PDF/Image Processor",
    description="Upload a PDF or image and choose an option to process the content.",
)

# Wrap the interface in a Blocks layout so example images can be shown below it.
with gr.Blocks() as app:
    demo.render()
    gr.Markdown("### PDF/Image examples")  # Section title
    with gr.Row():
        gr.Image("TS.png", label="NOC timesheet example")
        gr.Image("invoice.png", label="NOC invoice example")

app.launch()