Spaces:

amamrnaf
/

data_extraction_demo

Sleeping

File size: 9,827 Bytes

362362d
 
6e805b9
 
 
 
0e7b36e
6e805b9
985808a
6e805b9
985808a
6e805b9
 
 
 
 
 
 
 
 
 
9edba37
 
6e805b9
a598534
 
6e805b9
 
 
 
 
 
 
 
 
 
 
9edba37
df646cb
6e805b9
 
 
 
df646cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e805b9
df646cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e805b9
 
 
 
 
 
 
9edba37
6e805b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9edba37
 
6e805b9
9edba37
6e805b9
9edba37
6e805b9
 
9edba37
 
6e805b9
 
9edba37
6e805b9
9edba37
6e805b9
 
 
 
9edba37
 
6e805b9
 
df646cb
6e805b9
 
 
e5f4b97
df646cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9edba37
bd3440d
9edba37
bd3440d
 
6e805b9
 
50c7387
a36aede
9edba37
c38db77
9edba37
 
df646cb
182218c
985808a
182218c
9edba37
 
df646cb
985808a
9edba37
 
df646cb
9edba37
 
df646cb
 
 
 
 
 
 
 
9edba37
 
df646cb
9edba37
 
df646cb
9edba37
 
df646cb
9edba37
 
 
df646cb
 
 
 
 
 
9edba37
 
bd3440d
 
 
 
 
9edba37
bd3440d
9edba37
df646cb
bd3440d
 
9edba37
 
bd3440d
e5f4b97
0e7b36e
 
9edba37
0e7b36e
 
 
 
9edba37

import gradio as gr #type: ignore
import pymupdf  #type: ignore
from PIL import Image
import os 
from functions import get_image_informations
from dataSchema import *
# import shutil

def Noc_timeSheet_pdf_to_img(pdf_path, dpi: int = 300, quality: int = 95, output_path: str = "output.jpg"):
    """Render the first PDF page, cut out its middle band, and save a JPEG.

    The first page is rasterised, the horizontal band between 30% and 42% of
    the page height is discarded, and the remaining top and bottom strips are
    stacked vertically into a single image.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Resolution used when rasterising the page.
        quality: JPEG quality handed to Pillow when saving.
        output_path: Destination of the JPEG. Defaults to "output.jpg" in the
            current working directory, which is where existing callers look.

    Returns:
        The path the JPEG was written to.
    """
    # The context manager closes the document even if rendering raises.
    with pymupdf.open(pdf_path) as pdf_document:
        page = pdf_document.load_page(0)  # 0 is the first page
        pix = page.get_pixmap(dpi=dpi)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    width, height = image.size
    # NOTE(review): the 0.30 / 0.42 fractions presumably match one fixed
    # timesheet layout (header table above, totals table below) - confirm.
    end_y_first_table = int(height * 0.30)
    start_y_total_table = int(height * 0.42)

    upper_part = image.crop((0, 0, width, end_y_first_table))
    lower_part = image.crop((0, start_y_total_table, width, height))
    upper_width, upper_height = upper_part.size
    _, lower_height = lower_part.size

    # Stack the two strips: upper at the top, lower directly beneath it.
    combined_image = Image.new("RGB", (upper_width, upper_height + lower_height))
    combined_image.paste(upper_part, (0, 0))
    combined_image.paste(lower_part, (0, upper_height))

    combined_image.save(output_path, "JPEG", quality=quality)
    return output_path
 
def Clauses_in_invoice(pdf_path: str) -> bool:
    """Report whether the last page of a PDF contains the word "clauses".

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        True when the PDF has at least two pages and the text of its last
        page contains "clauses" (case-insensitive). False otherwise,
        including when the PDF cannot be opened or read.
    """
    try:
        # The context manager closes the document on every exit path.
        with pymupdf.open(pdf_path) as pdf_document:
            total_pages = pdf_document.page_count
            if total_pages < 2:
                print("The PDF has fewer than 2 pages.")
                return False
            last_page_text = pdf_document.load_page(total_pages - 1).get_text()
    except Exception as e:
        # Best-effort check: any failure is reported and treated as "no clauses".
        print(f"error :{e}")
        return False

    return "clauses" in last_page_text.lower()
            
def Clauses_in_invoice_2nd_version(pdf_path: str) -> bool:
    """Report whether the second-to-last page of a PDF contains "clauses".

    Args:
        pdf_path: Path of the PDF to inspect.

    Returns:
        True when the PDF has at least two pages and the text of its
        second-to-last page contains "clauses" (case-insensitive). False
        otherwise, including when the PDF cannot be opened or read.
    """
    try:
        # The context manager closes the document on every exit path.
        with pymupdf.open(pdf_path) as pdf_document:
            total_pages = pdf_document.page_count
            if total_pages < 2:
                print("The PDF has fewer than 2 pages.")
                return False
            page_text = pdf_document.load_page(total_pages - 2).get_text()
    except Exception as e:
        # Best-effort check: any failure is reported and treated as "no clauses".
        print(f"error :{e}")
        return False

    return "clauses" in page_text.lower()
 
def Noc_invoice_pdf_to_img(pdf_path: str, folder_path: str, dpi: int = 300, quality: int = 95):
    """Render every page of a PDF to its own JPEG inside ``folder_path``.

    Files are named ``<pdf basename>_page_<n>.jpg`` with ``n`` starting at 1.
    The folder is created if it does not exist.

    Args:
        pdf_path: Path of the PDF to render.
        folder_path: Directory that receives the JPEG files.
        dpi: Resolution used when rasterising each page.
        quality: JPEG quality handed to Pillow when saving.

    Returns:
        The list of written image paths, in page order.
    """
    image_paths = []
    # The context manager closes the document even if a page fails to render
    # or save part-way through the loop.
    with pymupdf.open(pdf_path) as pdf_document:
        folder_path = folder_path.rstrip(os.sep)
        os.makedirs(folder_path, exist_ok=True)

        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            output_path = os.path.join(folder_path, f"{pdf_name}_page_{page_num + 1}.jpg")
            image.save(output_path, "JPEG", quality=quality)
            image_paths.append(output_path)

    return image_paths

def delete_images(image_paths):
    """Remove each listed file, printing one status line per path.

    A path that does not exist is reported and skipped; any error raised
    while handling a path is printed and does not stop the remaining ones.
    """
    for target in image_paths:
        try:
            # Guard clause: nothing to remove when the path is absent.
            if not os.path.exists(target):
                print(f"File not found: {target}")
                continue
            os.remove(target)
            print(f"Deleted: {target}")
        except Exception as err:
            print(f"Error deleting {target}: {err}")

def noc_invoice_extraction(pdf_path: str, folder_path):
    """Extract purchase-order data from a multi-page PDF, page by page.

    The PDF is rendered to one image per page and each image is sent to
    ``get_image_informations`` with a page-specific prompt and parser:

    * page 1: general information
    * page 2: first item page
    * middle pages: further item pages, appended to ``data["items"]``
    * the totals page: the last page, or the second-to-last page when
      ``Clauses_in_invoice`` reports that the last page holds clauses
    * the clauses page: the last page, only in that second case

    The rendered images are always deleted afterwards, including when
    extraction fails.

    Args:
        pdf_path: Path of the PDF to process.
        folder_path: Directory used for the temporary page images.

    Returns:
        A dict merging the results of all pages.

    Raises:
        ValueError: If the PDF has fewer than 2 pages.
    """
    image_paths = Noc_invoice_pdf_to_img(pdf_path, folder_path)
    try:
        if len(image_paths) < 2:
            raise ValueError(
                f"Expected a PDF with at least 2 pages, got {len(image_paths)}."
            )

        data = {}
        data.update(get_image_informations(image_paths[0], invoice_first_page_prompt, Noc_PurchaseOrder_information_parser))
        data.update(get_image_informations(image_paths[1], invoice_item_page1_prompt, Noc_PurchaseOrder_item1_parser))

        has_clauses = Clauses_in_invoice(pdf_path)
        # Pages at the end that are not item pages: totals (+ clauses).
        trailing_pages = 2 if has_clauses else 1

        for image_path in image_paths[2:-trailing_pages]:
            new_item = get_image_informations(image_path, invoice_item_pages_prompt, Noc_PurchaseOrder_items_parser)
            # presumably the page-2 parser already produced an "items" list;
            # verify against the parsers in dataSchema
            data["items"].extend(new_item["items"])

        data.update(get_image_informations(image_paths[-trailing_pages], invoice_total_page_prompt, Noc_PurchaseOrder_total_parser))
        if has_clauses:
            data.update(get_image_informations(image_paths[-1], invoice_clauses_page_prompt, Noc_PurchaseOrder_clauses_parser))
        return data
    finally:
        # Clean up the temporary page images on success and on failure.
        delete_images(image_paths)

       
def pdf_to_img(pdf_path, dpi: int = 300, quality: int = 95, output_path: str = "output.jpg"):
    """Render the first page of a PDF and save it as a JPEG.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Resolution used when rasterising the page.
        quality: JPEG quality handed to Pillow when saving.
        output_path: Destination of the JPEG. Defaults to "output.jpg" in the
            current working directory, which is where existing callers look.

    Returns:
        The path the JPEG was written to.
    """
    # The context manager closes the document even if rendering raises.
    with pymupdf.open(pdf_path) as pdf_document:
        page = pdf_document.load_page(0)  # Load the first page
        pix = page.get_pixmap(dpi=dpi)
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    image.save(output_path, "JPEG", quality=quality)
    return output_path


def process_file(file, option):
    """Run the extraction selected by ``option`` on an uploaded PDF or image.

    PDFs are first converted to an image (or, for "Noc_PO", to one image per
    page); image uploads are passed to ``get_image_informations`` directly.

    Args:
        file: The upload. Either an object exposing the file path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        option: One of "Noc_timesheet_new", "Noc_invoice",
            "Noc_timesheet_residential_old", "Noc_timesheet_rotational_old"
            or "Noc_PO".

    Returns:
        The extraction result, or a human-readable message when the upload is
        missing, the file type is unsupported, the option is not recognised,
        or an error occurred.
    """
    if file is None:
        return "Please upload a PDF or image file."

    try:
        save_dir = "uploaded_files"
        os.makedirs(save_dir, exist_ok=True)  # Create the directory if it doesn't exist

        # Accept both upload objects carrying the path in .name and bare paths.
        file_path = getattr(file, "name", file)
        file_extension = os.path.splitext(file_path)[1].lower()

        is_pdf = file_extension == ".pdf"
        if not is_pdf and file_extension not in (".jpg", ".jpeg", ".png"):
            return "Unsupported file type. Please upload a PDF or image file."

        if option == "Noc_PO":
            if is_pdf:
                return noc_invoice_extraction(file_path, save_dir)
            # For invoice images, we assume it's a single page
            return get_image_informations(file_path, invoice_first_page_prompt, Noc_PurchaseOrder_information_parser)

        # Every remaining option is "one image in, one parsed result out";
        # they differ only in prompt, parser and how a PDF becomes an image.
        if option == "Noc_timesheet_residential_old":
            prompt, parser, to_image = Noc_Res_timesheet_prompt, Noc_Res_timeSheet_parser, Noc_timeSheet_pdf_to_img
        elif option == "Noc_timesheet_rotational_old":
            prompt, parser, to_image = Noc_Rot_timesheet_prompt, Noc_Rot_timeSheet_parser, Noc_timeSheet_pdf_to_img
        elif option == "Noc_timesheet_new":
            prompt, parser, to_image = Noc_timesheet_prompt, Noc_timesheet_parser_v1, pdf_to_img
        elif option == "Noc_invoice":
            prompt, parser, to_image = Noc_invoice_prompt, Noc_invoice_parser_v1, pdf_to_img
        else:
            # Previously fell through and returned None (e.g. no radio choice).
            return "Please choose a valid processing option."

        if is_pdf:
            # NOTE(review): the converters write to a fixed "output.jpg" in the
            # working directory, so concurrent requests can overwrite each other.
            to_image(file_path)
            file_path = "output.jpg"
        return get_image_informations(file_path, prompt, parser)
    except Exception as e:
        return f"An error occurred: {e}"

# Gradio UI: one upload + one option selector feeding process_file,
# followed by a row of example pictures.
_option_choices = [
    "Noc_timesheet_new",
    "Noc_invoice",
    "Noc_timesheet_residential_old",
    "Noc_timesheet_rotational_old",
    "Noc_PO",
]
_file_input = gr.File(label="Upload PDF or Image")
_option_input = gr.Radio(_option_choices, label="Choose an option")

demo = gr.Interface(
    fn=process_file,
    inputs=[_file_input, _option_input],
    outputs="text",
    title="PDF/Image Processor",
    description="Upload a PDF or image and choose an option to process the content.",
)

# Wrap the interface in a Blocks page so the example images can sit below it.
with gr.Blocks() as app:
    demo.render()
    gr.Markdown("### PDF/Image examples")
    with gr.Row():
        gr.Image("TS.png", label="NOC timesheet example")
        gr.Image("invoice.png", label="NOC invoice example")

app.launch()