RollNoExtractionCopy

Runtime error

File size: 8,748 Bytes

import gradio as gr
from tempfile import TemporaryDirectory, NamedTemporaryFile
from pdf2image import convert_from_path
from PIL import Image
import os
from io import BytesIO
import base64
import requests
import pandas as pd
import json
import logging
import re


# Install the system package that pdf2image's convert_from_path depends on.
# NOTE(review): this runs on every import and needs root inside the container;
# a deployment-level package list is usually the more reliable place for it.
os.system("apt-get update")
# -y answers the confirmation prompt; without it a non-interactive install
# that pulls in dependencies aborts instead of installing anything.
os.system("apt-get install -y poppler-utils")

# Timestamped, level-tagged log lines for every message emitted by this module.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# Function to convert PDF to images or open a single image
def get_images(file_path):
    """Load every page or frame of *file_path* as a list of PIL images.

    Args:
        file_path: Path to a PDF or to a TIFF/PNG/JPEG image. The type is
            chosen from the file extension, case-insensitively.

    Returns:
        A list of images: one per PDF page, or one per frame of an image file
        (so a multi-page TIFF yields all of its pages). An empty list is
        returned when the extension is not supported.
    """
    extension = os.path.splitext(file_path)[-1].lower()

    if extension == ".pdf":
        return convert_from_path(file_path)

    if extension in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        frames = []
        # The context manager releases the file handle; copy() detaches each
        # frame from the file so it stays usable after the handle is closed.
        with Image.open(file_path) as source:
            # n_frames only exists on formats that support several frames.
            for index in range(getattr(source, "n_frames", 1)):
                source.seek(index)
                frames.append(source.copy())
        return frames

    logging.warning(f"Unsupported file type '{extension}' for {file_path}")
    return []

# Function to encode image to base64
def encode_image_to_base64(image):
    """Return *image* serialized as a JPEG and encoded as a base64 string.

    Args:
        image: A PIL image (anything exposing ``mode``, ``convert`` and
            ``save`` in the PIL way).

    Returns:
        The base64 text (UTF-8 ``str``) of the JPEG bytes.
    """
    # Only plain greyscale and RGB are passed through untouched. Every other
    # mode (alpha, palette, high bit depth, CMYK, ...) is normalized to RGB
    # first, because several of them cannot be written as JPEG at all.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def process_files_fixed(image_path, page_identifier, error_pages, *, request_timeout=300):
    """Send one page image to the OpenAI chat-completions API and return its rows.

    The image is base64-encoded, posted together with a fixed extraction
    prompt, and the JSON object found in the model's reply is parsed.

    Args:
        image_path: Path of the image file to submit.
        page_identifier: Label for this page/file, used in log messages and
            appended to ``error_pages`` on any failure.
        error_pages: List that is mutated in place; ``page_identifier`` is
            appended whenever this call fails or yields no records.
        request_timeout: Seconds to wait for the HTTP request before giving
            up, so a stalled connection cannot hang the run.

    Returns:
        The non-empty value stored under the ``"data"`` key of the model's
        JSON reply, or ``[]`` on any failure (missing API key, unreadable
        image, HTTP/network error, unexpected response shape, unparsable or
        empty reply).
    """
    def _fail(message):
        # Every failure path does the same three things: log, record, give up.
        logging.error(message)
        error_pages.append(page_identifier)
        return []

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request can only be rejected; skip the round trip.
        return _fail(f"OPENAI_API_KEY is not set; cannot process page/file {page_identifier}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    try:
        # The context manager closes the file once the bytes have been encoded.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)
    except Exception as e:
        return _fail(f"Failed to process image at {image_path}: {e}")

    # The two example records use identical keys so the model is not nudged
    # into emitting two spellings of the same column.
    prompt = """**Objective:** Extract specific data from a table within an image using OCR.

            **Image Description:** The image contains a table with student information.
            
            **Columns of Interest:**
            
            *   S.No (Serial Number)
            *   Admission No.
            *   Date of Admission
            *   Name of Student
            *   Father's Name
            *   Date of Birth
            *   Telephone No. 
            *   Address
            *   F.CNIC (Father's CNIC) 
            *   S.CNIC (Student's CNIC) - Located under the "REMARKS" column 
            *   M.Name (Mother's Name) - Located under the "REMARKS" column
            
            **Instructions:**
            
            1. **Perform OCR:** Use Optical Character Recognition to extract text from the image. 
            2. **Table Detection:** Identify the table within the image. 
            3. **Column Identification:**
            *   If table headers are present and visible, use them to identify the columns of interest.
            *   If headers are missing or unclear, assume the order of columns as specified above. 
            4. **Data Extraction:**
            *   Extract data from each row of the table for the specified columns only.
            *   Disregard any additional columns present in the table.
            *   **Important:** Extract data from all rows, do not skip any rows. 
            *   For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it. 
            *   For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column. 
            5. **Data Verification:**
            *   Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No." 
            *   Consider using checksums or validation rules based on known formats (e.g., CNIC format).
            
            **Output Format:**
            
            ```json
            {
            "data": [
                {
                    "S_No": "1",
                    "Admission No.": "1604",
                    "Date of Admission": "25-4-17",
                    "Name of Student": "Maham Tariq",
                    "Father's Name": "Tariq Mehman",
                    "Date of Birth": "12-05-12",
                    "Telephone No.": "03125350838",
                    "Address": "Dewan-e-umar Masjid F1014",
                    "F.CNIC": "61101-9729652-7",
                    "S.CNIC": "61101-8018797-4",
                    "M.Name": "Nasira"
                },
                {
                    "S_No": "2",
                        "Admission No.": "1640",
                        "Date of Admission": "05-10-20",
                        "Name of Student": "Areej Jibran",
                        "Father's Name": "M.Jibran",
                        "Date of Birth": "05-04-14",
                        "Telephone No.": "03335173534",
                        "Address": "H#65 st#11 G11/I isb",
                        "F.CNIC": "37405-0393951-3",
                        "S.CNIC": "37405-5642572-3",
                        "M.Name": "Taqdees Jibran"
                }
            ]
            }
            
            """

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        # The API documents image_url as an object carrying
                        # the (data) URL under "url", not as a bare string.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                    }
                ]
            }
        ],
        "max_tokens": 4096
    }

    try:
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=request_timeout,
        )
    except requests.exceptions.RequestException as e:
        return _fail(f"Network or API error when processing page/file {page_identifier}: {e}")

    logging.info(f"Full API response: {response.text}")
    if response.status_code != 200:
        return _fail(f"Error in API call for page/file {page_identifier}: HTTP {response.status_code} - {response.text}")

    try:
        response_content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 reply with an unexpected body must not crash the whole run.
        return _fail(f"Unexpected API response structure for page/file {page_identifier}: {e}")

    if not response_content:
        return _fail(f"No content in JSON response for page/file {page_identifier}")

    # The reply may wrap the JSON in prose or a code fence; keep only the span
    # from the first "{" to the last "}".
    json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
    try:
        json_data = json.loads(json_string)
    except json.JSONDecodeError:
        return _fail(f"JSON parsing error in response for page/file {page_identifier}")

    records = json_data.get("data") if isinstance(json_data, dict) else None
    if not records:
        return _fail(f"No records found in page/file: {page_identifier}")
    return records



def process_pdf_and_generate_csv(file_path):
    """Extract table rows from every page of *file_path* and write them to a CSV.

    Each image returned by ``get_images`` is written to a scratch JPEG, passed
    to ``process_files_fixed``, and the returned records are accumulated.

    Args:
        file_path: Path of the uploaded PDF or image file.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file that is not deleted automatically, or ``None``
        when no records were extracted. ``message`` summarizes which
        pages/files failed.
    """
    error_pages = []  # Pages/files that failed or produced no records
    structured_data = []

    for i, image in enumerate(get_images(file_path), start=1):
        with TemporaryDirectory() as temp_dir:
            image_path = os.path.join(temp_dir, "image.jpg")
            try:
                # The scratch file is a JPEG, which cannot hold alpha or
                # palette data; normalize such pages instead of crashing.
                if image.mode not in ("RGB", "L"):
                    image = image.convert("RGB")
                image.save(image_path)
            except (OSError, ValueError) as e:
                # One unwritable page should not discard the other pages.
                logging.error(f"Failed to write page/file {i} to {image_path}: {e}")
                error_pages.append(i)
                continue
            data = process_files_fixed(image_path, i, error_pages)
            structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Reserve a path that survives this function so Gradio can serve it, but
    # close our own handle before pandas reopens the file by name.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"

def gradio_interface(pdf_file):
    """Gradio callback: turn the uploaded file into a CSV path and a message.

    Args:
        pdf_file: The value Gradio passes for the file input. Handled cases:
            ``None`` (nothing uploaded), an object exposing the path as
            ``.name``, or the path itself.

    Returns:
        A ``(csv_path_or_None, message)`` tuple matching the interface's two
        outputs.
    """
    if pdf_file is None:
        # Submitting without an upload would otherwise raise AttributeError.
        return None, "No file uploaded. Please upload a PDF or image file."

    # NOTE(review): depending on the Gradio version the value may be a
    # tempfile-like object or a plain path string; this accepts both.
    file_path = getattr(pdf_file, "name", pdf_file)
    result_csv, message = process_pdf_and_generate_csv(file_path)
    return (result_csv or None), message

# Describe the UI pieces first, then wire them to the callback.
upload_input = gr.File(label="Please upload your PDF file")
csv_output = gr.File(label="Download the generated CSV file")
message_output = gr.Textbox(label="Messages")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=upload_input,
    outputs=[csv_output, message_output],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

# Enable the request queue, then start the server without a public share link.
queued_app = iface.queue()
queued_app.launch(share=False)
#