# danial0203 - "Update app.py" (commit 0bad17c, verified)
import base64
import json
import logging
import os
import re
import shutil
import subprocess
from io import BytesIO
from tempfile import TemporaryDirectory, NamedTemporaryFile

import gradio as gr
import pandas as pd
import requests
from pdf2image import convert_from_path
from PIL import Image
# Configure logging first so the dependency bootstrap below can report problems.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _ensure_poppler_installed():
    """Best-effort install of poppler-utils, which pdf2image shells out to.

    Does nothing when ``pdftoppm`` is already on PATH. Otherwise runs
    ``apt-get update`` followed by ``apt-get install -y poppler-utils``.
    Failures are logged rather than raised so the app can still start and
    handle plain image uploads.

    NOTE(review): installing system packages at start-up needs root and
    network access; where the host supports declaring system dependencies
    up front, prefer that - TODO confirm for the deployment target.
    """
    if shutil.which("pdftoppm"):
        return

    commands = (
        ["apt-get", "update"],
        # -y: never stop at the interactive confirmation prompt.
        ["apt-get", "install", "-y", "poppler-utils"],
    )
    for command in commands:
        try:
            completed = subprocess.run(command, check=False)
        except OSError as exc:
            # apt-get itself is unavailable; the next command would fail too.
            logging.warning("Could not run %s: %s", " ".join(command), exc)
            return
        if completed.returncode != 0:
            logging.warning(
                "%s exited with status %d", " ".join(command), completed.returncode
            )


_ensure_poppler_installed()
def get_images(file_path):
    """Load the pages of a PDF, or the frame(s) of an image file, as PIL images.

    Args:
        file_path: Path to a ``.pdf``, ``.tiff``/``.tif``, ``.png``,
            ``.jpg`` or ``.jpeg`` file. The extension check is
            case-insensitive.

    Returns:
        A list of images: one per PDF page (as produced by
        ``convert_from_path``), or one per frame of the image file, so a
        multi-page TIFF yields every page. An unsupported extension yields
        an empty list.
    """
    extension = os.path.splitext(file_path)[-1].lower()

    if extension == ".pdf":
        return convert_from_path(file_path)

    if extension in {".tiff", ".tif", ".png", ".jpg", ".jpeg"}:
        frames = []
        # Image.open is lazy and keeps the file open; copy each frame into
        # memory and let the context manager release the handle.
        with Image.open(file_path) as opened:
            # Formats without multi-frame support expose no n_frames.
            for frame_index in range(getattr(opened, "n_frames", 1)):
                opened.seek(frame_index)
                frames.append(opened.copy())
        return frames

    logging.warning("Unsupported file type %r for %s", extension, file_path)
    return []
def encode_image_to_base64(image):
    """Return *image* encoded as JPEG and base64, as a UTF-8 string.

    Args:
        image: A PIL image (any object exposing ``mode``, ``convert`` and
            ``save`` in the same way).

    Returns:
        The base64 text of the JPEG bytes, without a ``data:`` prefix.
    """
    # Only greyscale and RGB are passed through unchanged; palette, alpha
    # and other modes are converted to RGB so the JPEG save cannot fail on
    # an unsupported mode.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
_OPENAI_CHAT_COMPLETIONS_URL = "https://api.openai.com/v1/chat/completions"

# Upper bound on one API round-trip so a stalled connection cannot hang the app.
_OPENAI_REQUEST_TIMEOUT_SECONDS = 120

# Instructions sent with every page image. The example records use identical
# keys so the model is not nudged into emitting two spellings of one column.
_TABLE_EXTRACTION_PROMPT = """**Objective:** Extract specific data from a table within an image using OCR.
**Image Description:** The image contains a table with student information.
**Columns of Interest:**
* S.No (Serial Number)
* Admission No.
* Date of Admission
* Name of Student
* Father's Name
* Date of Birth
* Telephone No.
* Address
* F.CNIC (Father's CNIC)
* S.CNIC (Student's CNIC) - Located under the "REMARKS" column
* M.Name (Mother's Name) - Located under the "REMARKS" column
**Instructions:**
1. **Perform OCR:** Use Optical Character Recognition to extract text from the image.
2. **Table Detection:** Identify the table within the image.
3. **Column Identification:**
* If table headers are present and visible, use them to identify the columns of interest.
* If headers are missing or unclear, assume the order of columns as specified above.
4. **Data Extraction:**
* Extract data from each row of the table for the specified columns only.
* Disregard any additional columns present in the table.
* **Important:** Extract data from all rows, do not skip any rows.
* For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it.
* For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column.
5. **Data Verification:**
* Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No."
* Consider using checksums or validation rules based on known formats (e.g., CNIC format).
**Output Format:**
```json
{
"data": [
{
"S_No": "1",
"Admission No.": "1604",
"Date of Admission": "25-4-17",
"Name of Student": "Maham Tariq",
"Father's Name": "Tariq Mehman",
"Date of Birth": "12-05-12",
"Telephone No.": "03125350838",
"Address": "Dewan-e-umar Masjid F1014",
"F.CNIC": "61101-9729652-7",
"S.CNIC": "61101-8018797-4",
"M.Name": "Nasira"
},
{
"S_No": "2",
"Admission No.": "1640",
"Date of Admission": "05-10-20",
"Name of Student": "Areej Jibran",
"Father's Name": "M.Jibran",
"Date of Birth": "05-04-14",
"Telephone No.": "03335173534",
"Address": "H#65 st#11 G11/I isb",
"F.CNIC": "37405-0393951-3",
"S.CNIC": "37405-5642572-3",
"M.Name": "Taqdees Jibran"
}
]
}
"""


def process_files_fixed(image_path, page_identifier, error_pages):
    """Send one page image to the OpenAI chat API and return its table rows.

    Args:
        image_path: Path of the image file to read and upload.
        page_identifier: Label for this page/file, used in log messages and
            appended to ``error_pages`` on any failure.
        error_pages: List that is mutated in place; receives
            ``page_identifier`` whenever no records could be produced.

    Returns:
        The non-empty list found under the ``"data"`` key of the JSON object
        in the model's reply, or ``[]`` on any failure (missing API key,
        unreadable image, network/HTTP error, unexpected or unparsable
        response, no records).
    """

    def fail(message):
        # Every failure path logs, records the page, and yields no rows.
        logging.error(message)
        error_pages.append(page_identifier)
        return []

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request could only be rejected; skip the upload.
        return fail(f"OPENAI_API_KEY is not set; skipping page/file {page_identifier}")

    try:
        # The context manager closes the file once the bytes are encoded.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)
    except Exception as e:
        # Deliberately broad: any unreadable image should only fail this page.
        return fail(f"Failed to process image at {image_path}: {e}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        # NOTE(review): this is a preview model name - confirm it is still served.
        "model": "gpt-4-vision-preview",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _TABLE_EXTRACTION_PROMPT},
                    {
                        "type": "image_url",
                        # The API documents image_url as an object with a "url" field.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
        "max_tokens": 4096,
    }

    try:
        response = requests.post(
            _OPENAI_CHAT_COMPLETIONS_URL,
            headers=headers,
            json=payload,
            timeout=_OPENAI_REQUEST_TIMEOUT_SECONDS,
        )
    except requests.exceptions.RequestException as e:
        return fail(f"Network or API error when processing page/file {page_identifier}: {e}")

    logging.info(f"Full API response: {response.text}")
    if response.status_code != 200:
        return fail(
            f"Error in API call for page/file {page_identifier}: "
            f"HTTP {response.status_code} - {response.text}"
        )

    try:
        response_content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 reply that is not JSON or lacks the expected fields.
        return fail(f"Unexpected API response structure for page/file {page_identifier}: {e}")

    if not response_content or not isinstance(response_content, str):
        return fail(f"No content in JSON response for page/file {page_identifier}")

    # The reply may wrap the JSON in prose or a code fence; keep only the span
    # from the first "{" to the last "}".
    json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
    try:
        json_data = json.loads(json_string)
    except json.JSONDecodeError:
        return fail(f"JSON parsing error in response for page/file {page_identifier}")

    records = json_data.get("data") if isinstance(json_data, dict) else None
    if isinstance(records, list) and records:
        return records
    return fail(f"No records found in page/file: {page_identifier}")
def process_pdf_and_generate_csv(file_path):
    """Extract table rows from every page of *file_path* and write them to a CSV.

    Args:
        file_path: Path of the uploaded PDF or image file.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file that is left on disk for the caller, or
        ``None`` when the file could not be read or no rows were extracted.
        ``message`` summarises which pages failed or yielded no records.
    """
    try:
        images = get_images(file_path)
    except Exception:
        # Boundary with the PDF/image loaders: report the failure instead of
        # crashing the request, and keep the traceback in the log.
        logging.exception("Could not read input file %s", file_path)
        return None, "No data to save or an error occurred."

    error_pages = []  # Pages/files that failed or produced no records
    structured_data = []
    with TemporaryDirectory() as temp_dir:
        for i, image in enumerate(images, start=1):
            image_path = os.path.join(temp_dir, f"page_{i}.jpg")
            try:
                # Palette, alpha and similar modes cannot be written as JPEG
                # directly, so convert them to RGB first.
                if image.mode not in ("RGB", "L"):
                    image = image.convert("RGB")
                image.save(image_path)
            except (OSError, ValueError) as e:
                logging.error(f"Failed to save page/file {i} as JPEG: {e}")
                error_pages.append(i)
                continue
            data = process_files_fixed(image_path, i, error_pages)
            structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Reserve a path that outlives this function so Gradio can serve it, and
    # close the handle before pandas reopens the file by name.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"
def gradio_interface(pdf_file):
    """Gradio callback: run the extraction on the uploaded file.

    Args:
        pdf_file: The value Gradio passes for the file input. An object with
            a ``name`` attribute holding the file path is expected; a plain
            path string is accepted as well, and ``None`` (nothing uploaded)
            is handled.

    Returns:
        A ``(csv_path_or_None, message)`` tuple for the file and text outputs.
    """
    if pdf_file is None:
        # Submitting without an upload would otherwise fail on ``.name``.
        return None, "Please upload a file before submitting."

    file_path = getattr(pdf_file, "name", pdf_file)
    result_csv, message = process_pdf_and_generate_csv(file_path)
    return (result_csv or None), message
# UI wiring: one file upload in, a downloadable CSV plus a status message out.
_upload_input = gr.File(label="Please upload your PDF file")
_csv_output = gr.File(label="Download the generated CSV file")
_message_output = gr.Textbox(label="Messages")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_upload_input,
    outputs=[_csv_output, _message_output],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

# Turn on the request queue, then start the server with sharing disabled.
iface.queue().launch(share=False)
#