# Spaces: Runtime error
import gradio as gr | |
from tempfile import TemporaryDirectory, NamedTemporaryFile | |
from pdf2image import convert_from_path | |
from PIL import Image | |
import os | |
from io import BytesIO | |
import base64 | |
import requests | |
import pandas as pd | |
import json | |
import logging | |
import re | |
# Configure logging first so that failures of the setup commands below are reported.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# convert_from_path (pdf2image) needs the poppler command-line tools, so try to
# install them at start-up. `-y` answers apt-get's confirmation prompt; without
# it the install stops at the prompt when there is no interactive terminal.
# NOTE(review): these commands need root; on hosts where the app runs
# unprivileged they fail and poppler must be provisioned another way - confirm.
for _setup_command in ("apt-get update", "apt-get install -y poppler-utils"):
    if os.system(_setup_command) != 0:
        logging.warning("Setup command failed: %s", _setup_command)
def get_images(file_path):
    """Load the pages/images contained in *file_path*.

    A ``.pdf`` file is rendered with ``convert_from_path`` and that result is
    returned as-is. A ``.tiff``/``.tif``/``.png``/``.jpg``/``.jpeg`` file is
    opened with ``Image.open`` and returned as a one-element list. Any other
    extension yields an empty list. The extension check is case-insensitive.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    if suffix == ".pdf":
        return convert_from_path(file_path)

    if suffix in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        return [Image.open(file_path)]

    return []
def encode_image_to_base64(image):
    """Return *image* encoded as a base64 JPEG string (UTF-8 text, no data-URL prefix).

    Args:
        image: A PIL-style image exposing ``mode``, ``convert`` and ``save``.

    Returns:
        The base64 text of the JPEG bytes.
    """
    # JPEG can only store a handful of pixel modes. Anything else (palette,
    # alpha, 16/32-bit integer, float, ...) is converted to RGB first; the
    # original code only handled "P" and "RGBA" and failed on e.g. "LA".
    jpeg_writable_modes = ("1", "L", "RGB", "CMYK", "YCbCr")
    if image.mode not in jpeg_writable_modes:
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
# Prompt sent with every page image. The example output uses one spelling per
# key ("F.CNIC" in both records) so the model is not nudged into emitting two
# differently named columns for the same field.
_EXTRACTION_PROMPT = """**Objective:** Extract specific data from a table within an image using OCR.
**Image Description:** The image contains a table with student information.
**Columns of Interest:**
* S.No (Serial Number)
* Admission No.
* Date of Admission
* Name of Student
* Father's Name
* Date of Birth
* Telephone No.
* Address
* F.CNIC (Father's CNIC)
* S.CNIC (Student's CNIC) - Located under the "REMARKS" column
* M.Name (Mother's Name) - Located under the "REMARKS" column
**Instructions:**
1. **Perform OCR:** Use Optical Character Recognition to extract text from the image.
2. **Table Detection:** Identify the table within the image.
3. **Column Identification:**
* If table headers are present and visible, use them to identify the columns of interest.
* If headers are missing or unclear, assume the order of columns as specified above.
4. **Data Extraction:**
* Extract data from each row of the table for the specified columns only.
* Disregard any additional columns present in the table.
* **Important:** Extract data from all rows, do not skip any rows.
* For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it.
* For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column.
5. **Data Verification:**
* Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No."
* Consider using checksums or validation rules based on known formats (e.g., CNIC format).
**Output Format:**
```json
{
"data": [
{
"S_No": "1",
"Admission No.": "1604",
"Date of Admission": "25-4-17",
"Name of Student": "Maham Tariq",
"Father's Name": "Tariq Mehman",
"Date of Birth": "12-05-12",
"Telephone No.": "03125350838",
"Address": "Dewan-e-umar Masjid F1014",
"F.CNIC": "61101-9729652-7",
"S.CNIC": "61101-8018797-4",
"M.Name": "Nasira"
},
{
"S_No": "2",
"Admission No.": "1640",
"Date of Admission": "05-10-20",
"Name of Student": "Areej Jibran",
"Father's Name": "M.Jibran",
"Date of Birth": "05-04-14",
"Telephone No.": "03335173534",
"Address": "H#65 st#11 G11/I isb",
"F.CNIC": "37405-0393951-3",
"S.CNIC": "37405-5642572-3",
"M.Name": "Taqdees Jibran"
}
]
}
```
"""

_OPENAI_CHAT_COMPLETIONS_URL = "https://api.openai.com/v1/chat/completions"


def _extract_records(response_content):
    """Return the list stored under "data" in the JSON object embedded in the text.

    The text between the first "{" and the last "}" is parsed as JSON, which
    tolerates surrounding prose or code fences. A missing or non-list "data"
    value yields an empty list.

    Raises:
        ValueError: If no JSON object can be located or parsed
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    start = response_content.find("{")
    end = response_content.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in response content")

    parsed = json.loads(response_content[start:end + 1])
    records = parsed.get("data")
    # Callers extend a list of row dicts with this value, so anything that is
    # not a list is treated as "no records".
    return records if isinstance(records, list) else []


def process_files_fixed(image_path, page_identifier, error_pages, *,
                        model="gpt-4-vision-preview", timeout=120):
    """Send one page image to the OpenAI chat completions API and return the extracted rows.

    Args:
        image_path: Path of the image file to submit.
        page_identifier: Label used in log messages and appended to
            *error_pages* when this page fails or yields no records.
        error_pages: List mutated in place; receives *page_identifier* on
            every failure path.
        model: Model name placed in the request payload.
            NOTE(review): confirm the default model is still served.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        The list found under "data" in the model's JSON answer, or ``[]`` on
        any failure (the failure is logged and recorded in *error_pages*).
    """

    def _record_failure(message):
        # Every failure path does the same three things: log, record, return [].
        logging.error(message)
        error_pages.append(page_identifier)
        return []

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request would go out as "Bearer None" and be rejected.
        return _record_failure(
            f"OPENAI_API_KEY is not set; cannot process page/file {page_identifier}"
        )

    try:
        # The context manager closes the underlying file once encoding is done.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)
    except Exception as e:
        # Best-effort boundary: any failure to read or encode skips this page.
        return _record_failure(f"Failed to process image at {image_path}: {e}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _EXTRACTION_PROMPT},
                    {
                        "type": "image_url",
                        # The API documents image_url as an object with a "url" field.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
        "max_tokens": 4096,
    }

    try:
        response = requests.post(
            _OPENAI_CHAT_COMPLETIONS_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.exceptions.RequestException as e:
        return _record_failure(
            f"Network or API error when processing page/file {page_identifier}: {e}"
        )

    # NOTE(review): this logs the whole response body, which includes the
    # extracted personal data - confirm that is acceptable at INFO level.
    logging.info(f"Full API response: {response.text}")

    if response.status_code != 200:
        return _record_failure(
            f"Error in API call for page/file {page_identifier}: "
            f"HTTP {response.status_code} - {response.text}"
        )

    try:
        response_content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 response whose body is not the expected shape must not crash the run.
        return _record_failure(
            f"Unexpected API response structure for page/file {page_identifier}: {e}"
        )

    if not response_content:
        return _record_failure(f"No content in JSON response for page/file {page_identifier}")

    try:
        records = _extract_records(response_content)
    except ValueError:
        return _record_failure(f"JSON parsing error in response for page/file {page_identifier}")

    if not records:
        return _record_failure(f"No records found in page/file: {page_identifier}")

    return records
def process_pdf_and_generate_csv(file_path):
    """Extract table rows from every page of *file_path* and write them to a CSV file.

    Each image returned by ``get_images`` is saved as a temporary JPEG and
    passed to ``process_files_fixed``; pages are numbered from 1.

    Args:
        file_path: Path of the uploaded PDF or image file.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file (not deleted automatically), or ``None`` when
        nothing was extracted or the input could not be loaded.
    """
    error_pages = []  # Page numbers that failed or produced no records.

    try:
        images = get_images(file_path)
    except Exception as e:
        # Best-effort boundary: report a broken upload instead of raising.
        logging.error(f"Failed to load images from {file_path}: {e}")
        return None, f"Could not read the uploaded file: {e}"

    # Modes that can be written to a .jpg file without conversion.
    jpeg_writable_modes = ("1", "L", "RGB", "CMYK", "YCbCr")
    structured_data = []

    with TemporaryDirectory() as temp_dir:
        # One scratch file is reused; each page overwrites the previous one.
        image_path = os.path.join(temp_dir, "image.jpg")
        for page_number, image in enumerate(images, start=1):
            try:
                # Saving e.g. an RGBA or palette image straight to .jpg raises,
                # which previously aborted the whole document.
                if image.mode not in jpeg_writable_modes:
                    image = image.convert("RGB")
                image.save(image_path)
            except (OSError, ValueError) as e:
                logging.error(f"Failed to save page/file {page_number} as JPEG: {e}")
                error_pages.append(page_number)
                continue

            rows = process_files_fixed(image_path, page_number, error_pages)
            structured_data.extend(rows or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Reserve a path that outlives this function so Gradio can serve it, and
    # close the handle before pandas writes to that path.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"
def gradio_interface(pdf_file):
    """Gradio callback: run the extraction on the uploaded file.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input. This may be
            ``None`` (nothing uploaded), a path string, or an object whose
            ``name`` attribute holds the path.

    Returns:
        A ``(csv_path_or_None, message)`` tuple matching the interface outputs.
    """
    if pdf_file is None:
        # Submitting without a file must not fail on the ``.name`` lookup.
        return None, "Please upload a file first."

    if isinstance(pdf_file, (str, os.PathLike)):
        file_path = os.fspath(pdf_file)
    else:
        file_path = pdf_file.name

    result_csv, message = process_pdf_and_generate_csv(file_path)
    return (result_csv or None), message
# Build the UI: one file upload in, the generated CSV plus a status message out.
upload_input = gr.File(label="Please upload your PDF file")
csv_output = gr.File(label="Download the generated CSV file")
messages_output = gr.Textbox(label="Messages")

iface = gr.Interface(
    fn=gradio_interface,
    inputs=upload_input,
    outputs=[csv_output, messages_output],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

# Enable the request queue, then start the server without a public share link.
queued_app = iface.queue()
queued_app.launch(share=False)
# |