"""Gradio app that turns a scanned student-register table into a CSV file.

An uploaded PDF (or single image) is split into page images, each page is
sent to the OpenAI chat-completions endpoint together with an extraction
prompt, and the JSON records returned for every page are concatenated into
one CSV that Gradio offers for download.
"""

import base64
import json
import logging
import os
import re
from io import BytesIO
from tempfile import TemporaryDirectory, NamedTemporaryFile

import gradio as gr
import pandas as pd
import requests
from pdf2image import convert_from_path
from PIL import Image

# pdf2image shells out to poppler, so the binaries are installed at start-up.
# NOTE(review): there is no "-y" flag, so apt-get may stop at its confirmation
# prompt when run non-interactively -- confirm this works in the deployment image.
os.system("apt-get update")
os.system("apt-get install poppler-utils")

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"

# The model can be overridden through the environment without editing the code.
# NOTE(review): confirm the default model is still served by the API.
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4-vision-preview")

# Seconds to wait for the API before giving up; without a timeout a stalled
# connection would block the Gradio worker indefinitely.
REQUEST_TIMEOUT_SECONDS = 300

# Image modes Pillow's JPEG writer accepts as-is; anything else is converted.
_JPEG_WRITABLE_MODES = frozenset({"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"})

EXTRACTION_PROMPT = """**Objective:** Extract specific data from a table within an image using OCR.

**Image Description:** The image contains a table with student information.

**Columns of Interest:**
* S.No (Serial Number)
* Admission No.
* Date of Admission
* Name of Student
* Father's Name
* Date of Birth
* Telephone No.
* Address
* F.CNIC (Father's CNIC)
* S.CNIC (Student's CNIC) - Located under the "REMARKS" column
* M.Name (Mother's Name) - Located under the "REMARKS" column

**Instructions:**
1. **Perform OCR:** Use Optical Character Recognition to extract text from the image.
2. **Table Detection:** Identify the table within the image.
3. **Column Identification:**
    * If table headers are present and visible, use them to identify the columns of interest.
    * If headers are missing or unclear, assume the order of columns as specified above.
4. **Data Extraction:**
    * Extract data from each row of the table for the specified columns only.
    * Disregard any additional columns present in the table.
    * **Important:** Extract data from all rows, do not skip any rows.
    * For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it.
    * For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column.
5. **Data Verification:**
    * Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No."
    * Consider using checksums or validation rules based on known formats (e.g., CNIC format).

**Output Format:**
```json
{
  "data": [
    {
      "S_No": "1",
      "Admission No.": "1604",
      "Date of Admission": "25-4-17",
      "Name of Student": "Maham Tariq",
      "Father's Name": "Tariq Mehman",
      "Date of Birth": "12-05-12",
      "Telephone No.": "03125350838",
      "Address": "Dewan-e-umar Masjid F1014",
      "F.CNIC": "61101-9729652-7",
      "S.CNIC": "61101-8018797-4",
      "M.Name": "Nasira"
    },
    {
      "S_No": "2",
      "Admission No.": "1640",
      "Date of Admission": "05-10-20",
      "Name of Student": "Areej Jibran",
      "Father's Name": "M.Jibran",
      "Date of Birth": "05-04-14",
      "Telephone No.": "03335173534",
      "Address": "H#65 st#11 G11/I isb",
      "F.CNIC": "37405-0393951-3",
      "S.CNIC": "37405-5642572-3",
      "M.Name": "Taqdees Jibran"
    }
  ]
}
```
"""


def _to_jpeg_compatible(image):
    """Return *image* in a mode the JPEG writer accepts, converting to RGB if needed."""
    if image.mode in _JPEG_WRITABLE_MODES:
        return image
    return image.convert("RGB")


def _record_failure(error_pages, page_identifier, message):
    """Log *message*, remember the failing page, and return the empty result."""
    logging.error(message)
    error_pages.append(page_identifier)
    return []


# Function to convert PDF to images or open a single image
def get_images(file_path):
    """Load the pages of *file_path* as PIL images.

    A ``.pdf`` is rendered page by page through pdf2image; a TIFF, PNG or
    JPEG file yields a one-element list. Any other extension yields an
    empty list (and a warning in the log).
    """
    extension = os.path.splitext(file_path)[-1].lower()
    if extension == ".pdf":
        return convert_from_path(file_path)
    if extension in [".tiff", ".tif", ".png", ".jpg", ".jpeg"]:
        return [Image.open(file_path)]
    logging.warning(f"Unsupported file extension '{extension}' for {file_path}")
    return []


# Function to encode image to base64
def encode_image_to_base64(image):
    """Return *image* encoded as JPEG and wrapped in a base64 ASCII string."""
    # JPEG cannot store palette or alpha modes, so convert those first.
    image = _to_jpeg_compatible(image)
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def process_files_fixed(image_path, page_identifier, error_pages):
    """Send one page image to the OpenAI API and return the extracted rows.

    Args:
        image_path: Path of the image file to read and upload.
        page_identifier: Label for this page; appended to *error_pages*
            whenever the page yields no usable records.
        error_pages: List that is mutated in place to collect failing pages.

    Returns:
        The non-empty list found under the ``"data"`` key of the model's
        JSON reply, or ``[]`` on any failure (unreadable image, missing API
        key, network/HTTP error, malformed or empty reply).
    """
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request can only be rejected; skip the round trip.
        return _record_failure(
            error_pages, page_identifier,
            f"OPENAI_API_KEY is not set; cannot process page/file {page_identifier}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    try:
        # The context manager releases the file handle as soon as it is encoded.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)
    except Exception as e:
        return _record_failure(
            error_pages, page_identifier,
            f"Failed to process image at {image_path}: {e}")

    payload = {
        "model": OPENAI_MODEL,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": EXTRACTION_PROMPT},
                    {
                        "type": "image_url",
                        # The API documents image_url as an object with a "url" field.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                    }
                ]
            }
        ],
        "max_tokens": 4096
    }

    try:
        response = requests.post(
            OPENAI_CHAT_URL, headers=headers, json=payload,
            timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as e:
        return _record_failure(
            error_pages, page_identifier,
            f"Network or API error when processing page/file {page_identifier}: {e}")

    logging.info(f"Full API response: {response.text}")

    if response.status_code != 200:
        return _record_failure(
            error_pages, page_identifier,
            f"Error in API call for page/file {page_identifier}: HTTP {response.status_code} - {response.text}")

    try:
        response_content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # A 200 reply whose body is not the expected chat-completion shape.
        return _record_failure(
            error_pages, page_identifier,
            f"Unexpected API response structure for page/file {page_identifier}: {e}")

    if not response_content:
        return _record_failure(
            error_pages, page_identifier,
            f"No content in JSON response for page/file {page_identifier}")

    # The model may wrap the JSON in prose or a code fence; keep only the
    # outermost {...} span.
    json_string = response_content[response_content.find("{"):response_content.rfind("}") + 1]
    try:
        json_data = json.loads(json_string)
    except json.JSONDecodeError:
        return _record_failure(
            error_pages, page_identifier,
            f"JSON parsing error in response for page/file {page_identifier}")

    records = json_data.get("data") if isinstance(json_data, dict) else None
    if not records or not isinstance(records, list):
        return _record_failure(
            error_pages, page_identifier,
            f"No records found in page/file: {page_identifier}")
    return records


def process_pdf_and_generate_csv(file_path):
    """Extract table rows from every page of *file_path* and write them to a CSV.

    Returns:
        ``(csv_path, message)`` when at least one record was extracted, where
        *message* lists the pages that failed; ``(None, message)`` otherwise.
    """
    error_pages = []  # Initialize the list to track error pages or files

    try:
        images = get_images(file_path)
    except Exception as e:
        # Top-level boundary: report loader failures to the UI instead of crashing.
        logging.exception(f"Failed to load {file_path}")
        return None, f"No data to save or an error occurred. ({e})"

    structured_data = []
    with TemporaryDirectory() as temp_dir:
        for i, image in enumerate(images, start=1):
            image_path = os.path.join(temp_dir, "image.jpg")
            try:
                # Palette/alpha pages (e.g. PNG uploads) cannot be written as JPEG directly.
                _to_jpeg_compatible(image).save(image_path)
            except (OSError, ValueError) as e:
                logging.error(f"Failed to save page/file {i} to {image_path}: {e}")
                error_pages.append(i)
                continue
            data = process_files_fixed(image_path, i, error_pages)
            structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Save to a temporary file to return through Gradio. Only the name is
    # needed, so the handle is closed before pandas writes to the path.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"


def gradio_interface(pdf_file):
    """Gradio callback: run the extraction on the uploaded file.

    *pdf_file* may be a file-like object exposing ``.name``, a plain path
    string, or ``None`` when nothing was uploaded.
    """
    if pdf_file is None:
        return None, "No file was uploaded."
    file_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    return process_pdf_and_generate_csv(file_path)


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(label="Please upload your PDF file"),
    outputs=[
        gr.File(label="Download the generated CSV file"),
        gr.Textbox(label="Messages"),
    ],
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

iface.queue().launch(share=False)