File size: 8,748 Bytes
7df83cd
50693de
52c66f2
 
 
 
 
 
 
 
86f1d14
2e61138
86f1d14
52c66f2
 
 
 
af02a05
6e44813
 
52c66f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e44813
52c66f2
6e44813
52c66f2
 
 
 
 
6e44813
 
 
 
 
 
 
52c66f2
0bad17c
8383642
0bad17c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cbb4c0
0bad17c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cbb4c0
0bad17c
 
 
 
8383642
1cbb4c0
52c66f2
 
f6ae243
52c66f2
 
 
 
 
 
 
 
 
f6ae243
6e44813
52c66f2
 
 
 
6e44813
52c66f2
 
e6560fa
 
 
 
8383642
 
 
e6560fa
8383642
 
e6560fa
 
 
 
 
 
 
 
52c66f2
 
e6560fa
8383642
52c66f2
 
 
e6560fa
52c66f2
 
e6560fa
 
52c66f2
 
 
8383642
 
52c66f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9558ac4
 
52c66f2
 
 
 
6c3cc5a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import gradio as gr
from tempfile import TemporaryDirectory, NamedTemporaryFile
from pdf2image import convert_from_path
from PIL import Image
import os
from io import BytesIO
import base64
import requests
import pandas as pd
import json
import logging
import re


# Configure logging first so that problems during the system setup below are reported.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Install poppler-utils at start-up (the PDF conversion library imported above
# relies on poppler's command-line tools). "-y" answers apt's confirmation
# prompt; without it the install aborts when there is no interactive terminal.
for _setup_command in ("apt-get update", "apt-get install -y poppler-utils"):
    _setup_status = os.system(_setup_command)
    if _setup_status != 0:
        # Best effort: keep starting up, but leave a trace explaining later PDF failures.
        logging.warning(
            "Setup command %r exited with status %s; PDF conversion may be unavailable.",
            _setup_command,
            _setup_status,
        )


# Function to convert a PDF to page images, or to load every page of an image file
def get_images(file_path):
    """Return the pages contained in *file_path* as a list of PIL images.

    Args:
        file_path: Path to a ``.pdf``, ``.tiff``/``.tif``, ``.png``, ``.jpg``
            or ``.jpeg`` file. The extension check is case-insensitive.

    Returns:
        One image per PDF page, one image per frame of a (possibly multi-page)
        TIFF, a single image for the other image formats, or an empty list
        when the extension is not supported.
    """
    extension = os.path.splitext(file_path)[-1].lower()
    if extension == ".pdf":
        return convert_from_path(file_path)
    if extension not in (".tiff", ".tif", ".png", ".jpg", ".jpeg"):
        return []

    images = []
    # The context manager closes the underlying file; copy() forces the pixel
    # data to be loaded so the returned images remain usable afterwards.
    with Image.open(file_path) as source:
        # Scanned documents are often multi-page TIFFs: return every frame,
        # not just the first one.
        frame_count = getattr(source, "n_frames", 1) if extension in (".tiff", ".tif") else 1
        for frame_index in range(frame_count):
            source.seek(frame_index)
            images.append(source.copy())
    return images

# Function to encode image to base64
def encode_image_to_base64(image):
    """Return *image* encoded as JPEG and wrapped in a base64 text string.

    Args:
        image: A PIL image in any mode.

    Returns:
        The base64 representation (``str``) of the JPEG-encoded image.
    """
    # JPEG cannot store palette, alpha or high-bit-depth modes ("P", "RGBA",
    # "LA", "I", "F", ...). Anything that is not plain RGB or greyscale is
    # converted to RGB first instead of failing inside save().
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


_OPENAI_CHAT_COMPLETIONS_URL = "https://api.openai.com/v1/chat/completions"

# Instructions sent with every page image. The example output uses the same key
# names in every record so that the resulting CSV gets one column per field.
_EXTRACTION_PROMPT = """**Objective:** Extract specific data from a table within an image using OCR.

            **Image Description:** The image contains a table with student information.
            
            **Columns of Interest:**
            
            *   S.No (Serial Number)
            *   Admission No.
            *   Date of Admission
            *   Name of Student
            *   Father's Name
            *   Date of Birth
            *   Telephone No. 
            *   Address
            *   F.CNIC (Father's CNIC) 
            *   S.CNIC (Student's CNIC) - Located under the "REMARKS" column 
            *   M.Name (Mother's Name) - Located under the "REMARKS" column
            
            **Instructions:**
            
            1. **Perform OCR:** Use Optical Character Recognition to extract text from the image. 
            2. **Table Detection:** Identify the table within the image. 
            3. **Column Identification:**
            *   If table headers are present and visible, use them to identify the columns of interest.
            *   If headers are missing or unclear, assume the order of columns as specified above. 
            4. **Data Extraction:**
            *   Extract data from each row of the table for the specified columns only.
            *   Disregard any additional columns present in the table.
            *   **Important:** Extract data from all rows, do not skip any rows. 
            *   For "Telephone No.", focus on the number itself and ignore any labels like "office" or "residence" associated with it. 
            *   For "F.CNIC", "S.CNIC", and "M.Name", extract this information from the "REMARKS" column. 
            5. **Data Verification:**
            *   Implement checks to ensure the accuracy of extracted data, especially for numerical values like "S.No" and "Telephone No." 
            *   Consider using checksums or validation rules based on known formats (e.g., CNIC format).
            
            **Output Format:**
            
            ```json
            {
            "data": [
                {
                    "S_No": "1",
                    "Admission No.": "1604",
                    "Date of Admission": "25-4-17",
                    "Name of Student": "Maham Tariq",
                    "Father's Name": "Tariq Mehman",
                    "Date of Birth": "12-05-12",
                    "Telephone No.": "03125350838",
                    "Address": "Dewan-e-umar Masjid F1014",
                    "F.CNIC": "61101-9729652-7",
                    "S.CNIC": "61101-8018797-4",
                    "M.Name": "Nasira"
                },
                {
                    "S_No": "2",
                        "Admission No.": "1640",
                        "Date of Admission": "05-10-20",
                        "Name of Student": "Areej Jibran",
                        "Father's Name": "M.Jibran",
                        "Date of Birth": "05-04-14",
                        "Telephone No.": "03335173534",
                        "Address": "H#65 st#11 G11/I isb",
                        "F.CNIC": "37405-0393951-3",
                        "S.CNIC": "37405-5642572-3",
                        "M.Name": "Taqdees Jibran"
                }
            ]
            }
            ```
            
            """


def _extract_records(response_content):
    """Return the record dicts found under ``"data"`` in the model's reply.

    The reply may surround the JSON object with prose or a code fence, so only
    the text between the first ``{`` and the last ``}`` is parsed.

    Raises:
        json.JSONDecodeError: If that span is missing or is not valid JSON.
    """
    json_string = response_content[response_content.find("{"): response_content.rfind("}") + 1]
    json_data = json.loads(json_string)
    records = json_data.get("data") if isinstance(json_data, dict) else None
    if not isinstance(records, list):
        return []
    # Only dict rows can become CSV rows; drop anything else the model invented.
    return [record for record in records if isinstance(record, dict)]


def process_files_fixed(image_path, page_identifier, error_pages, *,
                        model="gpt-4-vision-preview", timeout=300):
    """Send one page image to the OpenAI chat-completions API and return its table rows.

    Every failure (unreadable image, missing API key, network error, non-200
    status, unexpected or unparsable reply, no records) is logged, recorded in
    *error_pages*, and reported as an empty list; no exception is propagated
    for these cases.

    Args:
        image_path: Path of the image file to analyse.
        page_identifier: Label used in log messages and appended to
            *error_pages* on failure.
        error_pages: List that is mutated in place to collect the identifiers
            of pages that failed or produced no records.
        model: Name of the vision-capable model to request.
            NOTE(review): the default is a preview model that OpenAI may have
            retired; confirm and pass a current model name if so.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of dicts, one per extracted table row, or ``[]`` on failure.
    """

    def fail(message):
        # Single place for the log / record / empty-result failure pattern.
        logging.error(message)
        error_pages.append(page_identifier)
        return []

    try:
        # Close the file as soon as the page has been encoded.
        with Image.open(image_path) as image:
            base64_image = encode_image_to_base64(image)
    except Exception as e:
        return fail(f"Failed to process image at {image_path}: {e}")

    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        # Without a key the request can only be rejected; do not upload the page at all.
        return fail(f"OPENAI_API_KEY is not set; cannot process page/file {page_identifier}")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": _EXTRACTION_PROMPT
                    },
                    {
                        "type": "image_url",
                        # The API expects an object carrying the URL, not a bare string.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                    }
                ]
            }
        ],
        "max_tokens": 4096
    }

    try:
        response = requests.post(
            _OPENAI_CHAT_COMPLETIONS_URL, headers=headers, json=payload, timeout=timeout
        )
    except requests.exceptions.RequestException as e:
        return fail(f"Network or API error when processing page/file {page_identifier}: {e}")

    # The body contains the extracted personal data, so keep it out of INFO-level logs.
    logging.debug(f"Full API response: {response.text}")

    if response.status_code != 200:
        return fail(
            f"Error in API call for page/file {page_identifier}: "
            f"HTTP {response.status_code} - {response.text}"
        )

    try:
        response_content = response.json()["choices"][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        return fail(f"Unexpected API response structure for page/file {page_identifier}: {e}")

    if not response_content:
        return fail(f"No content in JSON response for page/file {page_identifier}")

    try:
        records = _extract_records(response_content)
    except json.JSONDecodeError:
        return fail(f"JSON parsing error in response for page/file {page_identifier}")

    if not records:
        return fail(f"No records found in page/file: {page_identifier}")
    return records



def process_pdf_and_generate_csv(file_path):
    """Extract table rows from every page of *file_path* and write them to a CSV file.

    Args:
        file_path: Path of the uploaded PDF or image file.

    Returns:
        A ``(csv_path, message)`` tuple. ``csv_path`` is the path of a
        temporary ``.csv`` file holding all extracted rows, or ``None`` when
        nothing could be extracted; ``message`` is a human-readable summary.
    """
    error_pages = []  # Pages/files that failed or produced no records
    try:
        images = get_images(file_path)
    except Exception:
        # A corrupt upload or a missing PDF backend must not crash the UI callback.
        logging.exception(f"Failed to load pages from {file_path}")
        return None, "No data to save or an error occurred."

    structured_data = []
    # One scratch directory for the whole run; each page overwrites the previous file.
    with TemporaryDirectory() as temp_dir:
        image_path = os.path.join(temp_dir, "image.jpg")
        for page_number, image in enumerate(images, start=1):
            try:
                # JPEG cannot store palette/alpha modes (e.g. PNG uploads with
                # transparency), so normalise those pages before saving.
                if image.mode not in ("RGB", "L"):
                    image = image.convert("RGB")
                image.save(image_path)
            except (OSError, ValueError) as e:
                logging.error(f"Failed to save page/file {page_number} as JPEG: {e}")
                error_pages.append(page_number)
                continue
            data = process_files_fixed(image_path, page_number, error_pages)
            structured_data.extend(data or [])

    if not structured_data:
        return None, "No data to save or an error occurred."

    df = pd.DataFrame(structured_data)
    # Reserve a file name that outlives this function (Gradio serves it later),
    # but close our own handle before pandas writes to the path.
    with NamedTemporaryFile(delete=False, suffix='.csv') as tmp_file:
        csv_path = tmp_file.name
    df.to_csv(csv_path, index=False)
    return csv_path, f"Errors or no records found in {len(error_pages)} pages/files: {error_pages}"

def gradio_interface(pdf_file):
    """Gradio callback: turn the uploaded file into a CSV path and a status message.

    Args:
        pdf_file: The value supplied by the upload component. It may be
            ``None`` when nothing was uploaded, an object exposing the file
            path as ``.name``, or the path itself.

    Returns:
        A ``(csv_path_or_None, message)`` tuple for the two output components.
    """
    if pdf_file is None:
        # Submitting the form without a file must not raise inside the callback.
        return None, "Please upload a PDF or image file first."
    # Accept both upload objects (path in ``.name``) and plain path strings.
    file_path = getattr(pdf_file, "name", pdf_file)
    result_csv, message = process_pdf_and_generate_csv(file_path)
    return (result_csv or None), message

# Components of the web form: one upload field in, a CSV download and a status box out.
upload_input = gr.File(label="Please upload your PDF file")
result_outputs = [
    gr.File(label="Download the generated CSV file"),
    gr.Textbox(label="Messages"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=upload_input,
    outputs=result_outputs,
    title="PDF to CSV Table Extractor",
    description="Upload a PDF file to extract tables into a CSV format.",
)

# Turn on the request queue, then start the app with share=False.
iface.queue()
iface.launch(share=False)
#