import base64
import os
import random
import shutil
import string

import fitz  # PyMuPDF
import streamlit as st
from openai import OpenAI
from PIL import Image as Img

def process_pdf_with_ocr(pdf_path, api_key):
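    """Run OCR over every page of the PDF at pdf_path.

    Each page is rendered to a PNG with PyMuPDF, sent to the OpenAI vision
    model for text extraction, and the resulting markdown is collected into
    a dict keyed by 1-based page number. Progress is reported through
    Streamlit widgets, so this is intended to be called from a Streamlit app.
    """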
    def generate_random_string(length=10):
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Map OCR progress into the 40-95% band used by the caller so the
        # progress bar lines up with the conversion phase and never moves backwards
        progress = 40 + (current_page / total_pages) * 55
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))
        
        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """
        
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Initialize progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()
    
    # Initialize OpenAI client
    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Create temp folder for images
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    try:
        # Open PDF and get total pages
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        total_pages = len(pdf_document)
        progress_bar.progress(15)

        # Convert PDF to images
        for page_num in range(total_pages):
            current_progress = 15 + (page_num / total_pages * 25)  # 15-40% progress for PDF to image conversion
            status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
            progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
            progress_bar.progress(int(current_progress))
            
            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)

        # Process OCR for each image
        status_text.text("Starting OCR processing...")
        progress_bar.progress(40)
        
        for page_num in range(total_pages):
            current_progress = 40 + (page_num / total_pages * 55)  # 40-95% progress for OCR
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            progress_info.text(f"OCR Processing: {int(current_progress)}%")
            
            ocr_text = get_ocr_text(image_path, client, page_num + 1, total_pages)
            result[page_num + 1] = ocr_text

        pdf_document.close()
        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Clean up the temporary images even if OCR failed part-way through
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)

    # Only report completion once every page has been processed successfully
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    progress_info.empty()

    return result

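# The triple-quoted block below is an earlier revision of process_pdf_with_ocr
# without Streamlit progress reporting; it is an inert string literal kept only
# for reference.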
'''
def process_pdf_with_ocr(pdf_path, api_key):
    def generate_random_string(length=10):
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))
    
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    
    def get_ocr_text(image_path, client):
        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        print(image_path)
        print(response.choices[0].message.content)
        return response.choices[0].message.content

    # Initialize OpenAI client
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()
    
    # Create temp folder for images
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    
    # Process PDF
    result = {}
    try:
        # Convert PDF to images
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)
            
            # Process each image with OCR
            ocr_text = get_ocr_text(image_path, client)
            result[page_num + 1] = ocr_text
        
        pdf_document.close()
        
    finally:
        # Clean up temporary files
        if os.path.exists(temp_folder):
            shutil.rmtree(temp_folder)
    
    return result
    '''
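
# --- Example usage (illustrative, not part of the original module) ---
# A minimal sketch of how process_pdf_with_ocr might be wired into a Streamlit
# page. The widget labels, the "uploaded.pdf" temporary path, and the
# page-by-page display below are assumptions chosen for this example.
if __name__ == "__main__":
    st.title("PDF OCR with GPT-4o")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    api_key = st.text_input("OpenAI API key", type="password")

    if uploaded_pdf is not None and api_key and st.button("Run OCR"):
        # process_pdf_with_ocr expects a file path, so persist the upload first.
        tmp_path = "uploaded.pdf"
        with open(tmp_path, "wb") as f:
            f.write(uploaded_pdf.getbuffer())
        try:
            pages = process_pdf_with_ocr(tmp_path, api_key)
            for page_num, text in sorted(pages.items()):
                st.subheader(f"Page {page_num}")
                st.markdown(text)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)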