import base64
import os
import random
import shutil
import string

import fitz  # PyMuPDF
import streamlit as st
from openai import OpenAI
from PIL import Image as Img

def process_pdf_with_ocr(pdf_path, api_key):
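    """Run OCR over every page of the PDF at pdf_path.

    Each page is rendered to a PNG with PyMuPDF, sent to the OpenAI vision
    model for text extraction, and the resulting markdown is collected into
    a dict keyed by 1-based page number. Progress is reported through
    Streamlit widgets, so this is intended to be called from a Streamlit app.
    """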
    def generate_random_string(length=10):
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))

    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_ocr_text(image_path, client, current_page, total_pages):
        # Map OCR progress into the 40-95% band used by the caller so the
        # progress bar lines up with the conversion phase and never moves backwards
        progress = 40 + (current_page / total_pages) * 55
        status_text.text(f"Processing page {current_page}/{total_pages} with OCR")
        progress_bar.progress(int(progress))
        
        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """
        
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        return response.choices[0].message.content

    # Initialize progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()
    progress_info = st.empty()
    
    # Initialize OpenAI client
    status_text.text("Initializing OpenAI client...")
    progress_bar.progress(5)
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()

    # Create temp folder for images
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    progress_bar.progress(10)

    result = {}
    try:
        # Open PDF and get total pages
        status_text.text("Opening PDF document...")
        pdf_document = fitz.open(pdf_path)
        total_pages = len(pdf_document)
        progress_bar.progress(15)

        # Convert PDF to images
        for page_num in range(total_pages):
            current_progress = 15 + (page_num / total_pages * 25)  # 15-40% progress for PDF to image conversion
            status_text.text(f"Converting page {page_num + 1}/{total_pages} to image")
            progress_info.text(f"PDF to Image conversion: {int(current_progress)}%")
            progress_bar.progress(int(current_progress))
            
            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)

        # Process OCR for each image
        status_text.text("Starting OCR processing...")
        progress_bar.progress(40)
        
        for page_num in range(total_pages):
            current_progress = 40 + (page_num / total_pages * 55)  # 40-95% progress for OCR
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            progress_info.text(f"OCR Processing: {int(current_progress)}%")
            
            ocr_text = get_ocr_text(image_path, client, page_num + 1, total_pages)
            result[page_num + 1] = ocr_text

        pdf_document.close()
        status_text.text("Finalizing...")
        progress_bar.progress(95)

    finally:
        # Clean up the temporary images even if OCR failed part-way through
        if os.path.exists(temp_folder):
            status_text.text("Cleaning up temporary files...")
            shutil.rmtree(temp_folder)

    # Only report completion once every page has been processed successfully
    progress_bar.progress(100)
    status_text.text("Processing complete!")
    progress_info.empty()

    return result

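# The triple-quoted block below is an earlier revision of process_pdf_with_ocr
# without Streamlit progress reporting; it is an inert string literal kept only
# for reference.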
'''
def process_pdf_with_ocr(pdf_path, api_key):
    def generate_random_string(length=10):
        characters = string.ascii_letters + string.digits
        return ''.join(random.choices(characters, k=length))
    
    def encode_image(image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    
    def get_ocr_text(image_path, client):
        prompt = """
        You are provided with an image that may contain handwritten text in a local Indian language or English, along with possible table structures. Your task is to extract all text using OCR, ensuring that:
        - Regular text is returned as plain text.
        - Any detected tables are reconstructed using proper markdown table formatting (using pipes "|" for columns and dashes "-" for row separators).
        Return only the extracted text in markdown format, with no additional commentary. If no text is detected, return an empty response.
        """
        base64_image = encode_image(image_path)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                ]
            }]
        )
        print(image_path)
        print(response.choices[0].message.content)
        return response.choices[0].message.content

    # Initialize OpenAI client
    os.environ["OPENAI_API_KEY"] = api_key
    client = OpenAI()
    
    # Create temp folder for images
    temp_folder = f"Images/{generate_random_string()}"
    os.makedirs(temp_folder, exist_ok=True)
    
    # Process PDF
    result = {}
    try:
        # Convert PDF to images
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            pix = page.get_pixmap(dpi=150)
            image_path = f"{temp_folder}/page_{page_num + 1}.png"
            image = Img.frombytes("RGB", [pix.width, pix.height], pix.samples)
            image.save(image_path)
            
            # Process each image with OCR
            ocr_text = get_ocr_text(image_path, client)
            result[page_num + 1] = ocr_text
        
        pdf_document.close()
        
    finally:
        # Clean up temporary files
        if os.path.exists(temp_folder):
            shutil.rmtree(temp_folder)
    
    return result
    '''
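
# --- Example usage (illustrative, not part of the original module) ---
# A minimal sketch of how process_pdf_with_ocr might be wired into a Streamlit
# page. The widget labels, the "uploaded.pdf" temporary path, and the
# page-by-page display below are assumptions chosen for this example.
if __name__ == "__main__":
    st.title("PDF OCR with GPT-4o")
    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    api_key = st.text_input("OpenAI API key", type="password")

    if uploaded_pdf is not None and api_key and st.button("Run OCR"):
        # process_pdf_with_ocr expects a file path, so persist the upload first.
        tmp_path = "uploaded.pdf"
        with open(tmp_path, "wb") as f:
            f.write(uploaded_pdf.getbuffer())
        try:
            pages = process_pdf_with_ocr(tmp_path, api_key)
            for page_num, text in sorted(pages.items()):
                st.subheader(f"Page {page_num}")
                st.markdown(text)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)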