Spaces:

KrishP-12
/

ocrchatbot

Running

File size: 11,280 Bytes

28b261d

import base64
import html
import os
from io import BytesIO

import gradio as gr
import pytesseract
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from pdf2image import convert_from_path
from PIL import Image

# Set up Groq API Key and LLM
# SECURITY: the API key is read from the GROQ_API_KEY environment variable
# (for example a Hugging Face Space secret) and must never be committed to
# source control.
# NOTE(review): a key was previously hard-coded at this spot; treat that key
# as compromised and rotate it.
if not os.environ.get("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Configure it as an environment variable "
        "(or Space secret) before starting the app."
    )

# Shared chat model used by the question-answering chain.
llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# OCR Functions
def ocr_image(image_path, language='eng+guj'):
    """Run Tesseract OCR on a single image file and return the extracted text.

    Args:
        image_path: Path of the image file to read.
        language: Tesseract language specification passed as ``lang``.

    Returns:
        The text produced by ``pytesseract.image_to_string``.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR has finished, instead of lingering until GC.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img, lang=language)

def ocr_pdf(pdf_path, language='eng+guj'):
    """Run Tesseract OCR on every page of a PDF and return the combined text.

    The PDF is rendered to page images with ``convert_from_path``; each page
    is OCR'd in order and its text is followed by a newline.

    Args:
        pdf_path: Path of the PDF file to read.
        language: Tesseract language specification passed as ``lang``.

    Returns:
        The concatenated text of all pages (empty string for no pages).
    """
    pages = convert_from_path(pdf_path)
    page_texts = [
        pytesseract.image_to_string(page, lang=language) + "\n"
        for page in pages
    ]
    return "".join(page_texts)

def ocr_file(file_path):
    """Dispatch a file to the matching OCR routine based on its extension.

    PDFs go to ``ocr_pdf`` and JPG/JPEG/PNG/BMP images go to ``ocr_image``;
    both are called with the ``'guj+eng'`` language specification. The
    extension comparison is case-insensitive.

    Args:
        file_path: Path of the file to OCR.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".pdf":
        return ocr_pdf(file_path, language='guj+eng')

    if suffix in (".jpg", ".jpeg", ".png", ".bmp"):
        return ocr_image(file_path, language='guj+eng')

    raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")

def get_text_chunks(text):
    """Split ``text`` into overlapping chunks for embedding.

    Uses ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=500`` and ``chunk_overlap=100``.

    Args:
        text: The raw text to split.

    Returns:
        The chunks returned by the splitter's ``split_text``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    ).split_text(text)

def get_vector_store(text_chunks):
    """Embed ``text_chunks`` into a FAISS index and persist it to disk.

    The chunks are embedded on CPU with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model (normalized embeddings)
    and the resulting index is saved under the ``faiss_index`` directory.

    Args:
        text_chunks: Text chunks to index.

    Returns:
        The in-memory FAISS vector store that was just saved.
    """
    index_dir = "faiss_index"
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    store = FAISS.from_texts(text_chunks, embedding=embedder)

    # Make sure the target directory exists before writing the index files.
    os.makedirs(index_dir, exist_ok=True)
    store.save_local(index_dir)
    return store

def process_ocr_and_pdf_files(file_paths):
    """OCR every file, chunk the combined text and build the vector store.

    Args:
        file_paths: Paths of the files to OCR, processed in order.

    Returns:
        The vector store returned by ``get_vector_store``.
    """
    # Each file's text is terminated with a newline before concatenation.
    combined_text = "".join(ocr_file(path) + "\n" for path in file_paths)
    return get_vector_store(get_text_chunks(combined_text))

def get_conversational_chain():
    """Build the retrieval question-answering chain over the saved index.

    Loads the FAISS index from the ``faiss_index`` directory, wraps it in a
    retriever and combines it with the module-level ``llm`` in a
    ``RetrievalQA`` chain of type ``'stuff'``. The chain uses a custom prompt
    (variables ``history``, ``context``, ``question``) and a
    ``ConversationBufferMemory`` keyed on ``history``.

    Returns:
        The configured ``RetrievalQA`` chain.
    """
    template = """You are an intelligent educational assistant specialized in handling queries about documents. You have been provided with OCR-processed text from the uploaded files that contains important educational information.

Core Responsibilities:
1. Language Processing:
   - Identify the language of the user's query (English or Gujarati)
   - Respond in the same language as the query
   - If the query is in Gujarati, ensure the response maintains proper Gujarati grammar and terminology
   - For technical terms, provide both English and Gujarati versions when relevant

2. Document Understanding:
   - Analyze the OCR-processed text from the uploaded files
   - Account for potential OCR errors or misinterpretations
   - Focus on extracting accurate information despite possible OCR imperfections

3. Response Guidelines:
   - Provide direct, clear answers based solely on the document content
   - If information is unclear due to OCR quality, mention this limitation
   - For numerical data (dates, percentages, marks), double-check accuracy before responding
   - If information is not found in the documents, clearly state: "This information is not present in the uploaded documents"

4. Educational Context:
   - Maintain focus on educational queries related to the document content
   - For admission-related queries, emphasize important deadlines and requirements
   - For scholarship information, highlight eligibility criteria and application processes
   - For course-related queries, provide detailed, accurate information from the documents

5. Response Format:
   - Structure responses clearly with relevant subpoints when necessary
   - For complex information, break down the answer into digestible parts
   - Include relevant reference points from the documents when applicable
   - Format numerical data and dates clearly

6. Quality Control:
   - Verify that responses align with the document content
   - Don't make assumptions beyond the provided information
   - If multiple interpretations are possible due to OCR quality, mention all possibilities
   - Maintain consistency in terminology throughout the conversation

Important Rules:
- Never make up information not present in the documents
- Don't combine information from previous conversations or external knowledge
- Always indicate if certain parts of the documents are unclear due to OCR quality
- Maintain professional tone while being accessible to students and parents
- If the query is out of scope of the uploaded documents, politely redirect to relevant official sources

Context from uploaded documents:
{context}

Chat History:
{history}

Current Question: {question}
Assistant: Let me provide a clear and accurate response based on the uploaded documents...
"""
    # The retriever embeds each query with this model, so it must be the same
    # model that get_vector_store used to build the index
    # (all-MiniLM-L6-v2). A different model would place queries in a
    # different vector space and make similarity search unreliable.
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )

    # SECURITY: load_local deserializes pickled data. This is only acceptable
    # because the index directory is written by this application itself;
    # never point it at an index obtained from an untrusted source.
    new_vector_store = FAISS.load_local(
        "faiss_index", embeddings, allow_dangerous_deserialization=True
    )

    QA_CHAIN_PROMPT = PromptTemplate(
        input_variables=["history", "context", "question"],
        template=template
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=new_vector_store.as_retriever(),
        chain_type='stuff',
        verbose=True,
        chain_type_kwargs={
            "verbose": True,
            "prompt": QA_CHAIN_PROMPT,
            # Memory feeds the {history} slot of the prompt; the user's
            # message is read from the "question" input.
            "memory": ConversationBufferMemory(memory_key="history", input_key="question"),
        }
    )

    return qa_chain
def process_files_and_query(files, query):
    """OCR the uploaded files, index them and answer ``query`` about them.

    The files are copied into the local ``temp`` directory, OCR'd and indexed
    via ``process_ocr_and_pdf_files``, and the question is then answered by
    the chain from ``get_conversational_chain``.

    Args:
        files: Paths of the uploaded files (at most 5). May be ``None`` or
            empty when nothing has been uploaded.
        query: The user's question.

    Returns:
        The answer text, ``"No result found"`` if the chain produced no
        ``result`` entry, or a string starting with ``"Error:"`` when the
        input is invalid.
    """
    # Validate input up front; the UI can invoke this with no files selected.
    if not files:
        return "Error: Please upload at least one file."
    if len(files) > 5:
        return "Error: You can upload a maximum of 5 files only."
    if not query or not query.strip():
        return "Error: Please enter a question."

    # Ensure temp directory exists
    os.makedirs("temp", exist_ok=True)

    # Save uploaded files
    file_paths = []
    for file in files:
        file_path = os.path.join("temp", os.path.basename(file))
        # Opening the destination for writing truncates it, so never copy a
        # file onto itself.
        if os.path.abspath(file) != os.path.abspath(file_path):
            with open(file, "rb") as src, open(file_path, "wb") as dst:
                dst.write(src.read())
        file_paths.append(file_path)

    # Process files and create vector store. Unsupported extensions are
    # reported as an error string, consistent with the checks above.
    try:
        vector_store = process_ocr_and_pdf_files(file_paths)
    except ValueError as exc:
        return f"Error: {exc}"

    # Reuse the freshly built store instead of re-creating the embedding
    # model and reloading the same index from disk.
    docs = vector_store.similarity_search(query)

    # NOTE(review): RetrievalQA appears to run its own retrieval from
    # "query"; confirm whether "input_documents" is still required.
    chain = get_conversational_chain()
    response = chain({"input_documents": docs, "query": query}, return_only_outputs=True)
    return response.get("result", "No result found")
def handle_uploaded_file(uploaded_files, show_in_sidebar=False):
    """Build an HTML preview for the uploaded files.

    PDFs are embedded as a base64 ``<iframe>``, images (JPG/JPEG/PNG/BMP) are
    re-encoded as PNG and embedded as a base64 ``<img>``, and any other file
    is read as UTF-8 text and shown HTML-escaped inside ``<pre>``.

    Args:
        uploaded_files: Uploaded files, either as path strings or as objects
            exposing the path through a ``name`` attribute. A single path is
            also accepted. ``None`` or an empty list yields an empty preview.
        show_in_sidebar: Currently unused; kept for interface compatibility.

    Returns:
        The concatenated HTML fragments, an empty string when there is
        nothing to preview, or an error message if more than 5 files are
        given.
    """
    # The change event also fires when the selection is cleared.
    if not uploaded_files:
        return ""

    # Accept a single path as well as a list of uploads.
    if isinstance(uploaded_files, (str, os.PathLike)):
        uploaded_files = [uploaded_files]

    if len(uploaded_files) > 5:
        return "Error: You can upload a maximum of 5 files only."

    fragments = []
    for uploaded_file in uploaded_files:
        # Plain path strings have no ``name`` attribute; fall back to the
        # value itself in that case.
        source_name = os.fspath(getattr(uploaded_file, "name", uploaded_file))
        file_extension = os.path.splitext(source_name)[1].lower()

        # For an absolute upload path os.path.join returns that path itself;
        # for a relative name the copy under "temp" is preferred, falling
        # back to the name as given if no such copy exists.
        file_path = os.path.join("temp", source_name)
        if not os.path.exists(file_path) and os.path.exists(source_name):
            file_path = source_name

        if file_extension == ".pdf":
            # Read and encode the PDF as base64 to embed in the sidebar
            with open(file_path, "rb") as pdf_file:
                pdf_data = pdf_file.read()
            pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
            fragments.append(
                f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="500" height="500"></iframe>'
            )

        elif file_extension in ('.jpg', '.jpeg', '.png', '.bmp'):
            # Re-encode the image as PNG in memory; the context manager
            # closes the source file once the conversion is done.
            img_byte_array = BytesIO()
            with Image.open(file_path) as img:
                img.save(img_byte_array, format="PNG")
            fragments.append(
                f'<img src="data:image/png;base64,{base64.b64encode(img_byte_array.getvalue()).decode()}" width="400" height="400"/>'
            )

        else:
            # For text files, show the file content. Undecodable bytes are
            # replaced rather than aborting the preview, and the text is
            # escaped so file content cannot inject markup into the page.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
            fragments.append(f"<pre>{html.escape(content)}</pre>")

    return "".join(fragments)

# Gradio interface setup
def upload_and_display(files):
    """Return the HTML preview for the currently selected files.

    Args:
        files: The uploaded files, or ``None``/empty when the selection has
            been cleared.

    Returns:
        The preview HTML from ``handle_uploaded_file``, an empty string when
        nothing is selected, or an error message if more than 5 files are
        given.
    """
    # Clearing the file input triggers this handler with no files; show an
    # empty preview instead of failing on len(None).
    if not files:
        return ""

    if len(files) > 5:
        return "Error: You can upload a maximum of 5 files only."

    return handle_uploaded_file(files, show_in_sidebar=True)

def launch_gradio_app():
    """Build the Gradio Blocks UI and return it without launching.

    The layout is a single row with two equally scaled columns: the left one
    holds the file upload, the question box, the submit button and the answer
    box; the right one shows an HTML preview of the uploaded files.

    Returns:
        The constructed ``gr.Blocks`` application.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Document OCR and Q&A Assistant")
        
        with gr.Row():
            with gr.Column(scale=1):  # Main content area
                file_input = gr.File(
                    file_count="multiple", 
                    type="filepath",  # the handlers below treat each upload as a file path
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp"],
                    label="Upload Documents (PDF/Images)"
                )
                
                query_input = gr.Textbox(
                    label="Ask a Question about the Documents", 
                    lines=3
                )
                
                submit_btn = gr.Button("Process and Query")
                
                output = gr.Textbox(label="Answer", lines=5)
                
                # Clicking the button runs OCR + indexing + Q&A and writes the answer box.
                submit_btn.click(
                    fn=process_files_and_query, 
                    inputs=[file_input, query_input], 
                    outputs=[output]
                )
                
            with gr.Column(scale=1):  # Sidebar
                gr.Markdown("## Sidebar")
                file_preview = gr.HTML(label="File Preview")  # Display the preview content here
                # Refresh the preview whenever the set of uploaded files changes.
                file_input.change(fn=upload_and_display, inputs=file_input, outputs=file_preview)
    
    return demo

# Launch the Gradio app
if __name__ == "__main__":
    # Build and serve the UI only when this file is executed as a script.
    app = launch_gradio_app()
    app.launch(share=True)  # Set share=True to create a public link



# # Launch the Gradio app
# if __name__ == "__main__":
#     app = launch_gradio_app()
#     # app.launch()
#     app.launch(share=True) 
    # demo.launch()