File size: 3,617 Bytes
9453eac
5c3f634
4abc449
87a1c7f
 
 
 
 
 
 
 
 
9453eac
fd5f89e
c5a772e
 
fd5f89e
54a29b3
c5a772e
 
6ecc4f4
 
c5a772e
fd5f89e
768d260
87a1c7f
 
c5a772e
87a1c7f
 
872e2d7
 
 
 
 
3209503
6ecc4f4
87a1c7f
 
 
 
 
 
 
 
 
872e2d7
 
 
 
 
 
 
 
 
87a1c7f
57fa964
12a2f23
87a1c7f
c5a772e
2711484
 
12a2f23
fd5f89e
 
87a1c7f
 
fd5f89e
12a2f23
87a1c7f
c5a772e
258e6aa
2711484
87a1c7f
c5a772e
9453eac
 
fd5f89e
 
 
279ab91
fd5f89e
 
28a9f71
57fa964
2711484
12a2f23
 
57fa964
9453eac
 
4abc449
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gradio as gr
import easyocr
import numpy as np

from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from transformers import pipeline as hf_pipeline

# 1. OCR Processor (English)
class OCRProcessor:
    """Wraps an EasyOCR English reader and converts an image to plain text."""

    def __init__(self):
        # English-only model; loaded once when the processor is created.
        self.reader = easyocr.Reader(['en'])

    def extract_text(self, image: np.ndarray) -> str:
        """Run OCR on *image* and return the recognized text.

        Returns "" when nothing is recognized. Never raises: any OCR failure
        is reported as an "OCR error: ..." string so callers can branch on
        the prefix instead of handling exceptions.
        """
        try:
            lines = self.reader.readtext(image, detail=0, paragraph=True)
        except Exception as exc:
            return f"OCR error: {str(exc)}"
        if not lines:
            return ""
        return "\n".join(lines)

# 2. LangChain-based DocQA Agent
class LangChainDocQAAgent:
    """Retrieval-augmented QA over raw text: FAISS retrieval + extractive HF model."""

    def __init__(self):
        # Sentence embeddings used to build the FAISS index.
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Overlapping chunks reduce the chance an answer is cut at a boundary.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        # Extractive QA head applied to the single best-matching chunk.
        self.qa_pipeline = hf_pipeline(
            "question-answering",
            model="deepset/roberta-base-squad2",
            tokenizer="deepset/roberta-base-squad2",
        )

    def prepare_retriever(self, text):
        """Split *text* into chunks, index them in FAISS, return (retriever, docs)."""
        chunks = self.text_splitter.split_text(text)
        docs = []
        for chunk in chunks:
            docs.append(Document(page_content=chunk))
        index = FAISS.from_documents(docs, self.embeddings)
        return index.as_retriever(), docs

    def qa(self, text, question):
        """Answer *question* from *text*.

        Returns a (relevant_chunk, answer) pair; both elements are strings.
        Empty/blank text or question short-circuits with a message in the
        first slot, matching what the UI displays.
        """
        if not text.strip() or not question.strip():
            return "No text or question provided.", ""

        retriever, _docs = self.prepare_retriever(text)
        hits = retriever.get_relevant_documents(question)
        if not hits:
            return "", "No answer found."

        context = hits[0].page_content
        if not context:
            # Retrieval returned an empty chunk; nothing to feed the QA model.
            return context, "No answer found."
        prediction = self.qa_pipeline({"context": context, "question": question})
        return context, prediction["answer"]

# Module-level singletons shared by every Gradio request; both load their
# models once at import time rather than per call.
ocr_processor = OCRProcessor()
docqa_agent = LangChainDocQAAgent()

def docqa_pipeline(image, question):
    """Full pipeline for the UI: OCR the image, then retrieval-QA on the text.

    Returns (extracted_text, formatted_answer) — one string per output box.
    """
    # Step 1: OCR. extract_text signals failure via an "OCR error" prefix
    # instead of raising, so branch on the prefix here.
    context = ocr_processor.extract_text(image)
    if context.startswith("OCR error"):
        return context, "No answer."

    # Step 2: retrieval + extractive QA over the OCR text.
    relevant_chunk, answer = docqa_agent.qa(context, question)
    formatted = f"Relevant chunk:\n{relevant_chunk}\n\nModel answer:\n{answer}"
    return context, formatted

# Gradio UI: image + question on the left, OCR text and model answer on the right.
with gr.Blocks(title="DocQA Agent (LangChain): Intelligent Q&A from Extracted English Document") as app:
    gr.Markdown("""
    # omidsakaki.ir
    <br>
    A multi-agent system for question answering from English documents (OCR + retrieval + intelligent answer with LangChain)
    """)
    with gr.Row():
        with gr.Column():
            image_box = gr.Image(label="Input Image", type="numpy")
            question_box = gr.Textbox(
                label="Your question (in English)",
                placeholder="e.g. Who is the author of this text?",
                lines=1,
            )
            run_button = gr.Button("Get Answer")
        with gr.Column():
            # Outputs are read-only: one box for raw OCR text, one for the answer.
            text_box = gr.Textbox(label="Extracted Text", lines=10, max_lines=None, interactive=False)
            result_box = gr.Textbox(label="Model Output (Relevant Chunk & Answer)", lines=10, max_lines=None, interactive=False)

    run_button.click(
        fn=docqa_pipeline,
        inputs=[image_box, question_box],
        outputs=[text_box, result_box],
    )

if __name__ == "__main__":
    app.launch()