anand004 committed
Commit 3f98f11
1 Parent(s): 243e843

Create app.py

Files changed (1)
  app.py +434 -0
app.py ADDED
@@ -0,0 +1,434 @@
import gradio as gr
from unstructured.partition.pdf import partition_pdf
import pymupdf
from PIL import Image
import numpy as np
import io
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import gc
import torch
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import base64
from langchain_community.llms import HuggingFaceEndpoint
from langchain import PromptTemplate
import spaces

# Load the LLaVA-NeXT processor and vision model only when a GPU is available.
if torch.cuda.is_available():
    processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
    vision_model = LlavaNextForConditionalGeneration.from_pretrained(
        "llava-hf/llava-v1.6-mistral-7b-hf",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_4bit=True,
    )


def image_to_bytes(image):
    # Encode a PIL image as a base64 PNG string for storage in Chroma metadata.
    img_byte_arr = io.BytesIO()
    image.save(img_byte_arr, format="PNG")
    return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


@spaces.GPU
def get_image_descriptions(images):
    # Generate a one-sentence caption for each extracted image with LLaVA-NeXT.
    torch.cuda.empty_cache()
    gc.collect()

    descriptions = []
    prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"

    for img in images:
        inputs = processor(prompt, img, return_tensors="pt").to("cuda:0")
        output = vision_model.generate(**inputs, max_new_tokens=100)
        descriptions.append(processor.decode(output[0], skip_special_tokens=True))
    return descriptions


CSS = """
#table_col {background-color: rgb(33, 41, 54);}
"""


def extract_pdfs(docs, doc_collection):
    if docs:
        doc_collection = []
        doc_collection.extend(docs)
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
    )


def extract_images(docs):
    images = []
    for doc_path in docs:
        doc = pymupdf.open(doc_path)  # open a document

        for page_index in range(len(doc)):  # iterate over pdf pages
            page = doc[page_index]  # get the page
            image_list = page.get_images()

            for image_index, img in enumerate(
                image_list, start=1
            ):  # enumerate the image list
                xref = img[0]  # get the XREF of the image
                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap

                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                    pix = pymupdf.Pixmap(pymupdf.csRGB, pix)

                images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
    return images


# def get_vectordb(text, images, tables):
def get_vectordb(text, images):
    client = chromadb.EphemeralClient()
    loader = ImageLoader()
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="multi-qa-mpnet-base-dot-v1"
    )
    if "text_db" in [i.name for i in client.list_collections()]:
        client.delete_collection("text_db")
    if "image_db" in [i.name for i in client.list_collections()]:
        client.delete_collection("image_db")
    text_collection = client.get_or_create_collection(
        name="text_db",
        embedding_function=sentence_transformer_ef,
        data_loader=loader,
    )
    image_collection = client.get_or_create_collection(
        name="image_db",
        embedding_function=sentence_transformer_ef,
        data_loader=loader,
        metadata={"hnsw:space": "cosine"},
    )

    image_descriptions = get_image_descriptions(images)
    # One metadata dict per image, so the metadatas list matches the number of ids.
    image_dict = [{"image": image_to_bytes(img)} for img in images]

    image_collection.add(
        ids=[str(i) for i in range(len(images))],
        documents=image_descriptions,
        metadatas=image_dict,
    )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=10,
    )

    docs = splitter.create_documents([text])
    doc_texts = [i.page_content for i in docs]
    text_collection.add(
        ids=[str(i) for i in range(len(doc_texts))], documents=doc_texts
    )
    return client


def extract_data_from_pdfs(docs, session, progress=gr.Progress()):
    if len(docs) == 0:
        raise gr.Error("No documents to process")
    progress(0, "Extracting Images")

    images = extract_images(docs)

    progress(0.25, "Extracting Text")

    strategy = "hi_res"
    model_name = "yolox"
    all_elements = []

    for doc in docs:
        elements = partition_pdf(
            filename=doc,
            strategy=strategy,
            infer_table_structure=True,
            model_name=model_name,
        )

        all_elements.extend(elements)

    all_text = ""

    # tables = []

    for i in all_elements:
        meta = i.to_dict()
        if meta["type"].lower() not in ["table", "figurecaption"]:
            if meta["type"].lower() in ["listitem", "title"]:
                all_text += "\n\n" + meta["text"] + "\n"
            else:
                all_text += meta["text"]
        elif meta["type"] == "Table":
            continue
            # tables.append(meta["metadata"]["text_as_html"])

    # html = "<br>".join(tables)
    # display = "<h3>Sample Tables</h3>" + "<br>".join(tables[:2])
    # html = gr.HTML(html)
    # vectordb = get_vectordb(all_text, images, tables)

    progress(0.5, "Generating image descriptions")
    # Image descriptions are generated inside get_vectordb; the separate
    # image_descriptions output below is currently disabled, so skip the
    # duplicate (and expensive) LLaVA pass here.
    # image_descriptions = "\n".join(get_image_descriptions(images))

    progress(0.75, "Inserting data into vector database")
    vectordb = get_vectordb(all_text, images)

    progress(1, "Completed")
    session["processed"] = True
    return (
        vectordb,
        session,
        gr.Column(visible=True),
        all_text[:2000] + "...",
        # display,
        images[:2],
        "<h1 style='text-align: center'>Completed</h1>",
        # image_descriptions
    )


sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="multi-qa-mpnet-base-dot-v1"
)


def conversation(vectordb_client, msg, num_context, img_context, history):
    text_collection = vectordb_client.get_collection(
        "text_db", embedding_function=sentence_transformer_ef
    )
    image_collection = vectordb_client.get_collection(
        "image_db", embedding_function=sentence_transformer_ef
    )

    results = text_collection.query(
        query_texts=[msg], include=["documents"], n_results=num_context
    )["documents"][0]

    similar_images = image_collection.query(
        query_texts=[msg],
        include=["metadatas", "distances", "documents"],
        n_results=img_context,
    )
    img_links = [i["image"] for i in similar_images["metadatas"][0]]

    images_and_locs = [
        Image.open(io.BytesIO(base64.b64decode(img))) for img in img_links
    ]
    img_desc = "\n".join(similar_images["documents"][0])
    if len(img_links) == 0:
        img_desc = "No Images Are Provided"

    template = """
    Context:
    {context}

    Included Images:
    {images}

    Question:
    {question}

    Answer:

    """
    prompt = PromptTemplate(
        template=template, input_variables=["context", "images", "question"]
    )
    context = "\n\n".join(results)
    response = llm(prompt.format(context=context, question=msg, images=img_desc))
    return history + [(msg, response)], context, images_and_locs


def check_validity_and_llm(session_states):
    if session_states.get("processed", False):
        return gr.Tabs(selected=2)
    raise gr.Error("Please extract data first")


def get_stats(vectordb):
    eles = vectordb.get()
    text_data = [f"Chunks: {len(eles)}"]
    return "\n".join(text_data), "", ""


llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    temperature=0.4,
    max_new_tokens=800,
)

with gr.Blocks(css=CSS) as demo:

    vectordb = gr.State()
    doc_collection = gr.State(value=[])
    session_states = gr.State(value={})
    gr.Markdown(
        """<h2><center>Multimodal PDF Chatbot</center></h2>
        <h3><center><b>Interact With Your PDF Documents</b></center></h3>"""
    )
    gr.Markdown(
        """<center><h3><b>Note:</b> This application leverages advanced Retrieval-Augmented Generation (RAG) techniques to provide context-aware responses from your PDF documents.</h3></center><br>
        <center>Utilizing multimodal capabilities, this chatbot can interpret and answer queries based on both textual and visual information within your PDFs.</center>"""
    )
    gr.Markdown(
        """
        <center><b>Warning:</b> Extracting text and images from your document and generating embeddings may take some time due to the use of OCR and multimodal LLMs for image description.</center>
        """
    )
    with gr.Tabs() as tabs:
        with gr.TabItem("Upload PDFs", id=0) as pdf_tab:
            with gr.Row():
                with gr.Column():
                    documents = gr.File(
                        file_count="multiple",
                        file_types=["pdf"],
                        interactive=True,
                        label="Upload your PDF file/s",
                    )
                    pdf_btn = gr.Button(value="Next", elem_id="button1")

        with gr.TabItem("Extract Data", id=1) as preprocess:
            with gr.Row():
                with gr.Column():
                    back_p1 = gr.Button(value="Back")
                with gr.Column():
                    embed = gr.Button(value="Extract Data")
                with gr.Column():
                    next_p1 = gr.Button(value="Next")

            with gr.Row() as row:
                with gr.Column():
                    selected = gr.Dataframe(
                        interactive=False,
                        col_count=(1, "fixed"),
                        headers=["Selected Files"],
                    )
                with gr.Column(variant="panel"):
                    prog = gr.HTML(
                        value="<h1 style='text-align: center'>Click the 'Extract Data' button to extract data from PDFs</h1>"
                    )

            with gr.Accordion("See Parts of Extracted Data", open=False):
                with gr.Column(visible=True) as sample_data:
                    with gr.Row():
                        with gr.Column():
                            ext_text = gr.Textbox(
                                label="Sample Extracted Text", lines=15
                            )
                        with gr.Column():
                            images = gr.Gallery(
                                label="Sample Extracted Images", columns=1, rows=2
                            )

            # with gr.Row():
            #     image_desc = gr.Textbox(label="Image Descriptions", interactive=False)
            # with gr.Row(variant="panel"):
            #     ext_tables = gr.HTML("<h3>Sample Tables</h3>", label="Extracted Tables")

        # with gr.TabItem("Embeddings", id=3) as embed_tab:
        #     with gr.Row():
        #         with gr.Column():
        #             back_p2 = gr.Button(value="Back")
        #         with gr.Column():
        #             view_stats = gr.Button(value="View Stats")
        #         with gr.Column():
        #             next_p2 = gr.Button(value="Next")

        #     with gr.Row():
        #         with gr.Column():
        #             text_stats = gr.Textbox(label="Text Stats", interactive=False)
        #         with gr.Column():
        #             table_stats = gr.Textbox(label="Table Stats", interactive=False)
        #         with gr.Column():
        #             image_stats = gr.Textbox(label="Image Stats", interactive=False)

        with gr.TabItem("Chat", id=2) as chat_tab:
            with gr.Column():
                choice = gr.Radio(
                    ["chromaDB"],
                    value="chromaDB",
                    label="Vector Database",
                    interactive=True,
                )
                num_context = gr.Slider(
                    label="Number of text context elements",
                    minimum=1,
                    maximum=20,
                    step=1,
                    interactive=True,
                    value=3,
                )
                img_context = gr.Slider(
                    label="Number of image context elements",
                    minimum=1,
                    maximum=10,
                    step=1,
                    interactive=True,
                    value=2,
                )
            with gr.Row():
                with gr.Column():
                    ret_images = gr.Gallery(label="Similar Images", columns=1, rows=2)
                with gr.Column():
                    chatbot = gr.Chatbot(height=400)
            with gr.Accordion("Text References", open=False):
                with gr.Row():
                    text_context = gr.Textbox(interactive=False, lines=10)

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Type your question here (e.g. 'What is this document about?')",
                    interactive=True,
                    container=True,
                )
            with gr.Row():
                submit_btn = gr.Button("Submit message")
                clear_btn = gr.ClearButton([msg, chatbot], value="Clear conversation")

    pdf_btn.click(
        fn=extract_pdfs,
        inputs=[documents, doc_collection],
        outputs=[doc_collection, tabs, selected],
    )
    embed.click(
        extract_data_from_pdfs,
        inputs=[doc_collection, session_states],
        outputs=[
            vectordb,
            session_states,
            sample_data,
            ext_text,
            # ext_tables,
            images,
            prog,
            # image_desc
        ],
    )

    submit_btn.click(
        conversation,
        [vectordb, msg, num_context, img_context, chatbot],
        [chatbot, text_context, ret_images],
    )

    # view_stats.click(
    #     get_stats, [vectordb], outputs=[text_stats, table_stats, image_stats]
    # )

    # Page Navigation

    back_p1.click(lambda: gr.Tabs(selected=0), None, tabs)

    next_p1.click(check_validity_and_llm, session_states, tabs)

if __name__ == "__main__":
    demo.launch(share=True)