anand004 committed on
Commit e70cddd
1 Parent(s): 7afeb0e

bug fixes, faster ocr and restructure

Files changed (3):
  1. app.py +125 -158
  2. requirements.txt +4 -1
  3. utils.py +53 -0
app.py CHANGED
@@ -1,23 +1,23 @@
  import gradio as gr
- from unstructured.partition.pdf import partition_pdf
- import pymupdf
- from PIL import Image
- import numpy as np
  import io
  import pandas as pd
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- import gc
  import torch
- import chromadb
- from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
- from chromadb.utils.data_loaders import ImageLoader
- from sentence_transformers import SentenceTransformer
  from chromadb.utils import embedding_functions
- from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
- import base64
- from langchain_community.llms import HuggingFaceEndpoint
  from langchain import PromptTemplate
- import spaces

  if torch.cuda.is_available():
      processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
@@ -29,24 +29,17 @@ if torch.cuda.is_available():
      )


- def image_to_bytes(image):
-     img_byte_arr = io.BytesIO()
-     image.save(img_byte_arr, format="PNG")
-     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
-
-
- @spaces.GPU(duration=60*4)
- def get_image_descriptions(images):
      torch.cuda.empty_cache()
      gc.collect()

      descriptions = []
      prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"

-     for img in images:
-         inputs = processor(prompt, img, return_tensors="pt").to("cuda:0")
-         output = vision_model.generate(**inputs, max_new_tokens=100)
-         descriptions.append(processor.decode(output[0], skip_special_tokens=True))
      return descriptions

@@ -55,39 +48,6 @@ CSS = """
  """


- def extract_pdfs(docs, doc_collection):
-     if docs:
-         doc_collection = []
-         doc_collection.extend(docs)
-     return (
-         doc_collection,
-         gr.Tabs(selected=1),
-         pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
-     )
-
-
- def extract_images(docs):
-     images = []
-     for doc_path in docs:
-         doc = pymupdf.open(doc_path)  # open a document
-
-         for page_index in range(len(doc)):  # iterate over pdf pages
-             page = doc[page_index]  # get the page
-             image_list = page.get_images()
-
-             for image_index, img in enumerate(
-                 image_list, start=1
-             ):  # enumerate the image list
-                 xref = img[0]  # get the XREF of the image
-                 pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
-
-                 if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
-                     pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
-
-                 images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
-     return images
-
-
  # def get_vectordb(text, images, tables):
  def get_vectordb(text, images):
      client = chromadb.EphemeralClient()
@@ -99,7 +59,7 @@ def get_vectordb(text, images):
          client.delete_collection("text_db")
      if "image_db" in [i.name for i in client.list_collections()]:
          client.delete_collection("image_db")
-
      text_collection = client.get_or_create_collection(
          name="text_db",
          embedding_function=sentence_transformer_ef,
@@ -111,14 +71,21 @@ def get_vectordb(text, images):
          data_loader=loader,
          metadata={"hnsw:space": "cosine"},
      )
-
-     image_descriptions = get_image_descriptions(images)
-     image_dict = [{"image": image_to_bytes(img) for img in images}]
-
-     if len(images)>0:
          image_collection.add(
              ids=[str(i) for i in range(len(images))],
-             documents=image_descriptions,
              metadatas=image_dict,
          )

@@ -127,7 +94,7 @@ def get_vectordb(text, images):
          chunk_overlap=10,
      )

-     if len(text)>0:
          docs = splitter.create_documents([text])
          doc_texts = [i.page_content for i in docs]
          text_collection.add(
@@ -136,54 +103,31 @@ def get_vectordb(text, images):
      return client


- def extract_data_from_pdfs(docs, session, progress=gr.Progress()):
      if len(docs) == 0:
          raise gr.Error("No documents to process")
      progress(0, "Extracting Images")

-     images = extract_images(docs)

      progress(0.25, "Extracting Text")

      strategy = "hi_res"
      model_name = "yolox"
      all_elements = []
-
-     for doc in docs:
-         elements = partition_pdf(
-             filename=doc,
-             strategy=strategy,
-             infer_table_structure=True,
-             model_name=model_name,
-         )
-
-         all_elements.extend(elements)
-
      all_text = ""

-     # tables = []
-
-     prev = None
-     for i in all_elements:
-         meta = i.to_dict()
-         if meta["type"].lower() not in ["table", "figurecaption"]:
-             if meta["type"].lower() in ["listitem", "title"]:
-                 all_text += "\n\n" + meta["text"] + "\n"
-             else:
-                 all_text += meta["text"]
-         elif meta["type"] == "Table":
-             continue
-             # tables.append(meta["metadata"]["text_as_html"])
-
-     # html = "<br>".join(tables)
-     # display = "<h3>Sample Tables</h3>" + "<br>".join(tables[:2])
-     # html = gr.HTML(html)
-     # vectordb = get_vectordb(all_text, images, tables)
-
-     progress(0.5, "Generating image descriptions")
-     image_descriptions = "\n".join(get_image_descriptions(images))
-
-     progress(0.75, "Inserting data into vector database")
      vectordb = get_vectordb(all_text, images)

      progress(1, "Completed")
@@ -205,7 +149,23 @@ sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFuncti
  )


- def conversation(vectordb_client, msg, num_context, img_context, history):

      text_collection = vectordb_client.get_collection(
          "text_db", embedding_function=sentence_transformer_ef
@@ -217,8 +177,6 @@ def conversation(vectordb_client, msg, num_context, img_context, history):
      results = text_collection.query(
          query_texts=[msg], include=["documents"], n_results=num_context
      )["documents"][0]
-     # print(results)
-     # print("R"*100)
      similar_images = image_collection.query(
          query_texts=[msg],
          include=["metadatas", "distances", "documents"],
@@ -266,19 +224,12 @@ def get_stats(vectordb):
      return "\n".join(text_data), "", ""


- llm = HuggingFaceEndpoint(
-     repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
-     temperature=0.4,
-     max_new_tokens=800,
- )
-
- with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
-
      vectordb = gr.State()
      doc_collection = gr.State(value=[])
      session_states = gr.State(value={})
      references = gr.State(value=[])
-
      gr.Markdown(
          """<h2><center>Multimodal PDF Chatbot</center></h2>
          <h3><center><b>Interact With Your PDF Documents</b></center></h3>"""
@@ -312,18 +263,23 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
                      embed = gr.Button(value="Extract Data")
                  with gr.Column():
                      next_p1 = gr.Button(value="Next")

-             with gr.Row() as row:
-                 with gr.Column():
-                     selected = gr.Dataframe(
-                         interactive=False,
-                         col_count=(1, "fixed"),
-                         headers=["Selected Files"],
-                     )
-                 with gr.Column(variant="panel"):
-                     prog = gr.HTML(
-                         value="<h1 style='text-align: center'>Click the 'Extract' button to extract data from PDFs<h1>"
-                     )

              with gr.Accordion("See Parts of Extracted Data", open=False):
                  with gr.Column(visible=True) as sample_data:
@@ -337,32 +293,37 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
                          label="Sample Extracted Images", columns=1, rows=2
                      )

-
-
          with gr.TabItem("Chat", id=2) as chat_tab:
-             with gr.Column():
-                 choice = gr.Radio(
-                     ["chromaDB"],
-                     value="chromaDB",
-                     label="Vector Database",
-                     interactive=True,
-                 )
-                 num_context = gr.Slider(
-                     label="Number of text context elements",
-                     minimum=1,
-                     maximum=20,
-                     step=1,
-                     interactive=True,
-                     value=3,
-                 )
-                 img_context = gr.Slider(
-                     label="Number of image context elements",
-                     minimum=1,
-                     maximum=10,
-                     step=1,
-                     interactive=True,
-                     value=2,
-                 )

              with gr.Row():
                  with gr.Column():
                      ret_images = gr.Gallery("Similar Images", columns=1, rows=2)
@@ -370,14 +331,15 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
                  chatbot = gr.Chatbot(height=400)
                  with gr.Accordion("Text References", open=False):
                      # text_context = gr.Row()
-
                      @gr.render(inputs=references)
                      def gen_refs(references):
                          # print(references)
                          n = len(references)
                          for i in range(n):
-                             gr.Textbox(label=f"Reference-{i+1}", value=references[i], lines=3)
-

          with gr.Row():
              msg = gr.Textbox(
@@ -396,7 +358,7 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
      )
      embed.click(
          extract_data_from_pdfs,
-         inputs=[doc_collection, session_states],
          outputs=[
              vectordb,
              session_states,
@@ -409,13 +371,18 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:

      submit_btn.click(
          conversation,
-         [vectordb, msg, num_context, img_context, chatbot],
-         [chatbot,references ,ret_images],
      )


      back_p1.click(lambda: gr.Tabs(selected=0), None, tabs)

      next_p1.click(check_validity_and_llm, session_states, tabs)
  if __name__ == "__main__":
-     demo.launch()

+ import base64
+ import chromadb
+ import gc
  import gradio as gr
  import io
+ import numpy as np
+ import ocrmypdf
+ import os
  import pandas as pd
+ import pymupdf
  import torch
+ from PIL import Image
  from chromadb.utils import embedding_functions
+ from chromadb.utils.data_loaders import ImageLoader
  from langchain import PromptTemplate
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.llms import HuggingFaceEndpoint
+ from pdfminer.high_level import extract_text
+ from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+ from utils import *

  if torch.cuda.is_available():
      processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
      )


+ @spaces.GPU()
+ def get_image_description(image):
      torch.cuda.empty_cache()
      gc.collect()

      descriptions = []
      prompt = "[INST] <image>\nDescribe the image in a sentence [/INST]"

+     inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
+     output = vision_model.generate(**inputs, max_new_tokens=100)
+     descriptions.append(processor.decode(output[0], skip_special_tokens=True))
      return descriptions

  """


  # def get_vectordb(text, images, tables):
  def get_vectordb(text, images):
      client = chromadb.EphemeralClient()

          client.delete_collection("text_db")
      if "image_db" in [i.name for i in client.list_collections()]:
          client.delete_collection("image_db")
+
      text_collection = client.get_or_create_collection(
          name="text_db",
          embedding_function=sentence_transformer_ef,

          data_loader=loader,
          metadata={"hnsw:space": "cosine"},
      )
+     descs = []
+     print(descs)
+     for image in images:
+         try:
+             descs.append(get_image_description(image)[0])
+         except:
+             descs.append("Could not generate image description due to some error")
+
+     # image_descriptions = get_image_descriptions(images)
+     image_dict = [{"image": image_to_bytes(img)} for img in images]
+
+     if len(images) > 0:
          image_collection.add(
              ids=[str(i) for i in range(len(images))],
+             documents=descs,
              metadatas=image_dict,
          )

          chunk_overlap=10,
      )

+     if len(text) > 0:
          docs = splitter.create_documents([text])
          doc_texts = [i.page_content for i in docs]
          text_collection.add(

      return client


+ def extract_data_from_pdfs(docs, session, include_images, progress=gr.Progress()):
      if len(docs) == 0:
          raise gr.Error("No documents to process")
      progress(0, "Extracting Images")

+     # images = extract_images(docs)

      progress(0.25, "Extracting Text")

      strategy = "hi_res"
      model_name = "yolox"
      all_elements = []
      all_text = ""

+     images = []
+     for doc in docs:
+         ocrmypdf.ocr(doc, "ocr.pdf", deskew=True, skip_text=True)
+         text = extract_text("ocr.pdf")
+         all_text += clean_text(text) + "\n\n"
+         if include_images == "Include Images":
+             images.extend(extract_images(["ocr.pdf"]))
+
+     progress(
+         0.6, "Generating image descriptions and inserting everything into vectorDB"
+     )
      vectordb = get_vectordb(all_text, images)

      progress(1, "Completed")
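
The block above is the "faster ocr" path from the commit message: instead of unstructured's hi_res/yolox partitioning, each PDF is run through OCRmyPDF (which deskews pages and only OCRs those without an existing text layer) and the result is read back with pdfminer. A minimal standalone sketch of that flow; the helper name and sample file paths are illustrative assumptions, not part of the commit:

    import ocrmypdf
    from pdfminer.high_level import extract_text


    def pdf_to_clean_text(pdf_path, ocr_path="ocr.pdf"):
        # Deskew scanned pages and add a text layer only where one is missing
        # (skip_text=True leaves pages that already contain text untouched).
        ocrmypdf.ocr(pdf_path, ocr_path, deskew=True, skip_text=True)
        # Read the text layer back out and collapse runs of whitespace.
        text = extract_text(ocr_path)
        return " ".join(text.split())


    # print(pdf_to_clean_text("sample.pdf"))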
 
      )


+ def conversation(
+     vectordb_client, msg, num_context, img_context, history, hf_token, model_path
+ ):
+     if hf_token.strip() != "" and model_path.strip() != "":
+         llm = HuggingFaceEndpoint(
+             repo_id=model_path,
+             temperature=0.4,
+             max_new_tokens=800,
+             huggingfacehub_api_token=hf_token,
+         )
+     else:
+         llm = HuggingFaceEndpoint(
+             repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+             temperature=0.4,
+             max_new_tokens=800,
+             huggingfacehub_api_token=os.getenv("P_HF_TOKEN", "None"),
+         )

      text_collection = vectordb_client.get_collection(
          "text_db", embedding_function=sentence_transformer_ef

      results = text_collection.query(
          query_texts=[msg], include=["documents"], n_results=num_context
      )["documents"][0]
      similar_images = image_collection.query(
          query_texts=[msg],
          include=["metadatas", "distances", "documents"],
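
conversation() now builds its LLM client per call, so a HuggingFace token and model repo entered in the Chat tab override the Space's default endpoint (Meta-Llama-3-8B-Instruct, authenticated with the P_HF_TOKEN secret). A rough sketch of that selection logic factored into a helper; build_llm and the example prompt are hypothetical, while the endpoint parameters mirror the diff:

    import os

    from langchain_community.llms import HuggingFaceEndpoint


    def build_llm(hf_token="", model_path=""):
        # Prefer the caller's own endpoint when both a token and a repo id are given.
        if hf_token.strip() and model_path.strip():
            return HuggingFaceEndpoint(
                repo_id=model_path,
                temperature=0.4,
                max_new_tokens=800,
                huggingfacehub_api_token=hf_token,
            )
        # Otherwise fall back to the default model and the Space's stored token.
        return HuggingFaceEndpoint(
            repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
            temperature=0.4,
            max_new_tokens=800,
            huggingfacehub_api_token=os.getenv("P_HF_TOKEN", "None"),
        )


    # llm = build_llm()
    # print(llm.invoke("Answer from the retrieved context: ..."))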
 
      return "\n".join(text_data), "", ""


+ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
      vectordb = gr.State()
      doc_collection = gr.State(value=[])
      session_states = gr.State(value={})
      references = gr.State(value=[])
+
      gr.Markdown(
          """<h2><center>Multimodal PDF Chatbot</center></h2>
          <h3><center><b>Interact With Your PDF Documents</b></center></h3>"""
                      embed = gr.Button(value="Extract Data")
                  with gr.Column():
                      next_p1 = gr.Button(value="Next")
+             with gr.Row():
+                 include_images = gr.Radio(
+                     ["Include Images", "Exclude Images"],
+                     value="Include Images",
+                     label="Include/ Exclude Images",
+                     interactive=True,
+                 )

+             with gr.Row(equal_height=True, variant="panel") as row:
+                 selected = gr.Dataframe(
+                     interactive=False,
+                     col_count=(1, "fixed"),
+                     headers=["Selected Files"],
+                 )
+                 prog = gr.HTML(
+                     value="<h1 style='text-align: center'>Click the 'Extract' button to extract data from PDFs<h1>"
+                 )

              with gr.Accordion("See Parts of Extracted Data", open=False):
                  with gr.Column(visible=True) as sample_data:
                          label="Sample Extracted Images", columns=1, rows=2
                      )

          with gr.TabItem("Chat", id=2) as chat_tab:
+             with gr.Accordion("Config (Advanced) (Optional)", open=False):
+                 with gr.Row(variant="panel", equal_height=True):
+                     choice = gr.Radio(
+                         ["chromaDB"],
+                         value="chromaDB",
+                         label="Vector Database",
+                         interactive=True,
+                     )
+                     with gr.Accordion("Use your own model (optional)", open=False):
+                         hf_token = gr.Textbox(
+                             label="HuggingFace Token", interactive=True
+                         )
+                         model_path = gr.Textbox(label="Model Path", interactive=True)
+                 with gr.Row(variant="panel", equal_height=True):
+                     num_context = gr.Slider(
+                         label="Number of text context elements",
+                         minimum=1,
+                         maximum=20,
+                         step=1,
+                         interactive=True,
+                         value=3,
+                     )
+                     img_context = gr.Slider(
+                         label="Number of image context elements",
+                         minimum=1,
+                         maximum=10,
+                         step=1,
+                         interactive=True,
+                         value=2,
+                     )
              with gr.Row():
                  with gr.Column():
                      ret_images = gr.Gallery("Similar Images", columns=1, rows=2)

                  chatbot = gr.Chatbot(height=400)
                  with gr.Accordion("Text References", open=False):
                      # text_context = gr.Row()
+
                      @gr.render(inputs=references)
                      def gen_refs(references):
                          # print(references)
                          n = len(references)
                          for i in range(n):
+                             gr.Textbox(
+                                 label=f"Reference-{i+1}", value=references[i], lines=3
+                             )

          with gr.Row():
              msg = gr.Textbox(

      )
      embed.click(
          extract_data_from_pdfs,
+         inputs=[doc_collection, session_states, include_images],
          outputs=[
              vectordb,
              session_states,

      submit_btn.click(
          conversation,
+         [vectordb, msg, num_context, img_context, chatbot, hf_token, model_path],
+         [chatbot, references, ret_images],
      )

+     msg.submit(
+         conversation,
+         [vectordb, msg, num_context, img_context, chatbot, hf_token, model_path],
+         [chatbot, references, ret_images],
+     )

      back_p1.click(lambda: gr.Tabs(selected=0), None, tabs)

      next_p1.click(check_validity_and_llm, session_states, tabs)
  if __name__ == "__main__":
+     demo.launch()
requirements.txt CHANGED
@@ -1,6 +1,7 @@
  chromadb==0.5.3
  langchain==0.2.5
  langchain_community==0.2.5
+ langchain-huggingface
  numpy<2.0.0
  pandas==2.2.2
  Pillow==10.3.0
@@ -8,4 +9,6 @@ pymupdf==1.24.5
  sentence_transformers==3.0.1
  unstructured[all-docs]
  accelerate
- bitsandbytes
+ bitsandbytes
+ easyocr
+ ocrmypdf
utils.py ADDED
@@ -0,0 +1,53 @@
+ import pymupdf
+ from PIL import Image
+ import io
+ import gradio as gr
+ import pandas as pd
+
+
+ def image_to_bytes(image):
+     img_byte_arr = io.BytesIO()
+     image.save(img_byte_arr, format="PNG")
+     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
+
+
+ def extract_pdfs(docs, doc_collection):
+     if docs:
+         doc_collection = []
+         doc_collection.extend(docs)
+     return (
+         doc_collection,
+         gr.Tabs(selected=1),
+         pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
+     )
+
+
+ def extract_images(docs):
+     images = []
+     for doc_path in docs:
+         doc = pymupdf.open(doc_path)  # open a document
+
+         for page_index in range(len(doc)):  # iterate over pdf pages
+             page = doc[page_index]  # get the page
+             image_list = page.get_images()
+
+             for image_index, img in enumerate(
+                 image_list, start=1
+             ):  # enumerate the image list
+                 xref = img[0]  # get the XREF of the image
+                 pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
+
+                 if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
+                     pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
+
+                 images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
+     return images
+
+
+ def clean_text(text):
+     text = text.strip()
+     cleaned_text = text.replace("\n", " ")
+     cleaned_text = cleaned_text.replace("\t", " ")
+     cleaned_text = cleaned_text.replace("  ", " ")
+     cleaned_text = cleaned_text.strip()
+     return cleaned_text
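
The new utils.py collects the CPU-only helpers: extract_pdfs() tracks the uploaded file list, extract_images() pulls embedded images out of a PDF with PyMuPDF, image_to_bytes() base64-encodes a PIL image for storage as Chroma metadata (it relies on the standard-library base64 module), and clean_text() collapses whitespace in extracted text. A small hypothetical driver showing how they compose, assuming an OCRed "ocr.pdf" is already on disk:

    import base64
    import io

    from PIL import Image

    from utils import clean_text, extract_images, image_to_bytes

    images = extract_images(["ocr.pdf"])  # PIL images embedded in the PDF
    payload = [{"image": image_to_bytes(img)} for img in images]

    # Round-trip one entry to confirm the base64 payload decodes back to an image.
    if payload:
        raw = base64.b64decode(payload[0]["image"])
        Image.open(io.BytesIO(raw)).verify()

    print(clean_text("  Line one\nLine two\t"))  # -> "Line one Line two"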