Spaces:

anand004
/

Multimodal-PDF-RAG

Runtime error

App Files Files Community

anand004 committed on Jun 24, 2024

Commit

e014b81

unverified ·

1 Parent(s): 65aad38

bug fixes and improvement

Browse files

Files changed (3) hide show

app.py +96 -17
requirements.txt +4 -2
utils.py +7 -9

app.py CHANGED Viewed

@@ -8,11 +8,14 @@ import ocrmypdf
 import os
 import pandas as pd
 import pymupdf
 import spaces
 import torch
 from PIL import Image
 from chromadb.utils import embedding_functions
 from chromadb.utils.data_loaders import ImageLoader
 from gradio.themes.utils import sizes
 from langchain import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -22,6 +25,29 @@ from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
 from utils import *
 if torch.cuda.is_available():
     processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
     vision_model = LlavaNextForConditionalGeneration.from_pretrained(
@@ -75,7 +101,6 @@ def get_vectordb(text, images):
         metadata={"hnsw:space": "cosine"},
     )
     descs = []
-    print(descs)
     for image in images:
         try:
             descs.append(get_image_description(image)[0])
@@ -97,7 +122,9 @@ def get_vectordb(text, images):
         chunk_overlap=10,
     )
-    if len(text) > 0:
         docs = splitter.create_documents([text])
         doc_texts = [i.page_content for i in docs]
         text_collection.add(
@@ -106,7 +133,16 @@ def get_vectordb(text, images):
     return client
-def extract_data_from_pdfs(docs, session, include_images, progress=gr.Progress()):
     if len(docs) == 0:
         raise gr.Error("No documents to process")
     progress(0, "Extracting Images")
@@ -115,18 +151,20 @@ def extract_data_from_pdfs(docs, session, include_images, progress=gr.Progress()
     progress(0.25, "Extracting Text")
-    strategy = "hi_res"
-    model_name = "yolox"
-    all_elements = []
     all_text = ""
     images = []
     for doc in docs:
-        ocrmypdf.ocr(doc, "ocr.pdf", deskew=True, force_ocr=True)
-        text = extract_text("ocr.pdf")
-        all_text += clean_text(text) + "\n\n"
         if include_images == "Include Images":
-            images.extend(extract_images(["ocr.pdf"]))
     progress(
         0.6, "Generating image descriptions and inserting everything into vectorDB"
@@ -153,20 +191,28 @@ sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFuncti
 def conversation(
-    vectordb_client, msg, num_context, img_context, history, hf_token, model_path
 ):
     if hf_token.strip() != "" and model_path.strip() != "":
         llm = HuggingFaceEndpoint(
             repo_id=model_path,
-            temperature=0.4,
-            max_new_tokens=800,
             huggingfacehub_api_token=hf_token,
         )
     else:
         llm = HuggingFaceEndpoint(
             repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
-            temperature=0.4,
-            max_new_tokens=800,
             huggingfacehub_api_token=os.getenv("P_HF_TOKEN", "None"),
         )
@@ -273,6 +319,12 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
                     label="Include/ Exclude Images",
                     interactive=True,
                 )
             with gr.Row(equal_height=True, variant="panel") as row:
                 selected = gr.Dataframe(
@@ -327,6 +379,23 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
                         interactive=True,
                         value=2,
                     )
             with gr.Row():
                 with gr.Column():
                     ret_images = gr.Gallery("Similar Images", columns=1, rows=2)
@@ -361,7 +430,7 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
     )
     embed.click(
         extract_data_from_pdfs,
-        inputs=[doc_collection, session_states, include_images],
         outputs=[
             vectordb,
             session_states,
@@ -374,7 +443,17 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(text_size=sizes.text_md)) as demo:
     submit_btn.click(
         conversation,
-        [vectordb, msg, num_context, img_context, chatbot, hf_token, model_path],
         [chatbot, references, ret_images],
     )

 import os
 import pandas as pd
 import pymupdf
+from pypdf import PdfReader
 import spaces
 import torch
 from PIL import Image
 from chromadb.utils import embedding_functions
 from chromadb.utils.data_loaders import ImageLoader
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
 from gradio.themes.utils import sizes
 from langchain import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from utils import *
def result_to_text(result, as_text=False) -> "str | list[str]":
    """Flatten an OCR result into cleaned, per-page text.

    Every block on a page contributes a newline-plus-tab marker followed by
    each of its words, each word followed by a single space. The assembled
    page string is passed through ``clean_text`` and terminated with two
    newlines.

    Args:
        result: OCR output exposing ``pages``; each page exposes ``blocks``,
            each block ``lines``, each line ``words``, and each word a
            string ``value``.
        as_text: When true, return one string with the page texts joined by
            a newline; otherwise return the list of page texts.

    Returns:
        A single string if ``as_text`` is true, else a list with one string
        per page (empty list when the result has no pages).
    """
    full_doc = []
    for page in result.pages:
        # Collect fragments and join once instead of growing a string with +=.
        fragments = []
        for block in page.blocks:
            fragments.append("\n\t")
            fragments.extend(
                word.value + " " for line in block.lines for word in line.words
            )
        full_doc.append(clean_text("".join(fragments)) + "\n\n")
    return "\n".join(full_doc) if as_text else full_doc
# OCR pipeline shared by the whole module; it is constructed once at import
# time with pretrained weights for both the detection and recognition models.
# NOTE(review): assume_straight_pages=True presumably means rotated scans are
# not corrected - confirm against the docTR documentation.
ocr_model = ocr_predictor(
    det_arch="db_resnet50",
    reco_arch="crnn_mobilenet_v3_large",
    pretrained=True,
    assume_straight_pages=True,
)
 if torch.cuda.is_available():
     processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
     vision_model = LlavaNextForConditionalGeneration.from_pretrained(
         metadata={"hnsw:space": "cosine"},
     )
     descs = []
     for image in images:
         try:
             descs.append(get_image_description(image)[0])
         chunk_overlap=10,
     )
+    if len(text.replace(" ", "").replace("\n", "")) == 0:
+        gr.Error("No text found in documents")
+    else:
         docs = splitter.create_documents([text])
         doc_texts = [i.page_content for i in docs]
         text_collection.add(
     return client
def extract_only_text(reader):
    """Return the embedded text of every page of a PDF reader.

    Args:
        reader: Object exposing ``pages``; each page must provide an
            ``extract_text()`` method.

    Returns:
        The text of all pages in order, joined by newlines, with leading and
        trailing whitespace stripped. An empty string when there are no
        pages or no page yields text.
    """
    # Gather the text of every page; rebinding a single variable inside the
    # loop would keep only the last page. ``or ""`` tolerates an extractor
    # that yields None for a page without text.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts).strip()
+def extract_data_from_pdfs(
+    docs, session, include_images, do_ocr, progress=gr.Progress()
+):
     if len(docs) == 0:
         raise gr.Error("No documents to process")
     progress(0, "Extracting Images")
     progress(0.25, "Extracting Text")
     all_text = ""
     images = []
     for doc in docs:
+        if do_ocr == "Get Text With OCR":
+            pdf_doc = DocumentFile.from_pdf(doc)
+            result = ocr_model(pdf_doc)
+            all_text += result_to_text(result, as_text=True) + "\n\n"
+        else:
+            reader = PdfReader(doc)
+            all_text += extract_only_text(reader) + "\n\n"
         if include_images == "Include Images":
+            images.extend(extract_images([doc]))
     progress(
         0.6, "Generating image descriptions and inserting everything into vectorDB"
 def conversation(
+    vectordb_client,
+    msg,
+    num_context,
+    img_context,
+    history,
+    temperature,
+    max_new_tokens,
+    hf_token,
+    model_path,
 ):
     if hf_token.strip() != "" and model_path.strip() != "":
         llm = HuggingFaceEndpoint(
             repo_id=model_path,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
             huggingfacehub_api_token=hf_token,
         )
     else:
         llm = HuggingFaceEndpoint(
             repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
             huggingfacehub_api_token=os.getenv("P_HF_TOKEN", "None"),
         )
                     label="Include/ Exclude Images",
                     interactive=True,
                 )
+                do_ocr = gr.Radio(
+                    ["Get Text With OCR", "Get Available Text Only"],
+                    value="Get Text With OCR",
+                    label="OCR/ No OCR",
+                    interactive=True,
+                )
             with gr.Row(equal_height=True, variant="panel") as row:
                 selected = gr.Dataframe(
                         interactive=True,
                         value=2,
                     )
+                with gr.Row(variant="panel", equal_height=True):
+                    temp = gr.Slider(
+                        label="Temperature",
+                        minimum=0.1,
+                        maximum=1,
+                        step=0.1,
+                        interactive=True,
+                        value=0.4,
+                    )
+                    max_tokens = gr.Slider(
+                        label="Max Tokens",
+                        minimum=10,
+                        maximum=2000,
+                        step=10,
+                        interactive=True,
+                        value=500,
+                    )
             with gr.Row():
                 with gr.Column():
                     ret_images = gr.Gallery("Similar Images", columns=1, rows=2)
     )
     embed.click(
         extract_data_from_pdfs,
+        inputs=[doc_collection, session_states, include_images, do_ocr],
         outputs=[
             vectordb,
             session_states,
     submit_btn.click(
         conversation,
+        [
+            vectordb,
+            msg,
+            num_context,
+            img_context,
+            chatbot,
+            temp,
+            max_tokens,
+            hf_token,
+            model_path,
+        ],
         [chatbot, references, ret_images],
     )

requirements.txt CHANGED Viewed

@@ -7,8 +7,10 @@ pandas==2.2.2
 Pillow==10.3.0
 pymupdf==1.24.5
 sentence_transformers==3.0.1
-unstructured[all-docs]
 accelerate
 bitsandbytes
 easyocr
-ocrmypdf

 Pillow==10.3.0
 pymupdf==1.24.5
 sentence_transformers==3.0.1
 accelerate
 bitsandbytes
 easyocr
+ocrmypdf
+tf2onnx
+clean-text[gpl]
+python-doctr[torch]

utils.py CHANGED Viewed

@@ -27,19 +27,17 @@ def extract_pdfs(docs, doc_collection):
 def extract_images(docs):
     images = []
     for doc_path in docs:
-        doc = pymupdf.open(doc_path)  # open a document
-        for page_index in range(len(doc)):  # iterate over pdf pages
-            page = doc[page_index]  # get the page
             image_list = page.get_images()
-            for image_index, img in enumerate(
-                image_list, start=1
-            ):  # enumerate the image list
-                xref = img[0]  # get the XREF of the image
-                pix = pymupdf.Pixmap(doc, xref)  # create a Pixmap
-                if pix.n - pix.alpha > 3:  # CMYK: convert to RGB first
                     pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                 images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))

 def extract_images(docs):
     images = []
     for doc_path in docs:
+        doc = pymupdf.open(doc_path)
+        for page_index in range(len(doc)):
+            page = doc[page_index]
             image_list = page.get_images()
+            for _, img in enumerate(image_list, start=1):
+                xref = img[0]
+                pix = pymupdf.Pixmap(doc, xref)
+                if pix.n - pix.alpha > 3:
                     pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                 images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))