likhithv committed
Commit c523dd3
1 Parent(s): 9210930

new rag approach
.streamlit/config.toml DELETED
@@ -1,3 +0,0 @@
- [server]
- enableXsrfProtection = false
- enableCORS = false

Dockerfile CHANGED
@@ -1,8 +1,7 @@
-
  FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

  # Set the working directory in the container
- WORKDIR /DocQA
+ WORKDIR /app

  # Install system dependencies, including Python and utilities
  RUN apt-get update && apt-get install -y \
@@ -12,9 +11,6 @@ RUN apt-get update && apt-get install -y \
      poppler-utils \
      && apt-get clean && rm -rf /var/lib/apt/lists/*

-
-
-
  # Add a new user to avoid running as root
  RUN useradd -m -u 1000 user

@@ -28,16 +24,22 @@ WORKDIR $HOME/app

  # Copy the requirements.txt first to leverage Docker cache
  COPY --chown=user requirements.txt $HOME/app/
+ RUN pip install torch --extra-index-url https://download.pytorch.org/whl/cu124
  RUN pip install --no-cache-dir -r requirements.txt
- RUN pip uninstall --y faiss-cpu & pip install faiss-gpu
- RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124

  # Copy the rest of the application's code to the container
- COPY --chown=user . $HOME/app
+ COPY --chown=user src/app.py $HOME/app
+ COPY --chown=user src/classification.py $HOME/app
+ COPY --chown=user src/donut_inference.py $HOME/app
+ COPY --chown=user src/non_form_llama_parse.py $HOME/app
+ COPY --chown=user src/RAG.py $HOME/app
+ COPY --chown=user src/.env $HOME/app
+ COPY --chown=user images $HOME/app/images
+ COPY --chown=user Model $HOME/app/Model
+ COPY --chown=user best_resnet152_model.h5 $HOME/app

  # Expose the port the app runs on
  EXPOSE 8501

  # Set the entry point to run the application
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
-
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
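The cu124 `torch` wheel is now installed ahead of requirements.txt on the CUDA 12.4 base image. A minimal sanity check (a hypothetical `gpu_check.py`, assuming the container is started with the NVIDIA runtime, e.g. `docker run --gpus all ...`) to confirm that wheel actually sees the GPU:

```python
# gpu_check.py - hypothetical helper, not part of this commit.
import torch

if __name__ == "__main__":
    print("torch version:", torch.__version__)           # a +cu124 build if the extra index was used
    print("CUDA available:", torch.cuda.is_available())  # False usually means --gpus / NVIDIA runtime is missing
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))     # first visible device
```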
 
RAG.py CHANGED
@@ -1,4 +1,8 @@
- from ragatouille import RAGPretrainedModel
+ # from ragatouille import RAGPretrainedModel
+ from langchain_voyageai import VoyageAIEmbeddings
+ # from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_community.vectorstores import FAISS
  from langchain_groq import ChatGroq
  from langchain.chains import RetrievalQA
  from langchain.memory import ConversationBufferMemory
@@ -6,13 +10,17 @@ from langchain.prompts import PromptTemplate
  from dotenv import load_dotenv
  import os
  import streamlit as st
- import asyncio
+ # import asyncio

  load_dotenv()
  GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+ VOYAGE_EMBEDDINGS = os.getenv('VOYAGE_EMBEDDINGS')

  llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
- RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+ # RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+ embeddings = VoyageAIEmbeddings(
+     voyage_api_key=VOYAGE_EMBEDDINGS, model="voyage-large-2-instruct"
+ )
  system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
  Read the given context before answering questions and think step by step. If you can not answer a user question based on
  the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
@@ -30,14 +38,18 @@ memory = ConversationBufferMemory(input_key="question", memory_key="history")

  def rag(full_string):

-     RAG.index(
-         collection=[full_string],
-         index_name="vector_db",
-         max_document_length=512,
-         split_documents=True,
+     # RAG.index(
+     #     collection=[full_string],
+     #     index_name="vector_db",
+     #     max_document_length=512,
+     #     split_documents=True,

-     )
-     retriever = RAG.as_langchain_retriever(k=5)
+     # )
+     text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
+     texts = text_splitter.create_documents([full_string])
+     db = FAISS.from_documents(texts, embeddings)
+     retriever = db.as_retriever(search_kwargs={"k": 5})
+     # retriever = RAG.as_langchain_retriever(k=5)
      qa = RetrievalQA.from_chain_type(
          llm=llm,
          chain_type="stuff", # try other chains types as well. refine, map_reduce, map_rerank
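Taken together, the new `rag()` path replaces the ColBERT/RAGatouille index with Voyage AI embeddings, semantic chunking, and an in-memory FAISS store. A condensed sketch of that flow (the prompt and memory wiring from the unchanged part of RAG.py is omitted; it assumes `GROQ_API_KEY` and `VOYAGE_EMBEDDINGS` are present in `.env`):

```python
import os
from dotenv import load_dotenv
from langchain_voyageai import VoyageAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

load_dotenv()
embeddings = VoyageAIEmbeddings(
    voyage_api_key=os.getenv("VOYAGE_EMBEDDINGS"), model="voyage-large-2-instruct"
)
llm = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-70b-8192")

def answer(full_string: str, question: str) -> str:
    # Split on semantic breakpoints instead of a fixed chunk size.
    splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
    docs = splitter.create_documents([full_string])
    # Build a small in-memory FAISS index per document, as rag() does above.
    retriever = FAISS.from_documents(docs, embeddings).as_retriever(search_kwargs={"k": 5})
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return qa.invoke({"query": question})["result"]
```

Because the FAISS index lives only in process memory, each call to `rag()` re-embeds the parsed document; that keeps the design simple, at the cost of embedding latency proportional to document length.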
app.py CHANGED
@@ -135,7 +135,6 @@ Context:\n
          messages=context, # Pass conversation context directly
          model="llama3-70b-8192",
          temperature=0,
-         max_tokens=1024,
          top_p=1,
          stop=None,
          stream=True,
@@ -181,10 +180,10 @@ def upload():

      # Define the paths to your images
      image_paths = [
-         "cropped_1099-Div.jpg",
-         "cropped_1099-Int.jpg",
-         "cropped_w2.jpg",
-         "cropped_w3.jpg"
+         "images/cropped_1099-Div.jpg",
+         "images/cropped_1099-Int.jpg",
+         "images/cropped_w2.jpg",
+         "images/cropped_w3.jpg"
      ]

      # Define the captions for your images
@@ -197,17 +196,11 @@ def upload():

      st.markdown('''
      # Instructions:
-
      1. **Ensure all uploads are in PDF format**. This ensures compatibility and uniform processing across documents.
-
      2. **Submit forms in portrait orientation only**. Landscape formats are not supported and may result in processing errors.
-
      3. **Forms must have a minimum resolution of 1864x1440**. This is crucial for the clarity and legibility necessary for accurate parsing.
-
      4. **Multiple documents can be uploaded simultaneously**; however, the combined size of these documents should not exceed 10MB.
-
      5. **Donut model parses specific forms**: 1099-Div, 1099-Int, W2, and W3. Non-form documents are also processable.
-
      6. **Upload only Forms at a time or Non-forms at a time**: we don't accept both Forms and Non-forms simultaneously.
      ''')
      st.subheader("Try it out")
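With `max_tokens=1024` removed, the completion call falls back to the model's default output limit. For reference, a minimal sketch of the streaming Groq call these parameters belong to (the client setup and Streamlit placeholder are illustrative, not copied from app.py):

```python
import os
from groq import Groq
import streamlit as st

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def stream_answer(context):
    # Same parameters as the call in app.py: deterministic sampling, streamed chunks, no explicit token cap.
    stream = client.chat.completions.create(
        messages=context,
        model="llama3-70b-8192",
        temperature=0,
        top_p=1,
        stop=None,
        stream=True,
    )
    placeholder, answer = st.empty(), ""
    for chunk in stream:
        answer += chunk.choices[0].delta.content or ""  # final chunk may carry None
        placeholder.markdown(answer)
    return answer
```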
classification.py CHANGED
@@ -1,6 +1,7 @@
  import numpy as np
  import time
  from tensorflow.keras.preprocessing import image
+ import streamlit as st
  # from tensorflow.keras.preprocessing.image import ImageDataGenerator
  import tensorflow as tf
  gpus = tf.config.experimental.list_physical_devices('GPU')
@@ -11,7 +12,6 @@ if gpus:
      except RuntimeError as e:
          # Memory growth must be set before GPUs have been initialized
          print(e)
- import streamlit as st
  # with tf.device('/cpu:0'):
  # Load the saved model
  model = tf.keras.models.load_model('best_resnet152_model.h5')
@@ -39,46 +39,4 @@ def predict(pil_img):
      predicted_class_name = class_names[predicted_class_index]
      print("Predicted class:", predicted_class_name)
      print("Execution time: ", end_time - start_time)
-     return predicted_class_name
- # import numpy as np
- # import time
- # from PIL import Image # Import for PIL image handling
- # from torchvision import transforms # Import for image preprocessing
-
- # import torch
- # import torch.nn as nn # Import for PyTorch neural networks
- # import streamlit as st
-
- # # Load the PyTorch model (assuming it's saved in PyTorch format)
- # model = torch.load('./best_resnet152_model.pt') # Replace with your model filename
-
- # # Define class names dictionary
- # class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
-
-
- # # Define a function for prediction using PyTorch
- # @st.cache_resource
- # def predict(pil_img):
- #     # Preprocess the image
- #     preprocess = transforms.Compose([
- #         transforms.ToTensor(), # Convert to PyTorch tensor
- #         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # Normalize based on ImageNet statistics
- #     ])
- #     img_tensor = preprocess(pil_img)
- #     img_tensor.unsqueeze_(0) # Add batch dimension
-
- #     # Predict with PyTorch
- #     start_time = time.time()
- #     with torch.no_grad(): # Disable gradient calculation for prediction
- #         predictions = model(img_tensor)
- #     end_time = time.time()
-
- #     # Get the predicted class
- #     predicted_class_index = torch.argmax(predictions, dim=1).item()
- #     predicted_class_name = class_names[predicted_class_index]
-
- #     # Print results (optional for debugging)
- #     print("Predicted class:", predicted_class_name)
- #     print("Execution time: ", end_time - start_time)
-
- #     return predicted_class_name
+     return predicted_class_name
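Only the tail of `predict()` is visible in this diff. A sketch of how the Keras classifier is typically driven end to end (the 224x224 input size and 1/255 scaling are assumptions; the class map comes from the removed commented-out PyTorch variant):

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image

model = tf.keras.models.load_model("best_resnet152_model.h5")
class_names = {0: "1099_Div", 1: "1099_Int", 2: "Non_Form", 3: "w_2", 4: "w_3"}

def predict(pil_img):
    # Assumed preprocessing; the committed predict() keeps its own (unchanged) resize/normalisation.
    arr = image.img_to_array(pil_img.resize((224, 224))) / 255.0
    preds = model.predict(np.expand_dims(arr, axis=0))
    predicted_class_index = int(np.argmax(preds, axis=1)[0])
    return class_names[predicted_class_index]
```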
 
cropped_1099-Div.jpg DELETED
Binary file (721 kB)
 
cropped_1099-Int.jpg DELETED
Binary file (279 kB)
 
cropped_w2.jpg DELETED
Binary file (198 kB)
 
cropped_w3.jpg DELETED
Binary file (292 kB)
 
donut_inference.py CHANGED
@@ -1,33 +1,27 @@
  import torch, re
  from PIL import Image
  from transformers import DonutProcessor, VisionEncoderDecoderModel
- import streamlit as st
- from dotenv import load_dotenv
- import os
- import time
- load_dotenv()
+
  # image_path = '/app/Datasplit/test/1099_Div/filled_form_43.jpg'
  # image = Image.open(image_path)
  # image = image.resize((1864, 1440))

  device = "cuda" if torch.cuda.is_available() else "cpu"
- # device = "cpu"
  # Load the processor from the local directory
- processor = DonutProcessor.from_pretrained("Henge-navuuu/donut-base-finetuned-forms-v1")
- # processor.to(device)
+ processor = DonutProcessor.from_pretrained("Model")
+
  # Load the model from the local directory
- model = VisionEncoderDecoderModel.from_pretrained("Henge-navuuu/donut-base-finetuned-forms-v1")
+ model = VisionEncoderDecoderModel.from_pretrained("Model")
  model.to(device)

- @st.cache_resource
  def inference(image):
      pixel_values = processor(image, return_tensors="pt").pixel_values
      task_prompt = "<s>"
      decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

-     # device = "cuda" if torch.cuda.is_available() else "cpu"
-     # model.to(device)
-     start_time = time.time()
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model.to(device)
+
      outputs = model.generate(pixel_values.to(device),
          decoder_input_ids=decoder_input_ids.to(device),
          max_length=model.decoder.config.max_position_embeddings,
@@ -39,12 +33,11 @@ def inference(image):
          bad_words_ids=[[processor.tokenizer.unk_token_id]],
          return_dict_in_generate=True,
          output_scores=True,)
-     end_time = time.time()
+
      sequence = processor.batch_decode(outputs.sequences)[0]
      sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
      sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
-     print(processor.token2json(sequence))
-     print(f"Donut Inference time {end_time-start_time}")
+     # print(processor.token2json(sequence))
      return processor.token2json(sequence)

  # data = inference(image)
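With the processor and weights now loaded from the local `Model` directory baked into the image, a small usage sketch (the input file name is hypothetical; the 1864x1440 resize mirrors the commented example above and the upload instructions in app.py):

```python
from PIL import Image
from donut_inference import inference

page = Image.open("filled_form.jpg").convert("RGB").resize((1864, 1440))  # hypothetical form page
fields = inference(page)  # parsed form fields via processor.token2json
print(fields)
```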
non_form_llama_parse.py CHANGED
@@ -2,6 +2,7 @@ from llama_parse import LlamaParse
  from dotenv import load_dotenv
  import os
  import streamlit as st
+
  load_dotenv()
  LLAMA_PARSE = os.getenv('LLAMA_PARSE')

@@ -12,6 +13,7 @@ parser = LlamaParse(
      verbose=True,
      language="en" # Optionally you can define a language, default=en
  )
+
  @st.cache_data
  def extract_text(pdf_path):
      documents = parser.load_data(pdf_path)