likhithv committed
Commit c523dd3
1 Parent(s): 9210930

new rag approach
.streamlit/config.toml DELETED
@@ -1,3 +0,0 @@
- [server]
- enableXsrfProtection = false
- enableCORS = false

Dockerfile CHANGED
@@ -1,8 +1,7 @@
-
  FROM nvidia/cuda:12.4.1-devel-ubuntu22.04

  # Set the working directory in the container
- WORKDIR /DocQA
+ WORKDIR /app

  # Install system dependencies, including Python and utilities
  RUN apt-get update && apt-get install -y \
@@ -12,9 +11,6 @@ RUN apt-get update && apt-get install -y \
      poppler-utils \
      && apt-get clean && rm -rf /var/lib/apt/lists/*

-
-
-
  # Add a new user to avoid running as root
  RUN useradd -m -u 1000 user

@@ -28,16 +24,22 @@ WORKDIR $HOME/app

  # Copy the requirements.txt first to leverage Docker cache
  COPY --chown=user requirements.txt $HOME/app/
+ RUN pip install torch --extra-index-url https://download.pytorch.org/whl/cu124
  RUN pip install --no-cache-dir -r requirements.txt
- RUN pip uninstall --y faiss-cpu & pip install faiss-gpu
- RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu124

  # Copy the rest of the application's code to the container
- COPY --chown=user . $HOME/app
+ COPY --chown=user src/app.py $HOME/app
+ COPY --chown=user src/classification.py $HOME/app
+ COPY --chown=user src/donut_inference.py $HOME/app
+ COPY --chown=user src/non_form_llama_parse.py $HOME/app
+ COPY --chown=user src/RAG.py $HOME/app
+ COPY --chown=user src/.env $HOME/app
+ COPY --chown=user images $HOME/app/images
+ COPY --chown=user Model $HOME/app/Model
+ COPY --chown=user best_resnet152_model.h5 $HOME/app

  # Expose the port the app runs on
  EXPOSE 8501

  # Set the entry point to run the application
- ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
-
+ ENTRYPOINT ["streamlit", "run", "app.py", "--server.enableXsrfProtection", "false"]
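The cu124 `torch` wheel is now installed ahead of requirements.txt on the CUDA 12.4 base image. A minimal sanity check (a hypothetical `gpu_check.py`, assuming the container is started with the NVIDIA runtime, e.g. `docker run --gpus all ...`) to confirm that wheel actually sees the GPU:

```python
# gpu_check.py - hypothetical helper, not part of this commit.
import torch

if __name__ == "__main__":
    print("torch version:", torch.__version__)           # a +cu124 build if the extra index was used
    print("CUDA available:", torch.cuda.is_available())  # False usually means --gpus / NVIDIA runtime is missing
    if torch.cuda.is_available():
        print("GPU:", torch.cuda.get_device_name(0))     # first visible device
```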
 
RAG.py CHANGED
@@ -1,4 +1,8 @@
- from ragatouille import RAGPretrainedModel
+ # from ragatouille import RAGPretrainedModel
+ from langchain_voyageai import VoyageAIEmbeddings
+ # from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain_experimental.text_splitter import SemanticChunker
+ from langchain_community.vectorstores import FAISS
  from langchain_groq import ChatGroq
  from langchain.chains import RetrievalQA
  from langchain.memory import ConversationBufferMemory
@@ -6,13 +10,17 @@ from langchain.prompts import PromptTemplate
  from dotenv import load_dotenv
  import os
  import streamlit as st
- import asyncio
+ # import asyncio

  load_dotenv()
  GROQ_API_KEY = os.getenv('GROQ_API_KEY')
+ VOYAGE_EMBEDDINGS = os.getenv('VOYAGE_EMBEDDINGS')

  llm = ChatGroq(temperature=0, groq_api_key=GROQ_API_KEY, model_name="llama3-70b-8192")
- RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+ # RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
+ embeddings = VoyageAIEmbeddings(
+     voyage_api_key=VOYAGE_EMBEDDINGS, model="voyage-large-2-instruct"
+ )
  system_prompt = """You are a helpful assistant, you will use the provided context to answer user questions.
  Read the given context before answering questions and think step by step. If you can not answer a user question based on
  the provided context, inform the user. Do not use any other information for answering user. Provide a detailed answer to the question."""
@@ -30,14 +38,18 @@ memory = ConversationBufferMemory(input_key="question", memory_key="history")

  def rag(full_string):

-     RAG.index(
-         collection=[full_string],
-         index_name="vector_db",
-         max_document_length=512,
-         split_documents=True,
+     # RAG.index(
+     #     collection=[full_string],
+     #     index_name="vector_db",
+     #     max_document_length=512,
+     #     split_documents=True,

-     )
-     retriever = RAG.as_langchain_retriever(k=5)
+     # )
+     text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
+     texts = text_splitter.create_documents([full_string])
+     db = FAISS.from_documents(texts, embeddings)
+     retriever = db.as_retriever(search_kwargs={"k": 5})
+     # retriever = RAG.as_langchain_retriever(k=5)
      qa = RetrievalQA.from_chain_type(
          llm=llm,
          chain_type="stuff", # try other chains types as well. refine, map_reduce, map_rerank
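Taken together, the new `rag()` path replaces the ColBERT/RAGatouille index with Voyage AI embeddings, semantic chunking, and an in-memory FAISS store. A condensed sketch of that flow (the prompt and memory wiring from the unchanged part of RAG.py is omitted; it assumes `GROQ_API_KEY` and `VOYAGE_EMBEDDINGS` are present in `.env`):

```python
import os
from dotenv import load_dotenv
from langchain_voyageai import VoyageAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

load_dotenv()
embeddings = VoyageAIEmbeddings(
    voyage_api_key=os.getenv("VOYAGE_EMBEDDINGS"), model="voyage-large-2-instruct"
)
llm = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-70b-8192")

def answer(full_string: str, question: str) -> str:
    # Split on semantic breakpoints instead of a fixed chunk size.
    splitter = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
    docs = splitter.create_documents([full_string])
    # Build a small in-memory FAISS index per document, as rag() does above.
    retriever = FAISS.from_documents(docs, embeddings).as_retriever(search_kwargs={"k": 5})
    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    return qa.invoke({"query": question})["result"]
```

Because the FAISS index lives only in process memory, each call to `rag()` re-embeds the parsed document; that keeps the design simple, at the cost of embedding latency proportional to document length.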
app.py CHANGED
@@ -135,7 +135,6 @@ Context:\n
          messages=context, # Pass conversation context directly
          model="llama3-70b-8192",
          temperature=0,
-         max_tokens=1024,
          top_p=1,
          stop=None,
          stream=True,
@@ -181,10 +180,10 @@ def upload():

      # Define the paths to your images
      image_paths = [
-         "cropped_1099-Div.jpg",
-         "cropped_1099-Int.jpg",
-         "cropped_w2.jpg",
-         "cropped_w3.jpg"
+         "images/cropped_1099-Div.jpg",
+         "images/cropped_1099-Int.jpg",
+         "images/cropped_w2.jpg",
+         "images/cropped_w3.jpg"
      ]

      # Define the captions for your images
@@ -197,17 +196,11 @@ def upload():

      st.markdown('''
      # Instructions:
-
      1. **Ensure all uploads are in PDF format**. This ensures compatibility and uniform processing across documents.
-
      2. **Submit forms in portrait orientation only**. Landscape formats are not supported and may result in processing errors.
-
      3. **Forms must have a minimum resolution of 1864x1440**. This is crucial for the clarity and legibility necessary for accurate parsing.
-
      4. **Multiple documents can be uploaded simultaneously**; however, the combined size of these documents should not exceed 10MB.
-
      5. **Donut model parses specific forms**: 1099-Div, 1099-Int, W2, and W3. Non-form documents are also processable.
-
      6. **Upload only Forms at a time or Non-forms at a time**: we don't accept both Forms and Non-forms simultaneously.
      ''')
      st.subheader("Try it out")
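With `max_tokens=1024` removed, the completion call falls back to the model's default output limit. For reference, a minimal sketch of the streaming Groq call these parameters belong to (the client setup and Streamlit placeholder are illustrative, not copied from app.py):

```python
import os
from groq import Groq
import streamlit as st

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

def stream_answer(context):
    # Same parameters as the call in app.py: deterministic sampling, streamed chunks, no explicit token cap.
    stream = client.chat.completions.create(
        messages=context,
        model="llama3-70b-8192",
        temperature=0,
        top_p=1,
        stop=None,
        stream=True,
    )
    placeholder, answer = st.empty(), ""
    for chunk in stream:
        answer += chunk.choices[0].delta.content or ""  # final chunk may carry None
        placeholder.markdown(answer)
    return answer
```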
classification.py CHANGED
@@ -1,6 +1,7 @@
  import numpy as np
  import time
  from tensorflow.keras.preprocessing import image
+ import streamlit as st
  # from tensorflow.keras.preprocessing.image import ImageDataGenerator
  import tensorflow as tf
  gpus = tf.config.experimental.list_physical_devices('GPU')
@@ -11,7 +12,6 @@ if gpus:
      except RuntimeError as e:
          # Memory growth must be set before GPUs have been initialized
          print(e)
- import streamlit as st
  # with tf.device('/cpu:0'):
  # Load the saved model
  model = tf.keras.models.load_model('best_resnet152_model.h5')
@@ -39,46 +39,4 @@ def predict(pil_img):
      predicted_class_name = class_names[predicted_class_index]
      print("Predicted class:", predicted_class_name)
      print("Execution time: ", end_time - start_time)
-     return predicted_class_name
- # import numpy as np
- # import time
- # from PIL import Image # Import for PIL image handling
- # from torchvision import transforms # Import for image preprocessing
-
- # import torch
- # import torch.nn as nn # Import for PyTorch neural networks
- # import streamlit as st
-
- # # Load the PyTorch model (assuming it's saved in PyTorch format)
- # model = torch.load('./best_resnet152_model.pt') # Replace with your model filename
-
- # # Define class names dictionary
- # class_names = {0: '1099_Div', 1: '1099_Int', 2: 'Non_Form', 3: 'w_2', 4: 'w_3'}
-
-
- # # Define a function for prediction using PyTorch
- # @st.cache_resource
- # def predict(pil_img):
- #     # Preprocess the image
- #     preprocess = transforms.Compose([
- #         transforms.ToTensor(), # Convert to PyTorch tensor
- #         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # Normalize based on ImageNet statistics
- #     ])
- #     img_tensor = preprocess(pil_img)
- #     img_tensor.unsqueeze_(0) # Add batch dimension
-
- #     # Predict with PyTorch
- #     start_time = time.time()
- #     with torch.no_grad(): # Disable gradient calculation for prediction
- #         predictions = model(img_tensor)
- #     end_time = time.time()
-
- #     # Get the predicted class
- #     predicted_class_index = torch.argmax(predictions, dim=1).item()
- #     predicted_class_name = class_names[predicted_class_index]
-
- #     # Print results (optional for debugging)
- #     print("Predicted class:", predicted_class_name)
- #     print("Execution time: ", end_time - start_time)
-
- #     return predicted_class_name
+     return predicted_class_name
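Only the tail of `predict()` is visible in this diff. A sketch of how the Keras classifier is typically driven end to end (the 224x224 input size and 1/255 scaling are assumptions; the class map comes from the removed commented-out PyTorch variant):

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing import image

model = tf.keras.models.load_model("best_resnet152_model.h5")
class_names = {0: "1099_Div", 1: "1099_Int", 2: "Non_Form", 3: "w_2", 4: "w_3"}

def predict(pil_img):
    # Assumed preprocessing; the committed predict() keeps its own (unchanged) resize/normalisation.
    arr = image.img_to_array(pil_img.resize((224, 224))) / 255.0
    preds = model.predict(np.expand_dims(arr, axis=0))
    predicted_class_index = int(np.argmax(preds, axis=1)[0])
    return class_names[predicted_class_index]
```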
 
cropped_1099-Div.jpg DELETED
Binary file (721 kB)
 
cropped_1099-Int.jpg DELETED
Binary file (279 kB)
 
cropped_w2.jpg DELETED
Binary file (198 kB)
 
cropped_w3.jpg DELETED
Binary file (292 kB)
 
donut_inference.py CHANGED
@@ -1,33 +1,27 @@
  import torch, re
  from PIL import Image
  from transformers import DonutProcessor, VisionEncoderDecoderModel
- import streamlit as st
- from dotenv import load_dotenv
- import os
- import time
- load_dotenv()
+
  # image_path = '/app/Datasplit/test/1099_Div/filled_form_43.jpg'
  # image = Image.open(image_path)
  # image = image.resize((1864, 1440))

  device = "cuda" if torch.cuda.is_available() else "cpu"
- # device = "cpu"
  # Load the processor from the local directory
- processor = DonutProcessor.from_pretrained("Henge-navuuu/donut-base-finetuned-forms-v1")
- # processor.to(device)
+ processor = DonutProcessor.from_pretrained("Model")
+
  # Load the model from the local directory
- model = VisionEncoderDecoderModel.from_pretrained("Henge-navuuu/donut-base-finetuned-forms-v1")
+ model = VisionEncoderDecoderModel.from_pretrained("Model")
  model.to(device)

- @st.cache_resource
  def inference(image):
      pixel_values = processor(image, return_tensors="pt").pixel_values
      task_prompt = "<s>"
      decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

-     # device = "cuda" if torch.cuda.is_available() else "cpu"
-     # model.to(device)
-     start_time = time.time()
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     model.to(device)
+
      outputs = model.generate(pixel_values.to(device),
          decoder_input_ids=decoder_input_ids.to(device),
          max_length=model.decoder.config.max_position_embeddings,
@@ -39,12 +33,11 @@ def inference(image):
          bad_words_ids=[[processor.tokenizer.unk_token_id]],
          return_dict_in_generate=True,
          output_scores=True,)
-     end_time = time.time()
+
      sequence = processor.batch_decode(outputs.sequences)[0]
      sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
      sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
-     print(processor.token2json(sequence))
-     print(f"Donut Inference time {end_time-start_time}")
+     # print(processor.token2json(sequence))
      return processor.token2json(sequence)

  # data = inference(image)
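With the processor and weights now loaded from the local `Model` directory baked into the image, a small usage sketch (the input file name is hypothetical; the 1864x1440 resize mirrors the commented example above and the upload instructions in app.py):

```python
from PIL import Image
from donut_inference import inference

page = Image.open("filled_form.jpg").convert("RGB").resize((1864, 1440))  # hypothetical form page
fields = inference(page)  # parsed form fields via processor.token2json
print(fields)
```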
non_form_llama_parse.py CHANGED
@@ -2,6 +2,7 @@ from llama_parse import LlamaParse
  from dotenv import load_dotenv
  import os
  import streamlit as st
+
  load_dotenv()
  LLAMA_PARSE = os.getenv('LLAMA_PARSE')

@@ -12,6 +13,7 @@ parser = LlamaParse(
      verbose=True,
      language="en" # Optionally you can define a language, default=en
  )
+
  @st.cache_data
  def extract_text(pdf_path):
      documents = parser.load_data(pdf_path)