rolwinpinto committed
Commit fb64d37 · verified · 1 Parent(s): 0bfc924

Update app.py

Files changed (1)
  app.py  +33 -38
app.py CHANGED
@@ -3,10 +3,9 @@ import torch
 import PyPDF2
 from io import BytesIO
 from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
+from transformers import BlipProcessor, BlipForQuestionAnswering
 import streamlit as st
-
-from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, Document
+from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
 from llama_index.embeddings.fastembed import FastEmbedEmbedding
 from llama_index.llms.gemini import Gemini
 
@@ -14,44 +13,40 @@ from llama_index.llms.gemini import Gemini
 Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
 Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.5, model_name="models/gemini-pro")
 
-def write_to_file(content, filename="./files/uploaded_file"):
+# Global variables to avoid reloading models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+blip_vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+blip_vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+def write_to_file(content, filename):
     os.makedirs(os.path.dirname(filename), exist_ok=True)
     with open(filename, "wb") as f:
         f.write(content)
 
-def process_image(image_bytes):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
-
-    image = Image.open(BytesIO(image_bytes))
-    inputs = processor(images=image, return_tensors="pt").to(device)
-
-    with torch.no_grad():
-        caption = model.generate(**inputs, max_length=50)
-        caption_text = processor.decode(caption[0], skip_special_tokens=True)
-
-    return caption_text, image
-
 def answer_question_about_image(image, question):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
-    inputs = processor(image, question, return_tensors="pt").to(device)
+    inputs = blip_vqa_processor(image, question, return_tensors="pt").to(device)
 
     with torch.no_grad():
-        out = model.generate(**inputs)
-        answer = processor.decode(out[0], skip_special_tokens=True)
+        out = blip_vqa_model.generate(**inputs)
+        answer = blip_vqa_processor.decode(out[0], skip_special_tokens=True)
 
     return answer
 
-def extract_text_from_pdf(pdf_file):
+def extract_text_and_images_from_pdf(pdf_file):
     pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file.getvalue()))
     text = ""
+    images = []
+
     for page in pdf_reader.pages:
         text += page.extract_text()
-    return text
+        x_objects = page.get('/Resources').get('/XObject')
+        if x_objects:
+            for obj in x_objects:
+                if x_objects[obj]['/Subtype'] == '/Image':
+                    img_data = x_objects[obj]._data
+                    images.append(img_data)
+
+    return text, images
 
 def ingest_documents():
     reader = SimpleDirectoryReader("./files/")
@@ -86,9 +81,9 @@ def generate_summary(index, document_text, query, target_language):
 # Streamlit app
 def main():
     st.title("Multimodal and Multilingual Document Analyzer")
-    st.write("Upload a document (PDF, text, or image), ask questions in your preferred language, and get detailed analysis!")
+    st.write("Upload a document (PDF, or image), ask questions in your preferred language, and get detailed analysis!")
 
-    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "jpg", "png"])
+    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "jpg", "png"])
 
     languages = {
         'English': 'en',
@@ -107,15 +102,15 @@ def main():
 
     try:
         if file_type == "application/pdf":
-            document_text = extract_text_from_pdf(uploaded_file)
+            document_text, images = extract_text_and_images_from_pdf(uploaded_file)
             write_to_file(uploaded_file.getvalue(), "./files/uploaded.pdf")
-        elif file_type == "text/plain":
-            document_text = uploaded_file.getvalue().decode("utf-8")
-            write_to_file(uploaded_file.getvalue(), "./files/uploaded.txt")
+            for img_data in images:
+                image = Image.open(BytesIO(img_data))
+                st.image(image, use_column_width=True)
         elif file_type in ["image/jpeg", "image/png"]:
-            image_caption, image = process_image(uploaded_file.getvalue())
-            document_text = f"Image caption: {image_caption}"
-            st.image(image, caption=image_caption, use_column_width=True)
+            image = Image.open(BytesIO(uploaded_file.getvalue()))
+            document_text = ""
+            st.image(image, use_column_width=True)
             write_to_file(uploaded_file.getvalue(), "./files/uploaded_image")
         else:
             st.error("Unsupported file type")
@@ -130,7 +125,7 @@ def main():
         if file_type in ["image/jpeg", "image/png"]:
             answer = answer_question_about_image(image, query)
             st.write(f"**Direct Answer:** {answer}")
-            summary = generate_summary(index, f"Image caption: {image_caption}\nQuestion: {query}\nAnswer: {answer}", query, target_language)
+            summary = generate_summary(index, f"Image query: {query}\nAnswer: {answer}", query, target_language)
         else:
             summary = generate_summary(index, document_text, query, target_language)
 
@@ -144,4 +139,4 @@ def main():
         st.write("Please try uploading the file again or try a different file.")
 
 if __name__ == "__main__":
-    main()
+    main()
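A note on the commit's module-level model loading: Streamlit re-executes the whole script on every widget interaction, so globals created at import time are rebuilt on each rerun. A minimal sketch of the usual remedy, assuming a Streamlit version that provides st.cache_resource (the helper name load_blip_vqa is illustrative, not part of this commit):

import torch
import streamlit as st
from transformers import BlipProcessor, BlipForQuestionAnswering

@st.cache_resource  # keep one shared instance per process across reruns
def load_blip_vqa():
    # Same checkpoint as the commit; only the caching wrapper is new.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
    return processor, model, device

blip_vqa_processor, blip_vqa_model, device = load_blip_vqa()

With the cached loader in place, answer_question_about_image can stay exactly as written in the diff.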
 
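One caveat on the new extract_text_and_images_from_pdf: page.get('/Resources') returns None for pages without a /Resources entry, so the chained .get('/XObject') raises AttributeError there, and ._data is a private attribute holding still-encoded stream bytes that Image.open may not be able to decode. Newer PyPDF2/pypdf releases expose decoded page images through the public page.images accessor; a hedged sketch assuming such a version is installed:

from io import BytesIO
import PyPDF2

def extract_text_and_images(pdf_bytes):
    # Assumes a PyPDF2/pypdf version with the public page.images API,
    # which yields already-decoded image data, instead of walking
    # /Resources -> /XObject by hand.
    reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
    text, images = "", []
    for page in reader.pages:
        text += page.extract_text() or ""
        for img in page.images:
            images.append(img.data)  # decoded bytes, ready for PIL
    return text, images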
 
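Finally, a quick way to sanity-check the refactored VQA path outside Streamlit (sample.jpg is a placeholder path, not a file in this repo):

from PIL import Image

# Illustrative smoke test for answer_question_about_image.
# .convert("RGB") is a defensive normalization for palette/RGBA inputs.
image = Image.open("sample.jpg").convert("RGB")
print(answer_question_about_image(image, "What is in the picture?"))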