Spaces:

rolwinpinto
/

Lingualizer

Sleeping

App Files Files Community

rolwinpinto committed on Aug 19, 2024

Commit

fb64d37

verified ·

1 Parent(s): 0bfc924

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -38

app.py CHANGED Viewed

@@ -3,10 +3,9 @@ import torch
 import PyPDF2
 from io import BytesIO
 from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
 import streamlit as st
-from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, Document
 from llama_index.embeddings.fastembed import FastEmbedEmbedding
 from llama_index.llms.gemini import Gemini
@@ -14,44 +13,40 @@ from llama_index.llms.gemini import Gemini
 Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
 Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.5, model_name="models/gemini-pro")
def write_to_file(content, filename="./files/uploaded_file"):
    """Write raw bytes to ``filename``, creating its parent directory if needed.

    Args:
        content: Bytes to store; the file is opened in binary write mode.
        filename: Destination path. May be a bare file name with no
            directory component.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
def process_image(image_bytes):
    """Generate a caption for an image with the BLIP captioning checkpoint.

    The processor and model are loaded from
    "Salesforce/blip-image-captioning-base" on every call and run on CUDA
    when it is available, otherwise on the CPU.

    Args:
        image_bytes: Encoded image bytes; they are opened with PIL.

    Returns:
        A ``(caption_text, image)`` tuple: the decoded caption and the
        opened PIL image.
    """
    run_on = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = "Salesforce/blip-image-captioning-base"
    caption_processor = BlipProcessor.from_pretrained(checkpoint)
    caption_model = BlipForConditionalGeneration.from_pretrained(checkpoint).to(run_on)
    image = Image.open(BytesIO(image_bytes))
    model_inputs = caption_processor(images=image, return_tensors="pt").to(run_on)
    # Inference only: no gradients are needed while generating token ids.
    with torch.no_grad():
        token_ids = caption_model.generate(**model_inputs, max_length=50)
    caption_text = caption_processor.decode(token_ids[0], skip_special_tokens=True)
    return caption_text, image
 def answer_question_about_image(image, question):
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-    model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-    inputs = processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = model.generate(**inputs)
-    answer = processor.decode(out[0], skip_special_tokens=True)
     return answer
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of an uploaded PDF, concatenated.

    Args:
        pdf_file: Object whose ``getvalue()`` returns the PDF bytes.

    Returns:
        The page texts joined in page order with no separator.
    """
    reader = PyPDF2.PdfReader(BytesIO(pdf_file.getvalue()))
    return "".join(page.extract_text() for page in reader.pages)
 def ingest_documents():
     reader = SimpleDirectoryReader("./files/")
@@ -86,9 +81,9 @@ def generate_summary(index, document_text, query, target_language):
 # Streamlit app
 def main():
     st.title("Multimodal and Multilingual Document Analyzer")
-    st.write("Upload a document (PDF, text, or image), ask questions in your preferred language, and get detailed analysis!")
-    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt", "jpg", "png"])
     languages = {
         'English': 'en',
@@ -107,15 +102,15 @@ def main():
         try:
             if file_type == "application/pdf":
-                document_text = extract_text_from_pdf(uploaded_file)
                 write_to_file(uploaded_file.getvalue(), "./files/uploaded.pdf")
-            elif file_type == "text/plain":
-                document_text = uploaded_file.getvalue().decode("utf-8")
-                write_to_file(uploaded_file.getvalue(), "./files/uploaded.txt")
             elif file_type in ["image/jpeg", "image/png"]:
-                image_caption, image = process_image(uploaded_file.getvalue())
-                document_text = f"Image caption: {image_caption}"
-                st.image(image, caption=image_caption, use_column_width=True)
                 write_to_file(uploaded_file.getvalue(), "./files/uploaded_image")
             else:
                 st.error("Unsupported file type")
@@ -130,7 +125,7 @@ def main():
                     if file_type in ["image/jpeg", "image/png"]:
                         answer = answer_question_about_image(image, query)
                         st.write(f"**Direct Answer:** {answer}")
-                        summary = generate_summary(index, f"Image caption: {image_caption}\nQuestion: {query}\nAnswer: {answer}", query, target_language)
                     else:
                         summary = generate_summary(index, document_text, query, target_language)
@@ -144,4 +139,4 @@ def main():
             st.write("Please try uploading the file again or try a different file.")
 if __name__ == "__main__":
-    main()

 import PyPDF2
 from io import BytesIO
 from PIL import Image
+from transformers import BlipProcessor, BlipForQuestionAnswering
 import streamlit as st
+from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
 from llama_index.embeddings.fastembed import FastEmbedEmbedding
 from llama_index.llms.gemini import Gemini
 Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")
 Settings.llm = Gemini(api_key=os.getenv("GOOGLE_API_KEY"), temperature=0.5, model_name="models/gemini-pro")
+# Global variables to avoid reloading models
+device = "cuda" if torch.cuda.is_available() else "cpu"
+blip_vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+blip_vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
def write_to_file(content, filename):
    """Write raw bytes to ``filename``, creating its parent directory if needed.

    Args:
        content: Bytes to store; the file is opened in binary write mode.
        filename: Destination path. May be a bare file name with no
            directory component.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(filename, "wb") as f:
        f.write(content)
 def answer_question_about_image(image, question):
+    inputs = blip_vqa_processor(image, question, return_tensors="pt").to(device)
     with torch.no_grad():
+        out = blip_vqa_model.generate(**inputs)
+    answer = blip_vqa_processor.decode(out[0], skip_special_tokens=True)
     return answer
def extract_text_and_images_from_pdf(pdf_file):
    """Extract the text and the embedded image streams from an uploaded PDF.

    Args:
        pdf_file: Object whose ``getvalue()`` returns the PDF bytes.

    Returns:
        A ``(text, images)`` tuple: the text of all pages joined in page
        order, and a list with the stream data of every XObject whose
        ``/Subtype`` is ``/Image``.
    """
    pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file.getvalue()))
    page_texts = []
    images = []
    for page in pdf_reader.pages:
        # Treat a page that yields no text as an empty string.
        page_texts.append(page.extract_text() or "")

        # A page without /Resources or /XObject contributes no images; the
        # chained .get() calls used previously failed on None in that case.
        resources = _resolve_pdf_object(page.get('/Resources'))
        if not isinstance(resources, dict):
            continue
        x_objects = _resolve_pdf_object(resources.get('/XObject'))
        if not isinstance(x_objects, dict):
            continue
        for name in x_objects:
            candidate = _resolve_pdf_object(x_objects[name])
            if not isinstance(candidate, dict):
                continue
            if _resolve_pdf_object(candidate.get('/Subtype')) == '/Image':
                # NOTE(review): _data is a private PyPDF2 attribute and
                # presumably holds the stream bytes as stored in the file;
                # confirm that every image filter in use yields bytes that
                # PIL can open, since the caller passes them to Image.open.
                images.append(candidate._data)
    return "".join(page_texts), images


def _resolve_pdf_object(obj):
    """Follow an indirect PDF reference if ``obj`` exposes ``get_object``."""
    # NOTE(review): dict-style .get() on PyPDF2 dictionaries presumably can
    # return an unresolved indirect reference; resolving is a no-op otherwise.
    getter = getattr(obj, "get_object", None)
    return getter() if callable(getter) else obj
 def ingest_documents():
     reader = SimpleDirectoryReader("./files/")
 # Streamlit app
 def main():
     st.title("Multimodal and Multilingual Document Analyzer")
+    st.write("Upload a document (PDF, or image), ask questions in your preferred language, and get detailed analysis!")
+    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "jpg", "png"])
     languages = {
         'English': 'en',
         try:
             if file_type == "application/pdf":
+                document_text, images = extract_text_and_images_from_pdf(uploaded_file)
                 write_to_file(uploaded_file.getvalue(), "./files/uploaded.pdf")
+                for img_data in images:
+                    image = Image.open(BytesIO(img_data))
+                    st.image(image, use_column_width=True)
             elif file_type in ["image/jpeg", "image/png"]:
+                image = Image.open(BytesIO(uploaded_file.getvalue()))
+                document_text = ""
+                st.image(image, use_column_width=True)
                 write_to_file(uploaded_file.getvalue(), "./files/uploaded_image")
             else:
                 st.error("Unsupported file type")
                     if file_type in ["image/jpeg", "image/png"]:
                         answer = answer_question_about_image(image, query)
                         st.write(f"**Direct Answer:** {answer}")
+                        summary = generate_summary(index, f"Image query: {query}\nAnswer: {answer}", query, target_language)
                     else:
                         summary = generate_summary(index, document_text, query, target_language)
             st.write("Please try uploading the file again or try a different file.")
 if __name__ == "__main__":
+    main()