eagle0504 committed
Commit
59d3355
1 Parent(s): c8d4109
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Yiqiao Yin
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,259 @@
+ import base64
+ import io
+ import json
+ import os
+ from typing import Any, Dict, List
+
+ import chromadb
+ import google.generativeai as palm
+ import pandas as pd
+ import requests
+ import streamlit as st
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+ from langchain.text_splitter import (
+     RecursiveCharacterTextSplitter,
+     SentenceTransformersTokenTextSplitter,
+ )
+ from PIL import Image, ImageDraw, ImageFont
+ from pypdf import PdfReader
+ from transformers import pipeline
+
+ from utils.cnn_transformer import *
+ from utils.helpers import *
+
+ # API key (set this in your Streamlit secrets / environment variables)
+ api_key = st.secrets["PALM_API_KEY"]
+ palm.configure(api_key=api_key)
+
+
+ # Load YOLO pipeline
+ yolo_pipe = pipeline("object-detection", model="hustvl/yolos-small")
+
+
+ # Function to draw bounding boxes and labels on an image
+ def draw_boxes(image, predictions):
+     draw = ImageDraw.Draw(image)
+     font = ImageFont.load_default()
+
+     for pred in predictions:
+         label = pred["label"]
+         score = pred["score"]
+         box = pred["box"]
+         xmin, ymin, xmax, ymax = box.values()
+         draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=2)
+         draw.text((xmin, ymin), f"{label} ({score:.2f})", fill="red", font=font)
+
+     return image
+
+
+ # Main function of the Streamlit app
+ def main():
+     st.title("Generative AI Demo on Camera Input/Image/PDF 💻")
+
+     # Dropdown for the user to choose the input method
+     input_method = st.sidebar.selectbox(
+         "Choose input method:", ["Camera", "Upload Image", "Upload PDF"]
+     )
+
+     image, uploaded_file = None, None
+     if input_method == "Camera":
+         # Streamlit widget to capture an image from the user's webcam
+         image = st.sidebar.camera_input("Take a picture 📸")
+     elif input_method == "Upload Image":
+         # Create a file uploader in the sidebar
+         image = st.sidebar.file_uploader("Upload a JPG image", type=["jpg"])
+     elif input_method == "Upload PDF":
+         # File uploader widget
+         uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+
+     # Add instructions
+     st.sidebar.markdown(
+         """
+ # 🌟 How to Use the App 🌟
+
+ 1) **🌈 User Input Magic**:
+ - 📸 **Camera Snap**: Tap to capture a moment with your device's camera. Say cheese!
+ - 🖼️ **Image Upload Extravaganza**: Got a cool pic? Upload it from your computer and let the magic begin!
+ - 📄 **PDF Adventure**: Use generative AI like Ctrl+F to search for information in any PDF, like opening a treasure chest of information!
+ - 📄 **YOLO Algorithm**: Want to detect the objects in your image? Run our object detection algorithm and see what it finds.
+
+ 2) **🤖 AI Interaction Wonderland**:
+ - 🌟 **Gemini's AI**: Google's Gemini AI is your companion, ready to dive deep into your uploads.
+ - 🌐 **Chroma Database**: As you upload, we're crafting a colorful Chroma database in our secret lab, making your interaction even more awesome!
+
+ 3) **💬 Chit-Chat with AI Post-Upload**:
+ - 🌍 Once your content is up in the app, ask away! Any question, any time.
+ - 💡 Light up the conversation with Gemini AI. It is like having a chat with a wise wizard from the digital realm!
+
+ Enjoy exploring and have fun! 😄🎉
+ """
+     )
+
+     if image is not None:
+         # Display the captured image
+         st.image(image, caption="Captured Image", use_column_width=True)
+
+         # Convert the image to PIL format and resize
+         pil_image = Image.open(image)
+         resized_image = resize_image(pil_image)
+
+         # Convert the resized image to base64
+         image_base64 = convert_image_to_base64(resized_image)
+
+         # OCR via an API call to AWS Textract (POST request)
+         if input_method == "Upload Image":
+             st.success("Running Textract!")
+             url = "https://2tsig211e0.execute-api.us-east-1.amazonaws.com/my_textract"
+             payload = {"image": image_base64}
+             result_dict = post_request_and_parse_response(url, payload)
+             output_data = extract_line_items(result_dict)
+             df = pd.DataFrame(output_data)
+
+             # Use an expander to hide the raw JSON
+             with st.expander("Show/Hide Raw JSON"):
+                 st.write(result_dict)
+
+             # Use an expander to hide the table
+             with st.expander("Show/Hide Table"):
+                 st.table(df)
+
+         if api_key:
+             # Make the API call
+             st.success("Running Gemini!")
+             with st.spinner("Wait for it..."):
+                 response = call_gemini_api(image_base64, api_key)
+
+             with st.expander("Raw output from Gemini"):
+                 st.write(response)
+
+             # Display the response
+             if response["candidates"][0]["content"]["parts"][0]["text"]:
+                 text_from_response = response["candidates"][0]["content"]["parts"][0][
+                     "text"
+                 ]
+                 with st.spinner("Wait for it..."):
+                     st.write(text_from_response)
+
+                 # Text input for the question
+                 input_prompt = st.text_input(
+                     "Type your question here:",
+                 )
+
+                 # Display the entered question
+                 if input_prompt:
+                     updated_text_from_response = call_gemini_api(
+                         image_base64, api_key, prompt=input_prompt
+                     )
+
+                     if updated_text_from_response is not None:
+                         # Do something with the text
+                         updated_ans = updated_text_from_response["candidates"][0][
+                             "content"
+                         ]["parts"][0]["text"]
+                         with st.spinner("Wait for it..."):
+                             st.write(f"Gemini: {updated_ans}")
+                     else:
+                         st.warning("Check Gemini's API.")
+
+             else:
+                 st.write("No response from API.")
+         else:
+             st.write("API Key is not set. Please set the API Key.")
+
+     # YOLO
+     if image is not None:
+         st.sidebar.success("Check the following box to run the YOLO algorithm if desired!")
+         use_yolo = st.sidebar.checkbox("Use YOLO!", value=False)
+
+         if use_yolo:
+             # Process the image with YOLO
+             image = Image.open(image)
+             with st.spinner("Wait for it..."):
+                 st.success("Running YOLO algorithm!")
+                 predictions = yolo_pipe(image)
+                 st.success("YOLO ran successfully.")
+
+             # Draw bounding boxes and labels
+             image_with_boxes = draw_boxes(image.copy(), predictions)
+             st.success("Bounding boxes drawn.")
+
+             # Display the annotated image
+             st.image(image_with_boxes, caption="Annotated Image", use_column_width=True)
+
+     # Handle an uploaded PDF
+     if uploaded_file is not None:
+         # Read the file as bytes
+         bytes_data = uploaded_file.getvalue()
+         st.success("Your PDF is uploaded successfully.")
+
+         # Get the file name
+         file_name = uploaded_file.name
+
+         # Save the file temporarily
+         with open(file_name, "wb") as f:
+             f.write(uploaded_file.getbuffer())
+
+         # Display PDF
+         # displayPDF(file_name)
+
+         # Read the file
+         reader = PdfReader(file_name)
+         pdf_texts = [p.extract_text().strip() for p in reader.pages]
+
+         # Filter out the empty strings
+         pdf_texts = [text for text in pdf_texts if text]
+         st.success("PDF extracted successfully.")
+
+         # Split the texts
+         character_splitter = RecursiveCharacterTextSplitter(
+             separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1000, chunk_overlap=0
+         )
+         character_split_texts = character_splitter.split_text("\n\n".join(pdf_texts))
+         st.success("Texts split successfully.")
+
+         # Tokenize the chunks
+         st.warning("Start tokenizing ...")
+         token_splitter = SentenceTransformersTokenTextSplitter(
+             chunk_overlap=0, tokens_per_chunk=256
+         )
+         token_split_texts = []
+         for text in character_split_texts:
+             token_split_texts += token_splitter.split_text(text)
+         st.success("Tokenized successfully.")
+
+         # Add to the vector database
+         embedding_function = SentenceTransformerEmbeddingFunction()
+         chroma_client = chromadb.Client()
+         chroma_collection = chroma_client.create_collection(
+             "tmp", embedding_function=embedding_function
+         )
+         ids = [str(i) for i in range(len(token_split_texts))]
+         chroma_collection.add(ids=ids, documents=token_split_texts)
+         st.success("Vector database loaded successfully.")
+
+         # User input
+         query = st.text_input("Ask me anything!", "What is the document about?")
+         results = chroma_collection.query(query_texts=[query], n_results=5)
+         retrieved_documents = results["documents"][0]
+         results_as_table = pd.DataFrame(
+             {
+                 "ids": results["ids"][0],
+                 "documents": results["documents"][0],
+                 "distances": results["distances"][0],
+             }
+         )
+
+         # Call a foundation model API on the retrieved context
+         output = rag(query=query, retrieved_documents=retrieved_documents)
+         st.write(output)
+         st.success(
+             "Please see where the chatbot got the information from the document below.👇"
+         )
+         with st.expander("Raw query outputs:"):
+             st.write(results)
+         with st.expander("Processed tabular form query outputs:"):
+             st.table(results_as_table)
+
+
+ if __name__ == "__main__":
+     main()
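Note: app.py and utils/helpers.py both read the Google API key through st.secrets["PALM_API_KEY"], so a Streamlit secrets entry (for example in .streamlit/secrets.toml, or the hosting platform's secrets settings) is assumed to be configured before launching the app with streamlit run app.py.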
figs/false-insurance-policy.jpeg ADDED
figs/labcorp_accessioning.jpg ADDED
figs/system-architect.drawio ADDED
@@ -0,0 +1,82 @@
1
+ <mxfile host="65bd71144e">
2
+ <diagram id="6I0VWqCgP7JPpdnrNpuH" name="Page-1">
3
+ <mxGraphModel dx="721" dy="917" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
4
+ <root>
5
+ <mxCell id="0"/>
6
+ <mxCell id="1" parent="0"/>
7
+ <mxCell id="37" value="" style="rounded=0;whiteSpace=wrap;html=1;" vertex="1" parent="1">
8
+ <mxGeometry x="80" y="110" width="720" height="480" as="geometry"/>
9
+ </mxCell>
10
+ <mxCell id="32" style="edgeStyle=none;html=1;" parent="1" source="2" target="29" edge="1">
11
+ <mxGeometry relative="1" as="geometry">
12
+ <mxPoint x="237.5" y="380" as="targetPoint"/>
13
+ </mxGeometry>
14
+ </mxCell>
15
+ <mxCell id="2" value="&lt;b&gt;PDF&lt;/b&gt;" style="html=1;verticalLabelPosition=bottom;align=center;labelBackgroundColor=#ffffff;verticalAlign=top;strokeWidth=2;strokeColor=#0080F0;shadow=0;dashed=0;shape=mxgraph.ios7.icons.documents;" parent="1" vertex="1">
16
+ <mxGeometry x="137.5" y="195" width="55" height="60" as="geometry"/>
17
+ </mxCell>
18
+ <mxCell id="12" style="html=1;entryX=1;entryY=0.5;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="3" target="11" edge="1">
19
+ <mxGeometry relative="1" as="geometry"/>
20
+ </mxCell>
21
+ <mxCell id="3" value="&lt;b&gt;Textract&lt;/b&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#4AB29A;gradientDirection=north;fillColor=#116D5B;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.textract;" parent="1" vertex="1">
22
+ <mxGeometry x="700" y="337.5" width="78" height="78" as="geometry"/>
23
+ </mxCell>
24
+ <mxCell id="15" style="edgeStyle=none;html=1;" parent="1" source="10" edge="1">
25
+ <mxGeometry relative="1" as="geometry">
26
+ <mxPoint x="590" y="420" as="targetPoint"/>
27
+ <Array as="points">
28
+ <mxPoint x="460" y="520"/>
29
+ <mxPoint x="520" y="520"/>
30
+ <mxPoint x="570" y="520"/>
31
+ </Array>
32
+ </mxGeometry>
33
+ </mxCell>
34
+ <mxCell id="31" style="edgeStyle=none;html=1;entryX=1.04;entryY=0.492;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="10" target="29" edge="1">
35
+ <mxGeometry relative="1" as="geometry"/>
36
+ </mxCell>
37
+ <mxCell id="10" value="&lt;b&gt;API Gateway&lt;/b&gt;" style="outlineConnect=0;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;shape=mxgraph.aws3.api_gateway;fillColor=#D9A741;gradientColor=none;" parent="1" vertex="1">
38
+ <mxGeometry x="400" y="330" width="76.5" height="93" as="geometry"/>
39
+ </mxCell>
40
+ <mxCell id="13" style="edgeStyle=none;html=1;entryX=0;entryY=0.5;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="11" target="3" edge="1">
41
+ <mxGeometry relative="1" as="geometry"/>
42
+ </mxCell>
43
+ <mxCell id="16" style="edgeStyle=none;html=1;exitX=0.5;exitY=0;exitDx=0;exitDy=0;exitPerimeter=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" parent="1" source="11" target="10" edge="1">
44
+ <mxGeometry relative="1" as="geometry">
45
+ <Array as="points">
46
+ <mxPoint x="570" y="240"/>
47
+ <mxPoint x="510" y="240"/>
48
+ <mxPoint x="450" y="240"/>
49
+ </Array>
50
+ </mxGeometry>
51
+ </mxCell>
52
+ <mxCell id="11" value="&lt;b&gt;AWS Lambda&lt;/b&gt;" style="sketch=0;points=[[0,0,0],[0.25,0,0],[0.5,0,0],[0.75,0,0],[1,0,0],[0,1,0],[0.25,1,0],[0.5,1,0],[0.75,1,0],[1,1,0],[0,0.25,0],[0,0.5,0],[0,0.75,0],[1,0.25,0],[1,0.5,0],[1,0.75,0]];outlineConnect=0;fontColor=#232F3E;gradientColor=#F78E04;gradientDirection=north;fillColor=#D05C17;strokeColor=#ffffff;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;fontSize=12;fontStyle=0;aspect=fixed;shape=mxgraph.aws4.resourceIcon;resIcon=mxgraph.aws4.lambda;" parent="1" vertex="1">
53
+ <mxGeometry x="546" y="337.5" width="78" height="78" as="geometry"/>
54
+ </mxCell>
55
+ <mxCell id="22" value="&lt;b&gt;OCR Output&lt;/b&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
56
+ <mxGeometry x="491" y="208" width="60" height="30" as="geometry"/>
57
+ </mxCell>
58
+ <mxCell id="25" value="&lt;b&gt;base64&amp;nbsp;&lt;br&gt;Encoded&lt;br&gt;Image&lt;br&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
59
+ <mxGeometry x="486" y="513" width="70" height="60" as="geometry"/>
60
+ </mxCell>
61
+ <mxCell id="26" value="&lt;b&gt;base64&amp;nbsp;&lt;br&gt;Encoded&lt;br&gt;Image&lt;br&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
62
+ <mxGeometry x="265" y="374.25" width="70" height="60" as="geometry"/>
63
+ </mxCell>
64
+ <mxCell id="28" value="&lt;b&gt;Extracted&lt;br&gt;Text&lt;br&gt;&lt;/b&gt;" style="text;html=1;align=center;verticalAlign=middle;resizable=0;points=[];autosize=1;strokeColor=none;fillColor=none;" parent="1" vertex="1">
65
+ <mxGeometry x="260" y="334.25" width="80" height="40" as="geometry"/>
66
+ </mxCell>
67
+ <mxCell id="30" style="edgeStyle=none;html=1;" parent="1" source="29" target="10" edge="1">
68
+ <mxGeometry relative="1" as="geometry"/>
69
+ </mxCell>
70
+ <mxCell id="29" value="&lt;b&gt;User&lt;/b&gt;" style="html=1;verticalLabelPosition=bottom;align=center;labelBackgroundColor=#ffffff;verticalAlign=top;strokeWidth=2;strokeColor=#0080F0;shadow=0;dashed=0;shape=mxgraph.ios7.icons.user;" parent="1" vertex="1">
71
+ <mxGeometry x="125" y="334.25" width="80" height="84.5" as="geometry"/>
72
+ </mxCell>
73
+ <mxCell id="33" value="Streamlit App" style="swimlane;whiteSpace=wrap;html=1;align=left;" vertex="1" parent="1">
74
+ <mxGeometry x="100" y="150" width="690" height="430" as="geometry"/>
75
+ </mxCell>
76
+ <mxCell id="34" value="&lt;b&gt;EC2&lt;/b&gt;" style="outlineConnect=0;dashed=0;verticalLabelPosition=bottom;verticalAlign=top;align=center;html=1;shape=mxgraph.aws3.ec2;fillColor=#F58534;gradientColor=none;" vertex="1" parent="1">
77
+ <mxGeometry x="95" y="80" width="35" height="43" as="geometry"/>
78
+ </mxCell>
79
+ </root>
80
+ </mxGraphModel>
81
+ </diagram>
82
+ </mxfile>
figs/system-architect.png ADDED
lambda/my_textract.py ADDED
@@ -0,0 +1,95 @@
+ """
+ Purpose
+ An AWS Lambda function that analyzes documents with Amazon Textract.
+ """
+ import json
+ import base64
+ import logging
+ import boto3
+
+ from botocore.exceptions import ClientError
+
+ # Set up logging.
+ logger = logging.getLogger(__name__)
+
+ # Get the boto3 client.
+ textract_client = boto3.client("textract")
+
+
+ def lambda_handler(event, context):
+     """
+     Lambda handler function
+     param: event: The event object for the Lambda function.
+     param: context: The context object for the Lambda function.
+     return: The list of Block objects recognized in the document
+     passed in the event object.
+     """
+
+     # raw_image = json.loads(event['body'])['image']
+     # message = f"i love {country}"
+
+     # return message
+
+     try:
+         # Determine the document source.
+         # event['image'] = event["queryStringParameters"]['image']
+         # event['image'] = json.loads(event['body'])["queryStringParameters"]['image']
+         event["image"] = json.loads(event["body"])["image"]
+         if "image" in event:
+             # Decode the image
+             image_bytes = event["image"].encode("utf-8")
+             img_b64decoded = base64.b64decode(image_bytes)
+             image = {"Bytes": img_b64decoded}
+
+         elif "S3Object" in event:
+             image = {
+                 "S3Object": {
+                     "Bucket": event["S3Object"]["Bucket"],
+                     "Name": event["S3Object"]["Name"],
+                 }
+             }
+
+         else:
+             raise ValueError(
+                 "Invalid source. Only base64-encoded image bytes or an S3Object are supported."
+             )
+
+         # Analyze the document.
+         response = textract_client.detect_document_text(Document=image)
+
+         # Get the Blocks
+         blocks = response["Blocks"]
+
+         lambda_response = {"statusCode": 200, "body": json.dumps(blocks)}
+
+     except ClientError as err:
+         error_message = "Couldn't analyze image. " + err.response["Error"]["Message"]
+
+         lambda_response = {
+             "statusCode": 400,
+             "body": {
+                 "Error": err.response["Error"]["Code"],
+                 "ErrorMessage": error_message,
+             },
+         }
+         logger.error(
+             "Error function %s: %s", context.invoked_function_arn, error_message
+         )
+
+     except ValueError as val_error:
+         lambda_response = {
+             "statusCode": 400,
+             "body": {"Error": "ValueError", "ErrorMessage": format(val_error)},
+         }
+         logger.error(
+             "Error function %s: %s", context.invoked_function_arn, format(val_error)
+         )
+
+     # Create the return body. Note that the HTTP wrapper always returns 200;
+     # the Textract result (or error details) is carried inside the JSON body.
+     http_resp = {}
+     http_resp["statusCode"] = 200
+     http_resp["headers"] = {}
+     http_resp["headers"]["Content-Type"] = "application/json"
+     http_resp["body"] = json.dumps(lambda_response)
+
+     return http_resp
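Note: the following is a minimal, hypothetical client sketch (not part of the commit) showing how this Lambda is reached from outside. It assumes the API Gateway proxy integration that app.py targets with its hard-coded endpoint, and it mirrors what post_request_and_parse_response and extract_line_items in utils/helpers.py do.

import base64
import json

import requests

# Endpoint hard-coded in app.py; proxy integration is assumed.
URL = "https://2tsig211e0.execute-api.us-east-1.amazonaws.com/my_textract"

# Any JPEG works; this sample image ships with the repo.
with open("figs/labcorp_accessioning.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# API Gateway forwards the JSON body; the Lambda reads json.loads(event["body"])["image"].
resp = requests.post(URL, json={"image": image_b64}, headers={"Content-Type": "application/json"})

# The HTTP body is the lambda_response dict: {"statusCode": ..., "body": "<JSON string of Blocks>"}.
result = resp.json()
blocks = json.loads(result.get("body", "[]"))

# Keep only LINE blocks, as extract_line_items does, and print the detected text.
lines = [block["Text"] for block in blocks if block.get("BlockType") == "LINE"]
print("\n".join(lines))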
models/cnn_transformer/tf_keras_image_captioning_cnn+transformer_flicker8k.index ADDED
Binary file (28.9 kB)
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ chromadb==0.3.29
+ langchain==0.0.343
+ matplotlib
+ numpy
+ google-generativeai>=0.1.0
+ pandas
+ pypdf==3.17.1
+ Pillow
+ sentence-transformers==2.2.2
+ streamlit
+ transformers
+ torch
+ tensorflow
utils/cnn_transformer.py ADDED
@@ -0,0 +1,379 @@
+ import os
+
+ os.environ["KERAS_BACKEND"] = "tensorflow"
+
+ import re
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ import tensorflow as tf
+ import keras
+ from keras import layers
+ from keras.applications import efficientnet
+ from keras.layers import TextVectorization
+
+ keras.utils.set_random_seed(111)
+
+
+ # Desired image dimensions
+ IMAGE_SIZE = (299, 299)
+
+ # Dimension for the image embeddings and token embeddings
+ EMBED_DIM = 512
+
+ # Per-layer units in the feed-forward network
+ FF_DIM = 512
+
+ # Fixed length allowed for any sequence
+ SEQ_LENGTH = 25
+
+ # Vocabulary size
+ VOCAB_SIZE = 10000
+
+ # Data augmentation for image data
+ image_augmentation = keras.Sequential(
+     [
+         layers.RandomFlip("horizontal"),
+         layers.RandomRotation(0.2),
+         layers.RandomContrast(0.3),
+     ]
+ )
+
+
+ def get_cnn_model():
+     base_model = efficientnet.EfficientNetB0(
+         input_shape=(*IMAGE_SIZE, 3),
+         include_top=False,
+         weights="imagenet",
+     )
+     # We freeze our feature extractor
+     base_model.trainable = False
+     base_model_out = base_model.output
+     base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out)
+     cnn_model = keras.models.Model(base_model.input, base_model_out)
+     return cnn_model
+
+
+ class TransformerEncoderBlock(layers.Layer):
+     def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.dense_dim = dense_dim
+         self.num_heads = num_heads
+         self.attention_1 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim, dropout=0.0
+         )
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+         self.dense_1 = layers.Dense(embed_dim, activation="relu")
+
+     def call(self, inputs, training, mask=None):
+         inputs = self.layernorm_1(inputs)
+         inputs = self.dense_1(inputs)
+
+         attention_output_1 = self.attention_1(
+             query=inputs,
+             value=inputs,
+             key=inputs,
+             attention_mask=None,
+             training=training,
+         )
+         out_1 = self.layernorm_2(inputs + attention_output_1)
+         return out_1
+
+
+ class PositionalEmbedding(layers.Layer):
+     def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
+         super().__init__(**kwargs)
+         self.token_embeddings = layers.Embedding(
+             input_dim=vocab_size, output_dim=embed_dim
+         )
+         self.position_embeddings = layers.Embedding(
+             input_dim=sequence_length, output_dim=embed_dim
+         )
+         self.sequence_length = sequence_length
+         self.vocab_size = vocab_size
+         self.embed_dim = embed_dim
+         self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32))
+
+     def call(self, inputs):
+         length = tf.shape(inputs)[-1]
+         positions = tf.range(start=0, limit=length, delta=1)
+         embedded_tokens = self.token_embeddings(inputs)
+         embedded_tokens = embedded_tokens * self.embed_scale
+         embedded_positions = self.position_embeddings(positions)
+         return embedded_tokens + embedded_positions
+
+     def compute_mask(self, inputs, mask=None):
+         return tf.math.not_equal(inputs, 0)
+
+
+ class TransformerDecoderBlock(layers.Layer):
+     def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.ff_dim = ff_dim
+         self.num_heads = num_heads
+         self.attention_1 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim, dropout=0.1
+         )
+         self.attention_2 = layers.MultiHeadAttention(
+             num_heads=num_heads, key_dim=embed_dim, dropout=0.1
+         )
+         self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
+         self.ffn_layer_2 = layers.Dense(embed_dim)
+
+         self.layernorm_1 = layers.LayerNormalization()
+         self.layernorm_2 = layers.LayerNormalization()
+         self.layernorm_3 = layers.LayerNormalization()
+
+         self.embedding = PositionalEmbedding(
+             embed_dim=EMBED_DIM,
+             sequence_length=SEQ_LENGTH,
+             vocab_size=VOCAB_SIZE,
+         )
+         self.out = layers.Dense(VOCAB_SIZE, activation="softmax")
+
+         self.dropout_1 = layers.Dropout(0.3)
+         self.dropout_2 = layers.Dropout(0.5)
+         self.supports_masking = True
+
+     def call(self, inputs, encoder_outputs, training, mask=None):
+         inputs = self.embedding(inputs)
+         causal_mask = self.get_causal_attention_mask(inputs)
+
+         # Default masks when no padding mask is provided.
+         padding_mask = None
+         combined_mask = causal_mask
+
+         if mask is not None:
+             padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
+             combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
+             combined_mask = tf.minimum(combined_mask, causal_mask)
+
+         attention_output_1 = self.attention_1(
+             query=inputs,
+             value=inputs,
+             key=inputs,
+             attention_mask=combined_mask,
+             training=training,
+         )
+         out_1 = self.layernorm_1(inputs + attention_output_1)
+
+         attention_output_2 = self.attention_2(
+             query=out_1,
+             value=encoder_outputs,
+             key=encoder_outputs,
+             attention_mask=padding_mask,
+             training=training,
+         )
+         out_2 = self.layernorm_2(out_1 + attention_output_2)
+
+         ffn_out = self.ffn_layer_1(out_2)
+         ffn_out = self.dropout_1(ffn_out, training=training)
+         ffn_out = self.ffn_layer_2(ffn_out)
+
+         ffn_out = self.layernorm_3(ffn_out + out_2, training=training)
+         ffn_out = self.dropout_2(ffn_out, training=training)
+         preds = self.out(ffn_out)
+         return preds
+
+     def get_causal_attention_mask(self, inputs):
+         input_shape = tf.shape(inputs)
+         batch_size, sequence_length = input_shape[0], input_shape[1]
+         i = tf.range(sequence_length)[:, tf.newaxis]
+         j = tf.range(sequence_length)
+         mask = tf.cast(i >= j, dtype="int32")
+         mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
+         mult = tf.concat(
+             [
+                 tf.expand_dims(batch_size, -1),
+                 tf.constant([1, 1], dtype=tf.int32),
+             ],
+             axis=0,
+         )
+         return tf.tile(mask, mult)
+
+
+ class ImageCaptioningModel(keras.Model):
+     def __init__(
+         self,
+         cnn_model,
+         encoder,
+         decoder,
+         num_captions_per_image=5,
+         image_aug=None,
+     ):
+         super().__init__()
+         self.cnn_model = cnn_model
+         self.encoder = encoder
+         self.decoder = decoder
+         self.loss_tracker = keras.metrics.Mean(name="loss")
+         self.acc_tracker = keras.metrics.Mean(name="accuracy")
+         self.num_captions_per_image = num_captions_per_image
+         self.image_aug = image_aug
+
+     def calculate_loss(self, y_true, y_pred, mask):
+         loss = self.loss(y_true, y_pred)
+         mask = tf.cast(mask, dtype=loss.dtype)
+         loss *= mask
+         return tf.reduce_sum(loss) / tf.reduce_sum(mask)
+
+     def calculate_accuracy(self, y_true, y_pred, mask):
+         accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
+         accuracy = tf.math.logical_and(mask, accuracy)
+         accuracy = tf.cast(accuracy, dtype=tf.float32)
+         mask = tf.cast(mask, dtype=tf.float32)
+         return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)
+
+     def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True):
+         encoder_out = self.encoder(img_embed, training=training)
+         batch_seq_inp = batch_seq[:, :-1]
+         batch_seq_true = batch_seq[:, 1:]
+         mask = tf.math.not_equal(batch_seq_true, 0)
+         batch_seq_pred = self.decoder(
+             batch_seq_inp, encoder_out, training=training, mask=mask
+         )
+         loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask)
+         acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask)
+         return loss, acc
+
+     def train_step(self, batch_data):
+         batch_img, batch_seq = batch_data
+         batch_loss = 0
+         batch_acc = 0
+
+         if self.image_aug:
+             batch_img = self.image_aug(batch_img)
+
+         # 1. Get image embeddings
+         img_embed = self.cnn_model(batch_img)
+
+         # 2. Pass each of the five captions one by one to the decoder
+         # along with the encoder outputs and compute the loss as well as accuracy
+         # for each caption.
+         for i in range(self.num_captions_per_image):
+             with tf.GradientTape() as tape:
+                 loss, acc = self._compute_caption_loss_and_acc(
+                     img_embed, batch_seq[:, i, :], training=True
+                 )
+
+                 # 3. Update loss and accuracy
+                 batch_loss += loss
+                 batch_acc += acc
+
+             # 4. Get the list of all the trainable weights
+             train_vars = (
+                 self.encoder.trainable_variables + self.decoder.trainable_variables
+             )
+
+             # 5. Get the gradients
+             grads = tape.gradient(loss, train_vars)
+
+             # 6. Update the trainable weights
+             self.optimizer.apply_gradients(zip(grads, train_vars))
+
+         # 7. Update the trackers
+         batch_acc /= float(self.num_captions_per_image)
+         self.loss_tracker.update_state(batch_loss)
+         self.acc_tracker.update_state(batch_acc)
+
+         # 8. Return the loss and accuracy values
+         return {
+             "loss": self.loss_tracker.result(),
+             "acc": self.acc_tracker.result(),
+         }
+
+     def test_step(self, batch_data):
+         batch_img, batch_seq = batch_data
+         batch_loss = 0
+         batch_acc = 0
+
+         # 1. Get image embeddings
+         img_embed = self.cnn_model(batch_img)
+
+         # 2. Pass each of the five captions one by one to the decoder
+         # along with the encoder outputs and compute the loss as well as accuracy
+         # for each caption.
+         for i in range(self.num_captions_per_image):
+             loss, acc = self._compute_caption_loss_and_acc(
+                 img_embed, batch_seq[:, i, :], training=False
+             )
+
+             # 3. Update batch loss and batch accuracy
+             batch_loss += loss
+             batch_acc += acc
+
+         batch_acc /= float(self.num_captions_per_image)
+
+         # 4. Update the trackers
+         self.loss_tracker.update_state(batch_loss)
+         self.acc_tracker.update_state(batch_acc)
+
+         # 5. Return the loss and accuracy values
+         return {
+             "loss": self.loss_tracker.result(),
+             "acc": self.acc_tracker.result(),
+         }
+
+     @property
+     def metrics(self):
+         # We need to list our metrics here so `reset_states()` can be
+         # called automatically.
+         return [self.loss_tracker, self.acc_tracker]
+
+
+ strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
+ strip_chars = strip_chars.replace("<", "")
+ strip_chars = strip_chars.replace(">", "")
+
+
+ def custom_standardization(input_string):
+     lowercase = tf.strings.lower(input_string)
+     return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
+
+
+ vectorization = TextVectorization(
+     max_tokens=VOCAB_SIZE,
+     output_mode="int",
+     output_sequence_length=SEQ_LENGTH,
+     standardize=custom_standardization,
+ )
+
+
+ def generate_caption(caption_model, sample_img):
+     """Greedily decode a caption for a preprocessed image tensor.
+
+     `sample_img` is expected to be a float tensor of shape IMAGE_SIZE + (3,),
+     for example produced by a decode-and-resize pipeline.
+     """
+     # Pass the image to the CNN
+     img = tf.expand_dims(sample_img, 0)
+     img = caption_model.cnn_model(img)
+
+     # Pass the image features to the Transformer encoder
+     encoded_img = caption_model.encoder(img, training=False)
+
+     # Generate the caption using the Transformer decoder
+     decoded_caption = "<start> "
+     vocab = vectorization.get_vocabulary()
+     index_lookup = dict(zip(range(len(vocab)), vocab))
+     max_decoded_sentence_length = SEQ_LENGTH - 1
+     for i in range(max_decoded_sentence_length):
+         tokenized_caption = vectorization([decoded_caption])[:, :-1]
+         mask = tf.math.not_equal(tokenized_caption, 0)
+         predictions = caption_model.decoder(
+             tokenized_caption, encoded_img, training=False, mask=mask
+         )
+         sampled_token_index = np.argmax(predictions[0, i, :])
+         sampled_token = index_lookup[sampled_token_index]
+         if sampled_token == "<end>":
+             break
+         decoded_caption += " " + sampled_token
+
+     decoded_caption = decoded_caption.replace("<start> ", "")
+     decoded_caption = decoded_caption.replace(" <end>", "").strip()
+     print("Predicted Caption: ", decoded_caption)
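Note: nothing in this module actually instantiates the captioning model, so the sketch below (not part of the commit) shows one plausible way to assemble the pieces defined above. The num_heads values follow the standard Keras image-captioning recipe and are assumptions here, as is the idea that the TensorFlow checkpoint shipped under models/cnn_transformer/ would be restored into this architecture after the model has been built and `vectorization` adapted on the caption corpus.

# A plausible assembly of the components above (hyperparameters are assumptions).
cnn_model = get_cnn_model()
encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1)
decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2)
caption_model = ImageCaptioningModel(
    cnn_model=cnn_model,
    encoder=encoder,
    decoder=decoder,
    image_aug=image_augmentation,
)
# Once the model has been built (e.g. by running one forward pass) and the
# vectorization layer adapted, the repo checkpoint could be restored with:
# caption_model.load_weights(
#     "models/cnn_transformer/tf_keras_image_captioning_cnn+transformer_flicker8k"
# )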
utils/helpers.py ADDED
@@ -0,0 +1,192 @@
+ import base64
+ import io
+ import json
+ import os
+ from typing import Any, Dict, List
+
+ import pandas as pd
+ import requests
+ import streamlit as st
+ from PIL import Image
+ import google.generativeai as palm
+ from pypdf import PdfReader
+ from langchain.text_splitter import (
+     RecursiveCharacterTextSplitter,
+     SentenceTransformersTokenTextSplitter,
+ )
+ import chromadb
+ from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+
+
+ # API key (set this in your Streamlit secrets / environment variables)
+ api_key = st.secrets["PALM_API_KEY"]
+ palm.configure(api_key=api_key)
+
+
+ # Function to convert the image to bytes for download
+ def convert_image_to_bytes(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="JPEG")
+     return buffered.getvalue()
+
+
+ # Function to resize the image
+ def resize_image(image):
+     return image.resize((512, int(image.height * 512 / image.width)))
+
+
+ # Function to convert the image to base64
+ def convert_image_to_base64(image):
+     buffered = io.BytesIO()
+     image.save(buffered, format="JPEG")
+     return base64.b64encode(buffered.getvalue()).decode()
+
+
+ # Function to make an API call to PaLM
+ def call_palm(prompt: str) -> str:
+     completion = palm.generate_text(
+         model="models/text-bison-001",
+         prompt=prompt,
+         temperature=0,
+         max_output_tokens=800,
+     )
+
+     return completion.result
+
+
+ # Function to make an API call to Google's Gemini API
+ def call_gemini_api(image_base64, api_key=api_key, prompt="What is this picture?"):
+     headers = {
+         "Content-Type": "application/json",
+     }
+     data = {
+         "contents": [
+             {
+                 "parts": [
+                     {"text": prompt},
+                     {"inline_data": {"mime_type": "image/jpeg", "data": image_base64}},
+                 ]
+             }
+         ]
+     }
+     response = requests.post(
+         f"https://generativelanguage.googleapis.com/v1beta/models/gemini-pro-vision:generateContent?key={api_key}",
+         headers=headers,
+         json=data,
+     )
+     return response.json()
+
+
+ def safely_get_text(response):
+     """Safely pull the generated text out of a Gemini API response."""
+     try:
+         return response["candidates"][0]["content"]["parts"][0]["text"]
+     except Exception as e:
+         print(f"An error occurred: {e}")
+
+     # Return None if the expected path does not exist
+     return None
+
+
+ def post_request_and_parse_response(
+     url: str, payload: Dict[str, Any]
+ ) -> Dict[str, Any]:
+     """
+     Sends a POST request to the specified URL with the given payload,
+     then parses the byte response to a dictionary.
+
+     Args:
+         url (str): The URL to which the POST request is sent.
+         payload (Dict[str, Any]): The payload to send in the POST request.
+
+     Returns:
+         Dict[str, Any]: The parsed dictionary from the response.
+     """
+     # Set headers for the POST request
+     headers = {"Content-Type": "application/json"}
+
+     # Send the POST request and get the response
+     response = requests.post(url, json=payload, headers=headers)
+
+     # Extract the byte data from the response
+     byte_data = response.content
+
+     # Decode the byte data to a string
+     decoded_string = byte_data.decode("utf-8")
+
+     # Convert the JSON string to a dictionary
+     dict_data = json.loads(decoded_string)
+
+     return dict_data
+
+
+ def extract_line_items(input_data: Dict[str, Any]) -> List[Dict[str, Any]]:
+     """
+     Extracts items with "BlockType": "LINE" from the provided JSON data.
+
+     Args:
+         input_data (Dict[str, Any]): The input JSON data as a dictionary.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries with the extracted data.
+     """
+     # Initialize an empty list to hold the extracted line items
+     line_items: List[Dict[str, Any]] = []
+
+     # Get the list of items from the 'body' key in the input data
+     body_items = json.loads(input_data.get("body", "[]"))
+
+     # Iterate through each item in the body
+     for item in body_items:
+         # Check if the BlockType of the item is 'LINE'
+         if item.get("BlockType") == "LINE":
+             # Add the item to the line_items list
+             line_items.append(item)
+
+     return line_items
+
+
+ def rag(query: str, retrieved_documents: list, api_key: str = api_key) -> str:
+     """
+     Processes a query and a list of retrieved documents with a Google generative model
+     (currently the PaLM text model via `call_palm`).
+
+     Args:
+         query (str): The user's query or question.
+         retrieved_documents (list): A list of documents retrieved as relevant information to the query.
+         api_key (str): API key for accessing the generative model. Default is the predefined 'api_key'.
+
+     Returns:
+         str: The cleaned output from the model response.
+     """
+     # Combine the retrieved documents into a single string, separated by two newlines.
+     information = "\n\n".join(retrieved_documents)
+
+     # Format the query and combined information into a single message.
+     messages = f"Question: {query}. \n Information: {information}"
+
+     # Call the PaLM text model with the formatted message.
+     gemini_output = call_palm(prompt=messages)
+
+     # Placeholder for post-processing. Currently, the raw output is returned as-is.
+     cleaned_output = gemini_output  # ["candidates"][0]["content"]["parts"][0]["text"]
+
+     return cleaned_output
+
+
+ def displayPDF(file: str) -> None:
+     """
+     Displays a PDF file in a Streamlit application.
+
+     Parameters:
+     - file (str): The path to the PDF file to be displayed.
+     """
+
+     # Open the PDF file in binary read mode
+     with open(file, "rb") as f:
+         # Encode the PDF file content to base64
+         base64_pdf: str = base64.b64encode(f.read()).decode("utf-8")
+
+     # Create an HTML embed string for displaying the PDF
+     pdf_display: str = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf">'
+
+     # Use Streamlit to display the HTML embed string as unsafe HTML
+     st.markdown(pdf_display, unsafe_allow_html=True)
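Note: a hypothetical local smoke test for these helpers (not part of the commit). It assumes PALM_API_KEY is available to st.secrets (importing utils.helpers reads it at import time) and uses the sample image shipped under figs/.

from PIL import Image

from utils.helpers import (
    call_gemini_api,
    convert_image_to_base64,
    resize_image,
    safely_get_text,
)

# Encode the bundled sample image the same way app.py does.
img = Image.open("figs/labcorp_accessioning.jpg")
img_b64 = convert_image_to_base64(resize_image(img))

# Ask Gemini about the image and print just the generated text.
response = call_gemini_api(img_b64, prompt="Describe this document.")
print(safely_get_text(response))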