Build

Paused

File size: 8,143 Bytes

7ec133b
 
 
 
9db455c
 
85e7ead
69344b8
7ec133b
 
c52e238
fb293c4
c52e238
7ec133b
f8dcf83
a4115fd
 
 
f8dcf83
7ec133b
9db455c
5c72980
 
b499d7f
7294f1e
 
b499d7f
7294f1e
 
 
 
 
b499d7f
 
 
7294f1e
 
 
 
 
 
b499d7f
 
 
 
 
 
7294f1e
 
 
9db455c
b499d7f
 
 
 
 
 
 
 
 
 
 
 
 
3f30162
feb8185
 
 
 
fa7747b
3f30162
0a6288f
 
 
9db455c
0a6288f
 
 
5c72980
0a6288f
 
9db455c
0a6288f
fa7747b
3f30162
0a6288f
bda5bd0
9309155
bda5bd0
d0246f4
0a6288f
5c72980
0a6288f
 
7bf6cc7
 
f6b64ee
bda5bd0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69344b8
bda5bd0
 
69344b8
bda5bd0
 
 
 
 
 
69344b8
bda5bd0
69344b8
 
fa7747b
69344b8
 
fa7747b
b1c675e
 
 
aa10fdf
 
b1c675e
 
aa10fdf
b1c675e
 
 
aa10fdf
b1c675e
 
82a5278
b1c675e
aa10fdf
b1c675e
 
aa10fdf
b1c675e
 
 
aa10fdf
c5ecbbd
b1c675e
82a5278
b1c675e
aa10fdf
f6b64ee
 
 
 
 
 
 
 
 
 
 
 
 
aa10fdf
52aa47d
b1c675e
52aa47d
0e737a8
b1c675e
 
52aa47d
 
b1c675e
52aa47d
b1c675e
f6b64ee
 
aa10fdf
 
52aa47d
aa10fdf
fa7747b
 
b50f60b
 
7ec133b
 
5c72980
 
 
 
 
e7d06c3
5c72980
 
 
 
 
 
 
 
8975f30
8501e74
e7d06c3
5c72980
 
 
 
 
 
 
a333293
ee0b20a
 
a333293
 
5c72980
 
a333293
 
 
5c72980
066538a
a333293
aa10fdf
 
1f932c1
066538a
aa10fdf
 
 
 
0327ec6
5c72980

import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
import ast
from collections import Counter


# Device that prompt tensors are moved to: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model weights and the tokenizer are loaded from the same repository id.
MODEL_ID = "ManishThota/SparrowVQE"

# Weights are requested in half precision with `device_map="auto"`;
# `trust_remote_code=True` is passed to both loaders.
# NOTE(review): float16 is requested even when `device` is "cpu" — confirm
# that this is intended for CPU-only hosts.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)




def video_to_frames(video, fps=1):
    """Sample frames from a video file and return them as PNG-encoded bytes.

    Args:
        video: Video source handed to ``cv2.VideoCapture`` (a file path in
            this app).
        fps: Desired number of sampled frames per second of video. Must be
            positive.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per sampled frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    frames_png = []
    cap = cv2.VideoCapture(video)

    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every `frame_interval`-th frame. The reported frame rate can be
        # 0 (missing metadata) or smaller than `fps`; both would make the
        # interval 0 and the modulo below raise ZeroDivisionError, so the
        # interval is clamped to at least 1 (i.e. keep every frame).
        native_fps = int(cap.get(cv2.CAP_PROP_FPS))
        frame_interval = max(1, int(native_fps // fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(buffer.tobytes())

            frame_count += 1
    finally:
        # Release the capture handle even if reading or encoding fails.
        cap.release()

    return frames_png

def extract_frames(frame):
    """Decode one PNG-encoded frame into an RGB image array.

    Args:
        frame: Encoded image bytes of a single frame (``video_to_frames``
            produces PNG bytes in this format).

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    # View the raw bytes as a flat uint8 buffer for OpenCV.
    encoded = np.frombuffer(frame, dtype=np.uint8)

    # cv2.imdecode returns pixels in BGR order, and None when decoding fails.
    image_bgr = cv2.imdecode(encoded, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise ValueError("Could not decode frame bytes as an image")

    # Swap the red and blue channels so the result is RGB. This is the same
    # channel swap the previous COLOR_RGB2BGR call performed; only the naming
    # is corrected.
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    return image_rgb

def _generate_answer(input_ids, image_tensor):
    """Run one generation pass and decode only the newly generated tokens."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]

    # Drop the prompt tokens so only the model's continuation is decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def predict_answer(video, image, question):
    """Answer a question about an image or about one frame of a video.

    When both inputs are given, the image takes precedence and the video is
    ignored.

    Args:
        video: Video source passed to ``video_to_frames``, or ``None``.
        image: Image object exposing ``convert("RGB")`` (a PIL image in this
            app), or ``None``.
        question: Question or annotation prompt. It is inserted into a chat
            template that appends a ``?`` after it.

    Returns:
        The decoded model answer as a string, or a short explanatory message
        when no usable image or video frame is available.
    """
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image
        image = image.convert("RGB")
        image_tensor = model.image_preprocess(image)
        return _generate_answer(input_ids, image_tensor)

    if video:
        frames = video_to_frames(video)
        if not frames:
            return "No frames could be read from the video."

        # The third sampled frame is used when it exists; shorter videos fall
        # back to their last sampled frame instead of raising IndexError.
        frame_index = min(2, len(frames) - 1)
        image = extract_frames(frames[frame_index])
        image_tensor = model.image_preprocess([image])
        return _generate_answer(input_ids, image_tensor)

    # NOTE: an earlier, disabled variant ran every sampled frame through the
    # model and returned the most common answer; only the single-frame path
    # above is active.
    return "Unsupported file type. Please upload an image or video."

        
# Annotation prompt for the cat/dog example image: three yes/no questions
# (cat, dog, horse) and a request for a dictionary of booleans keyed by each
# schema entry's "value".
promt_cat_dog = """
    Annotate this image with this schema:
    {
        “description”: “Is there a cat in the image?”,
        “value”: “Cat”
    },
    {
        “description”: “Is there a dog in the image?”,
        “value”: “Dog”,
    },
    {
        “description”: “Is there a horse in the image?”,
        “value”: “Horse”,
    },
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value        
"""
# Annotation prompt for the bus/people example image: two yes/no questions
# (bus, bike) with the same dictionary-of-booleans answer format.
promt_bus_people = """
    Annotate this image with this schema:
    {
        “description”: “Is there a bus in the image?”,
        “value”: “Bus”,
    },
    {
        “description”: “Is there a bike in the image?”,
        “value”: “Bike”,
    },
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value        
"""

# Earlier wording of the video prompt, disabled in favour of the active
# definition that follows it.
# promt_video = """
#     Annotate this image with this schema:
#     {
#         “description”: “Is the person standing?”,
#         “value”: “standing”,
#     },
#     {
#         “description”: “Is the person's hands free?”,
#         “value”: “Hands-Free”,
#     },
# provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
# """

# Annotation prompt used for the video examples: asks whether a person is
# standing and whether their hands are free, answered as booleans.
promt_video = """
Annotate this image with this schema:
    {
        “description”: “Is there a person standing in the image?”,
        “value”: “standing”,
    },
    {
        “description”: “Is the person's hands free in the image?”,
        “value”: “hands-free”,
    },
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value.
"""


# Example rows for the demo, each ordered as [video, image, question].
# Every row supplies exactly one media path and leaves the other slot as None.
test_examples = [
    [None, "Images/cat_dog.jpeg", promt_cat_dog],
    [None, "Images/bus_people.jpeg", promt_bus_people],
    ["videos/v1.mp4", None, promt_video],
    ["videos/v3.mp4", None, promt_video],
]


def gradio_predict(video, image, question):
    """Gradio callback: pass the inputs to ``predict_answer`` and return its result."""
    return predict_answer(video, image, question)

# Stylesheet passed to gr.Blocks: centres the element with id "container" at
# 60% width and centres the text of the element with id "intro".
css = """
#container{
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 60%;
}
#intro{
    max-width: 100%;
    margin: 0 auto;
    text-align: center;
}

"""
# Build the demo UI: a banner image, an introduction, video/image inputs, a
# prompt box with an "Annotate" button, an answer area and clickable examples.
with gr.Blocks(css = css) as app:
    # Banner row; the elem_id ties it to the "#container" rule in `css`.
    with gr.Row(elem_id="container"):
        gr.Image("gsoc_redhen.png",min_width=60, label="GSOC 2024")
    gr.Markdown("""  
    ## This Gradio app serves as four folds:
    ### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
    ### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
    ### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC) 
    ### 4. Ability to integrate a Large Language Model and Vision Encoder        
                 """)
    # Media inputs; the image is requested as a PIL object (type="pil").
    with gr.Row():
        video = gr.Video(label="Video")
        image = gr.Image(type="pil", label="Image")
    with gr.Row():
        with gr.Column():
            # `lines` is documented as an integer row count; the previous
            # value 4.3 was a float.
            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")

    # Clicking the button sends (video, image, question) to the callback and
    # writes the returned text into the answer area.
    btn.click(gradio_predict, inputs=[video,image, question], outputs=answer)

    # Example rows use the same input order as the button callback.
    # cache_examples=True asks Gradio to cache the example outputs.
    gr.Examples(
        examples=test_examples,
        inputs=[video,image, question],
        outputs= answer,
        fn=gradio_predict,
        cache_examples=True,
    )

# Start the app when this file is executed.
app.launch(debug=True)