ManishThota committed on
Commit b499d7f
1 Parent(s): feb8185

Update app.py

Files changed (1)
  1. app.py +36 -48
app.py CHANGED
@@ -4,14 +4,11 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import cv2
 import numpy as np
-import io
 
 
 # # Ensure GPU usage if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
 # Initialize the model and tokenizer
 model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                              torch_dtype=torch.float16,
@@ -20,59 +17,54 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 
 
-# def process_video(video_bytes):
-#     """Extracts frames from the video, 1 per second."""
-#     video = cv2.VideoCapture(io.BytesIO(video_bytes))
-#     fps = video.get(cv2.CAP_PROP_FPS)
-#     frames = []
-#     success, frame = video.read()
-#     while success:
-#         frames.append(frame)
-#         for _ in range(int(fps)):  # Skip fps frames
-#             success, frame = video.read()
-#     video.release()
-#     return frames[:4]  # Return the first 4 frames
-
-def video_to_frames(video_path):
+def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
-    # List to hold frames encoded as PNG
     frames_png = []
-
-    # Open the video file
-    cap = cv2.VideoCapture(video_path)
-
-    # Check if video opened successfully
+    cap = cv2.VideoCapture(video)
+
     if not cap.isOpened():
         print("Error opening video file")
         return frames_png
 
-    # Read until video is completed
+    frame_count = 0
+    frame_interval = int(cap.get(cv2.CAP_PROP_FPS)) // fps  # Calculate frame interval
+
     while cap.isOpened():
-        # Capture frame-by-frame
        ret, frame = cap.read()
-
-        # If frame is read correctly ret is True
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
 
-        # Convert the frame to PNG and store it
-        is_success, buffer = cv2.imencode(".png", frame)
-        if is_success:
-            frames_png.append(np.array(buffer).tobytes())
+        if frame_count % frame_interval == 0:
+            is_success, buffer = cv2.imencode(".png", frame)
+            if is_success:
+                frames_png.append(np.array(buffer).tobytes())
+
+        frame_count += 1
 
-    # When everything done, release the video capture object
     cap.release()
-
     return frames_png
 
+def extract_frames(frame):
+
+    # Convert binary data to a numpy array
+    frame_np = np.frombuffer(frame, dtype=np.uint8)
+
+    # Decode the PNG image
+    image_rgb = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)  # Assuming it's in RGB format
+
+    # Convert RGB to BGR
+    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+
+    return image_bgr
+
 def predict_answer(image, video, question, max_tokens=100):
 
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
 
-    if image:
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -86,13 +78,13 @@ def predict_answer(image, video, question, max_tokens=100):
 
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
-    elif video:
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
         for frame in frames:
-            frame = Image.open(frame).convert("RGB")
-            image_tensor = model.image_preprocess(frame)
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess(image)
 
             # Generate the answer
             output_ids = model.generate(
@@ -114,21 +106,17 @@ def predict_answer(image, video, question, max_tokens=100):
 def gradio_predict(image, video, question, max_tokens):
     answer = predict_answer(image, video, question, max_tokens)
     return answer
-
-
 
-# Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
-            gr.Video(label="upload your video here"),
-            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
+    inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
+        gr.Video(label="Upload your video here"),
+    ],
     outputs=gr.TextArea(label="Answer"),
-    # examples=examples,
-    title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",
-    # description="An interactive chat model that can answer questions about images in an Academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding slides from PowerPoint presentations used in classes and the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors. So, if I present any PowerPoint slide, it explains it just like my professor would, further it can be personalized.",
+    # outputs=gr.Image(label="Output"),
+    title="Video/Image Viewer",
+    description="Upload an image or video to view it or extract frames from the video.",
 )
 
-# Launch the app
-iface.queue().launch(debug=True)
+iface.launch(debug=True)
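
For reference, a minimal sketch (not part of the commit) of how the two helpers added here could be exercised directly, assuming a local file "sample.mp4" and the video_to_frames / extract_frames functions defined above:

# Hypothetical quick check, assuming "sample.mp4" exists locally.
frames = video_to_frames("sample.mp4", fps=1)   # roughly one PNG-encoded frame per second of video
print(f"Extracted {len(frames)} frames")
if frames:
    img = extract_frames(frames[0])             # decode PNG bytes back to an (H, W, 3) ndarray
    print(img.shape)                            # e.g. (720, 1280, 3)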