ManishThota committed on
Commit b499d7f
1 Parent(s): feb8185

Update app.py

Files changed (1)
  1. app.py +36 -48
app.py CHANGED
@@ -4,14 +4,11 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import cv2
 import numpy as np
-import io
 
 
 # # Ensure GPU usage if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
 # Initialize the model and tokenizer
 model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
                                              torch_dtype=torch.float16,
@@ -20,59 +17,54 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 
 
-# def process_video(video_bytes):
-#     """Extracts frames from the video, 1 per second."""
-#     video = cv2.VideoCapture(io.BytesIO(video_bytes))
-#     fps = video.get(cv2.CAP_PROP_FPS)
-#     frames = []
-#     success, frame = video.read()
-#     while success:
-#         frames.append(frame)
-#         for _ in range(int(fps)):  # Skip fps frames
-#             success, frame = video.read()
-#     video.release()
-#     return frames[:4]  # Return the first 4 frames
-
-def video_to_frames(video_path):
+def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
-    # List to hold frames encoded as PNG
     frames_png = []
-
-    # Open the video file
-    cap = cv2.VideoCapture(video_path)
-
-    # Check if video opened successfully
+    cap = cv2.VideoCapture(video)
+
     if not cap.isOpened():
         print("Error opening video file")
         return frames_png
 
-    # Read until video is completed
+    frame_count = 0
+    frame_interval = int(cap.get(cv2.CAP_PROP_FPS)) // fps  # Calculate frame interval
+
     while cap.isOpened():
-        # Capture frame-by-frame
        ret, frame = cap.read()
-
-        # If frame is read correctly ret is True
        if not ret:
            print("Can't receive frame (stream end?). Exiting ...")
            break
 
-        # Convert the frame to PNG and store it
-        is_success, buffer = cv2.imencode(".png", frame)
-        if is_success:
-            frames_png.append(np.array(buffer).tobytes())
+        if frame_count % frame_interval == 0:
+            is_success, buffer = cv2.imencode(".png", frame)
+            if is_success:
+                frames_png.append(np.array(buffer).tobytes())
+
+        frame_count += 1
 
-    # When everything done, release the video capture object
     cap.release()
-
     return frames_png
 
+def extract_frames(frame):
+
+    # Convert binary data to a numpy array
+    frame_np = np.frombuffer(frame, dtype=np.uint8)
+
+    # Decode the PNG image
+    image_rgb = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)  # Assuming it's in RGB format
+
+    # Convert RGB to BGR
+    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
+
+    return image_bgr
+
 def predict_answer(image, video, question, max_tokens=100):
 
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
 
-    if image:
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -86,13 +78,13 @@ def predict_answer(image, video, question, max_tokens=100):
 
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
-    elif video:
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
         for frame in frames:
-            frame = Image.open(frame).convert("RGB")
-            image_tensor = model.image_preprocess(frame)
+            image = extract_frames(frame)
+            image_tensor = model.image_preprocess(image)
 
             # Generate the answer
             output_ids = model.generate(
@@ -114,21 +106,17 @@ def predict_answer(image, video, question, max_tokens=100):
 def gradio_predict(image, video, question, max_tokens):
     answer = predict_answer(image, video, question, max_tokens)
     return answer
-
-
 
-# Define the Gradio interface
 iface = gr.Interface(
     fn=gradio_predict,
-    inputs=[gr.Image(type="pil", label="Upload or Drag an Image"),
-            gr.Video(label="upload your video here"),
-            gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-            gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
+    inputs=[
+        gr.Image(type="pil", label="Upload or Drag an Image"),
+        gr.Video(label="Upload your video here"),
+    ],
     outputs=gr.TextArea(label="Answer"),
-    # examples=examples,
-    title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",
-    # description="An interactive chat model that can answer questions about images in an Academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding slides from PowerPoint presentations used in classes and the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors. So, if I present any PowerPoint slide, it explains it just like my professor would, further it can be personalized.",
+    # outputs=gr.Image(label="Output"),
+    title="Video/Image Viewer",
+    description="Upload an image or video to view it or extract frames from the video.",
 )
 
-# Launch the app
-iface.queue().launch(debug=True)
+iface.launch(debug=True)
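
For reference, a minimal sketch (not part of the commit) of how the two helpers added here could be exercised directly, assuming a local file "sample.mp4" and the video_to_frames / extract_frames functions defined above:

# Hypothetical quick check, assuming "sample.mp4" exists locally.
frames = video_to_frames("sample.mp4", fps=1)   # roughly one PNG-encoded frame per second of video
print(f"Extracted {len(frames)} frames")
if frames:
    img = extract_frames(frames[0])             # decode PNG bytes back to an (H, W, 3) ndarray
    print(img.shape)                            # e.g. (720, 1280, 3)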