ManishThota committed
Commit 5c72980 (1 parent: a333293)

Update app.py

Files changed (1):
  1. app.py +43 -51
app.py CHANGED
@@ -18,6 +18,8 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
 tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
 
 
+
+
 def video_to_frames(video, fps=1):
     """Converts a video file into frames and stores them as PNG images in a list."""
     frames_png = []
@@ -59,31 +61,13 @@ def extract_frames(frame):
 
     return image_bgr
 
-def predict_answer(image, video, question, max_tokens=100):
+def predict_answer(image, video, question):
 
     text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
     input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
 
-    # frames = video_to_frames(video)
-    # answers = []
-    # for i in range(len(frames)):
-    #     image = extract_frames(frames[i])
-    #     image_tensor = model.image_preprocess([image])
-
-    #     # Generate the answer
-    #     output_ids = model.generate(
-    #         input_ids,
-    #         max_new_tokens=max_tokens,
-    #         images=image_tensor,
-    #         use_cache=True)[0]
-
-    #     answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
-    #     answers.append(answer)
-    # return answers
-
-
 
-    if image:
+    if image is not None:
         # Process as an image
         image = image.convert("RGB")
         image_tensor = model.image_preprocess(image)
@@ -91,30 +75,30 @@ def predict_answer(image, video, question, max_tokens=100):
         # Generate the answer
         output_ids = model.generate(
             input_ids,
-            max_new_tokens=max_tokens,
+            max_new_tokens=25,
             images=image_tensor,
             use_cache=True)[0]
 
         return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
 
-    elif video:
+    elif video is not None:
         # Process as a video
         frames = video_to_frames(video)
         answers = []
-        for i in range(len(frames)):
-            image = extract_frames(frames[i])
+        for frame in frames:
+            image = extract_frames(frame)
             image_tensor = model.image_preprocess([image])
 
             # Generate the answer
             output_ids = model.generate(
                 input_ids,
-                max_new_tokens=max_tokens,
+                max_new_tokens=25,
                 images=image_tensor,
                 use_cache=True)[0]
 
             answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
             answers.append(answer)
-        return ast.literal_eval(answers[0])
+        return "\n".join(answers)
 
     else:
         return "Unsupported file type. Please upload an image or video."
@@ -122,39 +106,47 @@ def predict_answer(image, video, question, max_tokens=100):
 
 
 
-def gradio_predict(image, video, question, max_tokens):
-    answer = predict_answer(image, video, question, max_tokens)
+def gradio_predict(image, video, question):
+    answer = predict_answer(image, video, question)
     return answer
 
-# iface = gr.Interface(
-#     fn=gradio_predict,
-#     inputs=[
-#         gr.Image(type="pil", label="Upload or Drag an Image"),
-#         gr.Video(label="Upload your video here"),
-#         gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
-#         gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
-#     outputs=gr.TextArea(label="Answer"),
-#     # outputs=gr.Image(label="Output"),
-#     title="Video/Image Viewer",
-#     description="Upload an image or video to view it or extract frames from the video.",
-# )
-
-# iface.launch(debug=True)
-
-
-with gr.Blocks() as app:
-    gr.Markdown("### Upload an Image or Video")
+css = """
+#container{
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    width: 50%;
+}
+#intro{
+    max-width: 100%;
+    margin: 0 auto;
+    text-align: center;
+}
+"""
+with gr.Blocks(css=css) as app:
+    with gr.Row(elem_id="container"):
+        gr.Markdown("""<div style='text-align: center;'><img src="https://github-production-user-asset-6210df.s3.amazonaws.com/37763863/311454340-af72f848-9735-4d49-830b-885ffbb81091.jpeg?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240309%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240309T165700Z&X-Amz-Expires=300&X-Amz-Signature=51aeb4811afff72e70c083594aaffcca1f4a2b95ddd4adf23ee5e736e4fbfefe&X-Amz-SignedHeaders=host&actor_id=37763863&key_id=0&repo_id=769602947" width="1000" height="500" /></div>""")
+
+    gr.Markdown("""
+    ## This Gradio app serves four purposes:
+    ### 1. Showcasing my ability and experience designing a customizable Gradio application with the Interface/Blocks structure.
+    ### 2. Demonstrating the capabilities of one of my multimodal vision-language models, built with the LLaVA framework.
+    ### 3. Providing a demo for annotating the random images and 4-second videos provided on Notion (https://shorturl.at/givyC).
+    ### 4. Showing the integration of a large language model with a vision encoder.
+    """)
     with gr.Row():
-        image = gr.Image(type="pil", label="Upload or Drag an Image")
         video = gr.Video(label="Upload your video here")
+        image = gr.Image(type="pil", label="Upload or Drag an Image")
     with gr.Row():
         with gr.Column():
-            question = gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", lines=4)
-            tokens = gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")
+            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
+            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")
 
-    btn = gr.Button("Predict")
-    btn.click(gradio_predict, inputs=[image, video, question, tokens], outputs=answer)
+
+    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)
 
 app.launch(debug=True)
+
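
The bodies of video_to_frames and extract_frames fall outside the hunks above; only the docstring and the returned image_bgr are visible. For context, here is a minimal sketch of what such helpers could look like with OpenCV. This is an assumption-based reconstruction, not the Space's actual code: the PNG round-trip, the frame-sampling arithmetic, and the local variable names are inferred from the visible signatures and docstring.

import cv2
import numpy as np

def video_to_frames(video, fps=1):
    """Converts a video file into frames and stores them as PNG images in a list."""
    frames_png = []
    cap = cv2.VideoCapture(video)
    video_fps = cap.get(cv2.CAP_PROP_FPS) or 30   # fall back if FPS metadata is missing
    step = max(int(round(video_fps / fps)), 1)    # keep roughly `fps` frames per second
    index = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if index % step == 0:
            ok, png = cv2.imencode(".png", frame)  # encode the BGR frame as PNG bytes
            if ok:
                frames_png.append(png.tobytes())
        index += 1
    cap.release()
    return frames_png

def extract_frames(frame):
    """Decodes one PNG-encoded frame back into a BGR image array."""
    image_bgr = cv2.imdecode(np.frombuffer(frame, dtype=np.uint8), cv2.IMREAD_COLOR)
    return image_bgr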
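
A quick sanity check of the updated predict_answer signature outside the Gradio UI — a sketch assuming the module-level model, tokenizer, and device from app.py are already loaded, with placeholder file paths:

from PIL import Image

# Image branch: generation length is fixed at max_new_tokens=25 inside predict_answer.
image = Image.open("slide.png")  # placeholder path
print(predict_answer(image, None, "Can you explain the slide?"))

# Video branch: one answer per sampled frame, joined with newlines.
print(predict_answer(None, "clip.mp4", "What is happening in the video?"))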