Ridealist committed on
Commit 6598fac
1 Parent(s): e2d11b6

feat: apply audio transcript evaluation

Files changed (1)
  1. src/obs_eval_gradio.py +95 -28
src/obs_eval_gradio.py CHANGED
@@ -10,6 +10,7 @@ from langchain.chat_models import ChatOpenAI
 from langchain.schema import StrOutputParser
 from PIL import Image
 
+
 global_dict = {}
 
 ######
@@ -55,9 +56,9 @@ def validate_api_key(api_key):
         raise gr.Error(f"OpenAI returned an API Error: {error}")
 
 
-def _process_video(image_file):
+def _process_video(video_file):
     # Read and process the video file
-    video = cv2.VideoCapture(image_file.name)
+    video = cv2.VideoCapture(video_file.name)
 
     base64Frames = []
     while video.isOpened():
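Note: the body of the frame-reading loop is unchanged and elided from this hunk. For readers following along, a minimal sketch of what a cv2 read-and-encode loop of this shape typically does (the helper name and JPEG encoding are assumptions, not the committed code):

```python
import base64

import cv2

def frames_to_base64(video_path):
    # Hypothetical helper illustrating the elided loop: read each frame,
    # JPEG-encode it, and collect base64 strings for the vision API.
    video = cv2.VideoCapture(video_path)
    base64_frames = []
    while video.isOpened():
        success, frame = video.read()
        if not success:
            break  # no more frames, or the file could not be decoded
        ok, buffer = cv2.imencode(".jpg", frame)
        if ok:
            base64_frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    return base64_frames
```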
@@ -75,10 +76,21 @@ def _process_video(image_file):
         raise gr.Error(f"Cannot open the video.")
     return base64Frames
 
+def _process_audio(video_file, api_key):
+    audio_file = open(video_file.name, "rb")
+
+    client = openai.OpenAI(api_key=api_key)
+    transcript = client.audio.transcriptions.create(
+        model="whisper-1",
+        file=audio_file,
+        response_format="text"
+    )
+    return transcript
+
 
-def _make_video_batch(image_file, batch_size, total_batch_percent):
+def _make_video_batch(video_file, batch_size, total_batch_percent):
 
-    frames = _process_video(image_file)
+    frames = _process_video(video_file)
 
     TOTAL_FRAME_COUNT = len(frames)
     BATCH_SIZE = int(batch_size)
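The new `_process_audio` helper is the heart of this commit: it hands the uploaded file straight to the `whisper-1` transcription endpoint, which accepts common video containers directly, so no separate audio-extraction step is needed. One nit: the file handle opened here is never closed. A sketch of the same call with deterministic cleanup:

```python
import openai

def transcribe_video(path: str, api_key: str) -> str:
    # Same whisper-1 call as _process_audio, but the file handle is
    # closed via a context manager; response_format="text" returns a
    # plain string instead of a JSON object.
    client = openai.OpenAI(api_key=api_key)
    with open(path, "rb") as audio_file:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )
```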
@@ -110,9 +122,9 @@ def _make_video_batch(image_file, batch_size, total_batch_percent):
     return base64FramesBatch
 
 
-def show_batches(image_file, batch_size, total_batch_percent):
+def show_batches(video_file, batch_size, total_batch_percent):
 
-    batched_frames = _make_video_batch(image_file, batch_size, total_batch_percent)
+    batched_frames = _make_video_batch(video_file, batch_size, total_batch_percent)
 
     images = []
     for i, l in enumerate(batched_frames):
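The sampling and batching arithmetic of `_make_video_batch` is elided from the hunk. One plausible reading of how `batch_size` and `total_batch_percent` interact, sketched as an assumption rather than the committed logic:

```python
def make_batches(frames, batch_size, total_batch_percent):
    # Assumed logic: keep total_batch_percent % of all frames, sampled
    # at a uniform stride, then split the sample into chunks of
    # batch_size images each.
    total = len(frames)
    sample_count = max(1, total * int(total_batch_percent) // 100)
    stride = max(1, total // sample_count)
    sampled = frames[::stride][:sample_count]
    batch = int(batch_size)
    return [sampled[i:i + batch] for i in range(0, len(sampled), batch)]
```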
@@ -130,6 +142,31 @@ def show_batches(image_file, batch_size, total_batch_percent):
     return images
 
 
+def change_audio_rubric(choice):
+    if choice == "Video only":
+        return gr.Textbox(visible=False)
+    else:
+        return gr.Textbox(
+            label="3. Audio Evaluation Rubric (if needed)",
+            info="Enter your evaluation rubric here...",
+            placeholder="<RUBRIC>\nHere's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
+            lines=7,
+            interactive=True,
+            visible=True)
+
+
+def change_audio_eval(choice):
+    if choice == "Video only":
+        return gr.Textbox(visible=False)
+    else:
+        return gr.Textbox(
+            label="Audio Script Eval...",
+            lines=10,
+            interactive=False,
+            visible=True
+        )
+
+
 def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
     frames = global_dict.get('batched_frames')
     openai.api_key = api_key
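Both new `change_*` handlers use the Gradio 4 idiom of returning a freshly constructed component from an event callback to patch the wired output component's properties in place (Gradio 3 expressed this as `gr.Textbox.update(...)`). Reduced to its essentials:

```python
import gradio as gr

def toggle_box(choice):
    # Returning a component instance from a handler updates the wired
    # output component in place rather than creating a new one.
    return gr.Textbox(visible=(choice != "Video only"))

with gr.Blocks() as demo:
    mode = gr.Radio(choices=["Video + Audio", "Video only"], value="Video + Audio")
    box = gr.Textbox(label="Audio rubric", visible=True)
    mode.change(fn=toggle_box, inputs=mode, outputs=box)
```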
@@ -147,7 +184,7 @@ def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
         {
             "role": "user",
             "content": [
-                "Evaluate the behavior's actions based on the <CRITERIA> provided.\n\n" + instruction,
+                "Evaluate the behavior's actions based on the <RUBRIC> provided.\n\n" + instruction,
                 *map(lambda x: {"image": x, "resize": 300}, batch),
             ],
         },
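The `{"image": ..., "resize": 300}` items follow the shorthand content format from OpenAI's GPT-4 Vision cookbook, where a single user message mixes one text entry with many base64-encoded frames. A sketch of one per-batch request; the model name is an assumption, since the actual call sits in an elided part of `call_gpt_vision`:

```python
import openai

def evaluate_batch(api_key, instruction, batch):
    # One chat request per frame batch: rubric text first, then every
    # base64-encoded frame, downscaled server-side via "resize".
    client = openai.OpenAI(api_key=api_key)
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",  # assumed; not shown in the hunk
        messages=[{
            "role": "user",
            "content": [
                "Evaluate the behavior's actions based on the <RUBRIC> provided.\n\n" + instruction,
                *({"image": frame, "resize": 300} for frame in batch),
            ],
        }],
        max_tokens=1024,
    )
    return response.choices[0].message.content
```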
@@ -234,7 +271,7 @@ def get_final_anser(api_key, result_text):
 def main():
     with gr.Blocks() as demo:
         gr.Markdown("# GPT-4 Vision for Evaluation")
-        gr.Markdown("## 1st STEP. Make Batched Snapshots")
+        gr.Markdown("## 1st STEP. Make Batched Snapshots & Audio Script")
         with gr.Row():
             with gr.Column(scale=1):
                 api_key_input = gr.Textbox(
@@ -247,47 +284,77 @@ def main():
                     label="Upload your video (under 1 minute video is the best..!)",
                     file_types=["video"],
                 )
-                batch_size = gr.Number(
+                batch_size = gr.Slider(
                     label="Number of images in one batch",
-                    info="(2<=N<=5)",
+                    info="Choose between 2 and 5",
                     value=5,
                     minimum=2,
-                    maximum=5
+                    maximum=5,
+                    step=1
                 )
-                total_batch_percent = gr.Number(
+                total_batch_percent = gr.Slider(
                     label="Percentage(%) of batched image frames to total frames",
-                    info="(5<=P<=20)",
+                    info="Choose between 5(%) and 20(%)",
                     value=5,
                     minimum=5,
                     maximum=20,
                     step=5
                 )
-                process_button = gr.Button("Process")
-
+                process_button = gr.Button("Process")
             with gr.Column(scale=1):
                 gallery = gr.Gallery(
                     label="Batched Snapshots of Video",
                     columns=[5],
-                    rows=[1],
                     object_fit="contain",
                     height="auto"
                 )
-        gr.Markdown("## 2nd STEP. Set Evaluation Criteria")
+                transcript_box = gr.Textbox(
+                    label="Audio Transcript",
+                    lines=8,
+                    interactive=False
+                )
+
+        gr.Markdown("## 2nd STEP. Set Evaluation Rubric")
         with gr.Row():
             with gr.Column(scale=1):
-                instruction_input = gr.Textbox(
-                    label="Evaluation Criteria",
-                    info="Enter your evaluation criteria here...",
-                    placeholder="<CRITERIA>\nThe correct way to do a forward roll is as follows:\n1. From standing, bend your knees and straighten your arms in front of you.\n2. Place your hands on the floor, shoulder width apart with fingers pointing forward and your chin on your chest.\n3. Rock forward, straighten legs and transfer body weight onto shoulders.\n4. Rock forward on a rounded back placing both feet on the floor.\n5. Stand using arms for balance, without hands touching the floor.",
-                    lines=7)
-                submit_button = gr.Button("Evaluate")
+                multimodal_radio = gr.Radio(
+                    label="1. Multimodal Selection",
+                    info="Choose evaluation channel",
+                    value="Video + Audio",
+                    choices=["Video + Audio", "Video only"]
+                )
+                rubric_video_input = gr.Textbox(
+                    label="2. Video Evaluation Rubric",
+                    info="Enter your evaluation rubric here...",
+                    placeholder="<RUBRIC>\nHere's what the performer should *SHOW* as follows:\n1. From standing, bend your knees and straighten your arms in front of you.\n2. Place your hands on the floor, shoulder width apart with fingers pointing forward and your chin on your chest.\n3. Rock forward, straighten legs and transfer body weight onto shoulders.\n4. Rock forward on a rounded back placing both feet on the floor.\n5. Stand using arms for balance, without hands touching the floor.",
+                    lines=7
+                )
+                rubric_audio_input = gr.Textbox(
+                    label="3. Audio Evaluation Rubric (if needed)",
+                    info="Enter your evaluation rubric here...",
+                    placeholder="<RUBRIC>\nHere's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
+                    interactive=True,
+                    visible=True,
+                    lines=7
+                )
+                multimodal_radio.change(fn=change_audio_rubric, inputs=multimodal_radio, outputs=rubric_audio_input)
 
+                submit_button = gr.Button("Evaluate")
             with gr.Column(scale=1):
-                output_box = gr.Textbox(
-                    label="Batched Generated Response...(Streaming)",
+                video_output_box = gr.Textbox(
+                    label="Video Batched Snapshots Eval...",
                     lines=10,
                     interactive=False
                 )
+                audio_output_box = gr.Textbox(
+                    label="Audio Script Eval...",
+                    lines=10,
+                    interactive=False,
+                    visible=True
+                )
+                multimodal_radio.change(fn=change_audio_eval, inputs=multimodal_radio, outputs=audio_output_box)
+
+
         gr.Markdown("## 3rd STEP. Summarize and Get Result")
         with gr.Row():
             with gr.Column(scale=1):
@@ -299,11 +366,11 @@ def main():
                 submit_button_2 = gr.Button("Summarize")
 
             with gr.Column(scale=1):
-                output_box_fin_fin = gr.Textbox(label="FINAL EVALUATION", lines=10, interactive=True)
+                output_box_fin_fin = gr.Textbox(label="Final Evaluation", lines=10, interactive=True)
 
 
-        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery)
-        submit_button.click(fn=call_gpt_vision, inputs=[api_key_input, instruction_input], outputs=output_box).then(get_full_result, None, output_box_fin)
+        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=_process_audio, inputs=[video_upload, api_key_input], outputs=transcript_box).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery)
+        submit_button.click(fn=call_gpt_vision, inputs=[api_key_input, rubric_video_input], outputs=video_output_box).then(get_full_result, None, output_box_fin)
         submit_button_2.click(fn=get_final_anser, inputs=[api_key_input, output_box_fin], outputs=output_box_fin_fin)
 
     demo.launch()
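The rewired `process_button.click` chain leans on Gradio's `.success()` sequencing: each step fires only if the previous one completed without raising, so an invalid key (where `validate_api_key` raises `gr.Error`) short-circuits both the transcription and the batching. The pattern in isolation:

```python
import gradio as gr

def validate(key):
    # Raising gr.Error aborts the chain; later .success() steps never fire.
    if not key:
        raise gr.Error("Missing API key")

def step_a(key):
    return "transcript..."

def step_b(key):
    return "batches..."

with gr.Blocks() as demo:
    key = gr.Textbox(label="API key")
    out_a = gr.Textbox()
    out_b = gr.Textbox()
    btn = gr.Button("Process")
    # .success() (unlike .then()) runs only when the prior step succeeded.
    btn.click(fn=validate, inputs=key, outputs=None) \
       .success(fn=step_a, inputs=key, outputs=out_a) \
       .success(fn=step_b, inputs=key, outputs=out_b)
```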
 