Ridealist committed on
Commit acb6133
1 Parent(s): fe6c5fc

feat: Apply verbal evaluation with vision

Files changed (1)
  1. src/obs_eval_gradio.py +131 -70
src/obs_eval_gradio.py CHANGED
@@ -10,6 +10,8 @@ from langchain.chat_models import ChatOpenAI
 from langchain.schema import StrOutputParser
 from PIL import Image
 
+from prompts import VISION_SYSTEM_PROMPT, AUDIO_SYSTEM_PROMPT, USER_PROMPT_TEMPLATE, FINAL_EVALUATION_PROMPT
+
 
 global_dict = {}
 
@@ -60,6 +62,11 @@ def _process_video(video_file):
     # Read and process the video file
     video = cv2.VideoCapture(video_file.name)
 
+    if 'video_file' not in global_dict:
+        global_dict.setdefault('video_file', video_file.name)
+    else:
+        global_dict['video_file'] = video_file.name
+
     base64Frames = []
     while video.isOpened():
         success, frame = video.read()
@@ -76,17 +83,6 @@ def _process_video(video_file):
         raise gr.Error(f"Cannot open the video.")
     return base64Frames
 
-def _process_audio(video_file, api_key):
-    audio_file = open(video_file.name, "rb")
-
-    client = openai.OpenAI(api_key=api_key)
-    transcript = client.audio.transcriptions.create(
-        model="whisper-1",
-        file=audio_file,
-        response_format="text"
-    )
-    return transcript
-
 
 def _make_video_batch(video_file, batch_size, total_batch_percent):
 
@@ -142,6 +138,28 @@ def show_batches(video_file, batch_size, total_batch_percent):
     return images
 
 
+def show_audio_transcript(video_file, api_key):
+    previous_video_file = global_dict.get('video_file')
+
+    if global_dict.get('transcript') and previous_video_file == video_file.name:
+        return global_dict['transcript']
+    else:
+        audio_file = open(video_file.name, "rb")
+
+        client = openai.OpenAI(api_key=api_key)
+        transcript = client.audio.transcriptions.create(
+            model="whisper-1",
+            file=audio_file,
+            response_format="text"
+        )
+        if 'transcript' not in global_dict:
+            global_dict.setdefault('transcript', transcript)
+        else:
+            global_dict['transcript'] = transcript
+
+        return transcript
+
+
 def change_audio_rubric(choice):
     if choice == "Video only":
         return gr.Textbox(visible=False)
@@ -167,24 +185,24 @@ def change_audio_eval(choice):
     )
 
 
-def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
+def call_gpt_vision(api_key, rubrics, progress=gr.Progress()) -> list:
     frames = global_dict.get('batched_frames')
     openai.api_key = api_key
 
-    full_result = []
-    full_text = ""
+    full_result_vision = []
+    full_text_vision = ""
     idx = 0
 
     for batch in progress.tqdm(frames):
-        PROMPT_MESSAGES = [
+        VISION_PROMPT_MESSAGES = [
            {
                "role": "system",
-                "content": "You will evaluate the behavior of the person in the sequences of images. They show discrete parts of the whole continuous behavior. You should only evaluate the parts you can rate based on the given images. Remember, you're evaluating the given parts to evaluate the whole continuous behavior, and you'll connect them later to evaluate the whole. Never add your own judgment. Evlaute only in the contents of images themselves. If you can't evaluate it, just answer '(Unevaluable)'"
+                "content": VISION_SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": [
-                    "Evaluate the behavior's actions based on the <RUBRIC> provided.\n\n" + instruction,
+                    PromptTemplate.from_template(USER_PROMPT_TEMPLATE).format(rubrics=rubrics),
                    *map(lambda x: {"image": x, "resize": 300}, batch),
                ],
            },
@@ -192,45 +210,99 @@ def call_gpt_vision(api_key, instruction, progress=gr.Progress()):
 
         params = {
             "model": "gpt-4-vision-preview",
-            "messages": PROMPT_MESSAGES,
+            "messages": VISION_PROMPT_MESSAGES,
             "max_tokens": 1024,
         }
 
         try:
             result = openai.chat.completions.create(**params)
             print(result.choices[0].message.content)
-            full_result.append(result)
+            full_result_vision.append(result)
         except Exception as e:
             print(f"Error: {e}")
-            full_text += f'### BATCH_{idx+1}\n' + "-"*50 + "\n" + f"Error: {e}" + "\n" + "-"*50 + "\n"
+            full_text_vision += f'### BATCH_{idx+1}\n' + "-"*50 + "\n" + f"Error: {e}" + "\n" + "-"*50 + "\n"
             idx += 1
             pass
 
-        if 'full_result' not in global_dict:
-            global_dict.setdefault('full_result', full_result)
+        if 'full_result_vision' not in global_dict:
+            global_dict.setdefault('full_result_vision', full_result_vision)
         else:
-            global_dict['full_result'] = full_result
+            global_dict['full_result_vision'] = full_result_vision
 
         print(f'### BATCH_{idx+1}')
         print('-'*100)
-        full_text += f'### BATCH_{idx+1}\n' + "-"*50 + "\n" + result.choices[0].message.content + "\n" + "-"*50 + "\n"
+        full_text_vision += f'### BATCH_{idx+1}\n' + "-"*50 + "\n" + result.choices[0].message.content + "\n" + "-"*50 + "\n"
         idx += 1
         time.sleep(2)
 
-    return full_text
+    return full_text_vision
+
+
+def call_gpt_audio(api_key, rubrics) -> str:
+    transcript = global_dict.get('transcript')
+    openai.api_key = api_key
+
+    full_text_audio = ""
+
+    PROMPT_MESSAGES = [
+        {
+            "role": "system",
+            "content": AUDIO_SYSTEM_PROMPT,
+        },
+        {
+            "role": "user",
+            "content": PromptTemplate.from_template(USER_PROMPT_TEMPLATE).format(rubrics=rubrics) + "\n\n<TEXT>\n" + transcript
+        },
+    ]
+    params = {
+        "model": "gpt-4",
+        "messages": PROMPT_MESSAGES,
+        "max_tokens": 1024,
+    }
+
+    try:
+        result = openai.chat.completions.create(**params)
+        full_text_audio = result.choices[0].message.content
+        print(full_text_audio)
+    except openai.OpenAIError as e:
+        print(f"Failed to connect to OpenAI: {e}")
+        pass
+
+    if 'full_text_audio' not in global_dict:
+        global_dict.setdefault('full_text_audio', full_text_audio)
+    else:
+        global_dict['full_text_audio'] = full_text_audio
+
+    return full_text_audio
 
 
 def get_full_result():
-    full_result = global_dict.get('full_result')
+    full_result_vision = global_dict.get('full_result_vision')
+    full_result_audio = global_dict.get('full_text_audio')
 
-    result_text = ""
-
-    for idx, res in enumerate(full_result):
-        result_text += f'<Evaluation_{idx+1}>\n'
-        result_text += res.choices[0].message.content
-        result_text += "\n"
-        result_text += "-"*5
-        result_text += "\n"
+    result_text_video = ""
+    result_text_audio = ""
+
+
+    for idx, res in enumerate(full_result_vision):
+        result_text_video += f'<Video Evaluation_{idx+1}>\n'
+        result_text_video += res.choices[0].message.content
+        result_text_video += "\n"
+        result_text_video += "-"*5
+        result_text_video += "\n"
+    result_text_video += "*"*5 + "END of Video" + "*"*5
+
+    if full_result_audio:
+        result_text_audio += '<Audio Evaluation>\n'
+        result_text_audio += full_result_audio
+        result_text_audio += "\n"
+        result_text_audio += "-"*5
+        result_text_audio += "\n"
+        result_text_audio += "*"*5 + "END of Audio" + "*"*5
+
+        result_text = result_text_video + "\n\n" + result_text_audio
+    else:
+        result_text = result_text_video
 
     global_dict.setdefault('result_text', result_text)
 
@@ -238,29 +310,13 @@ def get_full_result():
 
 
 def get_final_anser(api_key, result_text):
-    chain = ChatOpenAI(model="gpt-4", max_tokens=1024, temperature=0, api_key=api_key)
-    prompt = PromptTemplate.from_template(
-        """
-        You see the following list of texts that evaluate forward roll:
-        {evals}
-        Write an full text that synthesizes and summarizes the contents of all the text above.
-        Each evaluates a specific part, and you should combine them based on what was evaluated in each part.
-        The way to combine them is 'or', not 'and', which means you only need to evaluate the parts of a post that are rated based on that.
-        Concatenate based on what was evaluated, if anything.
-
-        Example:
-        an overview of evaluations
-        1. Specific assessments for each item
-        2.
-        3.
-        ....
-        Overall opinion
-
-        Total score : 1~10 / 10
-
-        Output:
-        """
+    chain = ChatOpenAI(
+        api_key=api_key,
+        model="gpt-4",
+        max_tokens=1024,
+        temperature=0,
     )
+    prompt = PromptTemplate.from_template(FINAL_EVALUATION_PROMPT)
     runnable = prompt | chain | StrOutputParser()
 
     final_eval = runnable.invoke({"evals": result_text})
@@ -294,11 +350,11 @@ def main():
                 )
                 total_batch_percent = gr.Slider(
                     label="Percentage(%) of batched image frames to total frames",
-                    info="Choose between 5(%) and 20(%)",
-                    value=5,
-                    minimum=5,
-                    maximum=20,
-                    step=5
+                    info="Choose between 1(%) and 5(%)",
+                    value=3,
+                    minimum=1,
+                    maximum=5,
+                    step=1
                 )
                 process_button = gr.Button("Process")
             with gr.Column(scale=1):
@@ -326,20 +382,20 @@ def main():
                 rubric_video_input = gr.Textbox(
                     label="2. Video Evaluation Rubric",
                     info="Enter your evaluation rubric here...",
-                    placeholder="<RUBRIC>\nHere's what the performer should *SHOW* as follows:\n1. From standing, bend your knees and straighten your arms in front of you.\n2. Place your hands on the floor, shoulder width apart with fingers pointing forward and your chin on your chest.\n3. Rock forward, straighten legs and transfer body weight onto shoulders.\n4. Rock forward on a rounded back placing both feet on the floor.\n5. Stand using arms for balance, without hands touching the floor.",
+                    placeholder="Here's what the performer should *SHOW* as follows:\n1. From standing, bend your knees and straighten your arms in front of you.\n2. Place your hands on the floor, shoulder width apart with fingers pointing forward and your chin on your chest.\n3. Rock forward, straighten legs and transfer body weight onto shoulders.\n4. Rock forward on a rounded back placing both feet on the floor.\n5. Stand using arms for balance, without hands touching the floor.",
                     lines=7
                 )
                 rubric_audio_input = gr.Textbox(
                     label="3. Audio Evaluation Rubric (if needed)",
                     info="Enter your evaluation rubric here...",
-                    placeholder="<RUBRIC>\nHere's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
+                    placeholder="Here's what the performer should *SAY* as follows:\n1. From standing, you need to shout 'Start' signal.\n2. Rock forward, you shouldn't make any noise while rolling.\n3. Standing still again, you need to shout 'Finish' signal.",
                     interactive=True,
                     visible=True,
                     lines=7
                 )
                 multimodal_radio.change(fn=change_audio_rubric, inputs=multimodal_radio, outputs=rubric_audio_input)
 
-                submit_button = gr.Button("Evaluate")
+                evaluate_button = gr.Button("Evaluate")
             with gr.Column(scale=1):
                 video_output_box = gr.Textbox(
                     label="Video Batched Snapshots Eval...",
@@ -362,18 +418,23 @@ def main():
                     label="FULL Response",
                     info="You can edit partial evaluation in here...",
                     lines=10,
-                    interactive=True)
-                submit_button_2 = gr.Button("Summarize")
+                    interactive=True,
+                    show_copy_button=True,
+                )
+                summarize_button = gr.Button("Summarize")
 
             with gr.Column(scale=1):
                 output_box_fin_fin = gr.Textbox(label="Final Evaluation", lines=10, interactive=True)
 
 
-        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=_process_audio, inputs=[video_upload, api_key_input], outputs=transcript_box).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery)
-        submit_button.click(fn=call_gpt_vision, inputs=[api_key_input, rubric_video_input], outputs=video_output_box).then().then(get_full_result, None, output_box_fin)
-        submit_button_2.click(fn=get_final_anser, inputs=[api_key_input, output_box_fin], outputs=output_box_fin_fin)
+        process_button.click(fn=validate_api_key, inputs=api_key_input, outputs=None).success(fn=show_batches, inputs=[video_upload, batch_size, total_batch_percent], outputs=gallery).success(fn=show_audio_transcript, inputs=[video_upload, api_key_input], outputs=transcript_box)
+        if rubric_audio_input.visible:
+            evaluate_button.click(fn=call_gpt_vision, inputs=[api_key_input, rubric_video_input], outputs=video_output_box).then(fn=call_gpt_audio, inputs=[api_key_input, rubric_audio_input], outputs=audio_output_box).then(get_full_result, None, output_box_fin)
+        else:
+            evaluate_button.click(fn=call_gpt_vision, inputs=[api_key_input, rubric_video_input], outputs=video_output_box).then(get_full_result, None, output_box_fin)
+        summarize_button.click(fn=get_final_anser, inputs=[api_key_input, output_box_fin], outputs=output_box_fin_fin)
 
     demo.launch()
 
 if __name__ == "__main__":
-    main()
+    main()
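
Note: the new import pulls VISION_SYSTEM_PROMPT, AUDIO_SYSTEM_PROMPT, USER_PROMPT_TEMPLATE and FINAL_EVALUATION_PROMPT from a prompts module that is not part of this diff. As a rough guide to what moved out of obs_eval_gradio.py, here is a minimal sketch of such a module, inferred from the inline strings this commit deletes; the wording of the real prompts.py may differ.

# prompts.py -- illustrative sketch only; the actual module is not shown in this commit.
# The constant names come from the new import line; the contents below are assumptions
# reconstructed from the inline prompts that this commit removes.

VISION_SYSTEM_PROMPT = (
    "You will evaluate the behavior of the person in the sequences of images. "
    "They show discrete parts of one continuous behavior. Evaluate only the parts "
    "you can rate from the given images; if a part cannot be evaluated, answer '(Unevaluable)'."
)

# Hypothetical audio counterpart, applied to the Whisper transcript in call_gpt_audio.
AUDIO_SYSTEM_PROMPT = (
    "You will evaluate what the person says in the transcript of a video. "
    "Judge only what the transcript itself supports."
)

# {rubrics} is filled via PromptTemplate.from_template(USER_PROMPT_TEMPLATE).format(rubrics=...)
USER_PROMPT_TEMPLATE = "Evaluate the performance based on the <RUBRIC> provided.\n\n{rubrics}"

# {evals} is filled by get_final_anser via runnable.invoke({"evals": result_text})
FINAL_EVALUATION_PROMPT = """
You see the following list of texts that evaluate a performance:
{evals}
Synthesize and summarize all of the evaluations above, combining the parts that were
actually evaluated, and finish with an overall opinion and a total score out of 10.

Output:
"""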