fffiloni committed on
Commit
fa29376
1 Parent(s): cd5d77e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -2
app.py CHANGED
@@ -40,7 +40,7 @@ def extract_frames(video_in, output_format='.jpg'):
40
  # Adjust interval to video length
41
  video_clip = VideoFileClip(video_in)
42
  if video_clip.duration <= 5:
43
- interval = 12
44
  else :
45
  interval = 24
46
 
@@ -165,9 +165,11 @@ def llm_process(user_prompt):
165
 
166
  def infer(video_in):
167
  # Extract frames from a video
 
168
  frame_files = extract_frames(video_in)
169
 
170
  # Process each extracted frame and collect results in a list
 
171
  processed_texts = []
172
  for frame_file in frame_files:
173
  text = process_image(frame_file)
@@ -184,6 +186,7 @@ def infer(video_in):
184
  print(extracted_audio)
185
 
186
  # Get description of audio content
 
187
  audio_content_described = get_salmonn(extracted_audio)
188
  else :
189
  audio_content_described = "Video has no sound."
@@ -195,6 +198,7 @@ def infer(video_in):
195
  print(formatted_captions)
196
 
197
  # Send formatted captions to LLM
 
198
  video_description_from_llm = llm_process(formatted_captions)
199
 
200
  return video_description_from_llm
@@ -213,7 +217,12 @@ div#video-text textarea {
213
  with gr.Blocks(css=css) as demo :
214
  with gr.Column(elem_id="col-container"):
215
  gr.HTML("""
216
- <h2 style="text-align: center;">Soft video understanding</h2>
 
 
 
 
 
217
  """)
218
  with gr.Row():
219
  with gr.Column():
 
40
  # Adjust interval to video length
41
  video_clip = VideoFileClip(video_in)
42
  if video_clip.duration <= 5:
43
+ interval = 6
44
  else :
45
  interval = 24
46
 
 
165
 
166
  def infer(video_in):
167
  # Extract frames from a video
168
+ gr.info("Extracting frames...")
169
  frame_files = extract_frames(video_in)
170
 
171
  # Process each extracted frame and collect results in a list
172
+ gr.Info("Captioning frames ...")
173
  processed_texts = []
174
  for frame_file in frame_files:
175
  text = process_image(frame_file)
 
186
  print(extracted_audio)
187
 
188
  # Get description of audio content
189
+ gr.Info("Getting audio description from extracted sound ...")
190
  audio_content_described = get_salmonn(extracted_audio)
191
  else :
192
  audio_content_described = "Video has no sound."
 
198
  print(formatted_captions)
199
 
200
  # Send formatted captions to LLM
201
+ gr.Info("Try to provide a video understanding with provided elements ...")
202
  video_description_from_llm = llm_process(formatted_captions)
203
 
204
  return video_description_from_llm
 
217
  with gr.Blocks(css=css) as demo :
218
  with gr.Column(elem_id="col-container"):
219
  gr.HTML("""
220
+ <h2 style="text-align: center;">Soft Video Understanding</h2>
221
+ <p style="text-align: center;">
222
+ An experiment to try to achieve what I call "soft video understanding" with open-source available models. <br />
223
+ We use moondream1 to caption extracted frames, salmonn to analyze extracted audio, then send visual and audio details to Zephyr, which is instructed to summarize what it understood.
224
+ The instructions prompt is available for further discussion with the Community.
225
+ </p>
226
  """)
227
  with gr.Row():
228
  with gr.Column():