awacke1 committed on
Commit
e4a7d86
1 Parent(s): 93853eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -2
app.py CHANGED
@@ -67,6 +67,22 @@ def process_audio(audio_input):
67
  )
68
  st.markdown(response.choices[0].message.content)
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def save_video(video_file):
72
  # Save the uploaded video file
@@ -114,7 +130,10 @@ def process_audio_and_video(video_input):
114
 
115
  # Process the saved video
116
  base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
117
-
 
 
 
118
  # Generate a summary with visual and audio
119
  response = client.chat.completions.create(
120
  model=MODEL,
@@ -124,7 +143,7 @@ def process_audio_and_video(video_input):
124
  "These are the frames from the video.",
125
  *map(lambda x: {"type": "image_url",
126
  "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
127
- {"type": "text", "text": f"The audio transcription is: {transcription.text}"}
128
  ]},
129
  ],
130
  temperature=0,
 
67
  )
68
  st.markdown(response.choices[0].message.content)
69
 
70
def process_audio_for_video(video_input):
    """Transcribe a video's audio track with Whisper and summarize it.

    Sends the uploaded video file to the OpenAI transcription endpoint,
    asks the chat model for a Markdown summary of the transcript, renders
    that summary in the Streamlit app, and returns it so the caller can
    reuse it in the combined visual+audio summary prompt.

    Parameters:
        video_input: file-like object accepted by the OpenAI
            ``audio.transcriptions.create`` API (the uploaded video).

    Returns:
        str | None: the Markdown summary text, or ``None`` when no
        input was provided (the guard below is skipped).
    """
    # BUG FIX: the original tested `audio_input`, which is not defined in
    # this scope (it is a parameter of the separate `process_audio`
    # function), so any truthy call raised NameError. Guard on the actual
    # parameter instead.
    if video_input:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=video_input,
        )
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content":"""You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."""},
                {"role": "user", "content": [{"type": "text", "text": f"The audio transcription is: {transcription.text}"}],}
            ],
            temperature=0,
        )
        st.markdown(response.choices[0].message.content)
        return response.choices[0].message.content
86
 
87
  def save_video(video_file):
88
  # Save the uploaded video file
 
130
 
131
  # Process the saved video
132
  base64Frames, audio_path = process_video(video_path, seconds_per_frame=1)
133
+
134
+ # Get the transcript for the video model call
135
+ transcript = process_audio_for_video(video_input)
136
+
137
  # Generate a summary with visual and audio
138
  response = client.chat.completions.create(
139
  model=MODEL,
 
143
  "These are the frames from the video.",
144
  *map(lambda x: {"type": "image_url",
145
  "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
146
+ {"type": "text", "text": f"The audio transcription is: {transcript}"}
147
  ]},
148
  ],
149
  temperature=0,