GPT-4o-omni-text-audio-image-video

Running

awacke1 committed on May 24, 2024

Commit

053774d

verified ·

1 Parent(s): eb21303

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1614,7 +1614,7 @@ def save_imageold(image_input, filename_txt):
     return image_input.name
-def process_audio(audio_input):
     if audio_input:
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
@@ -1623,7 +1623,7 @@ def process_audio(audio_input):
         response = client.chat.completions.create(
             model=MODEL,
             messages=[
-            {"role": "system", "content":"""You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."""},
             {"role": "user", "content": [{"type": "text", "text": f"The audio transcription is: {transcription.text}"}],}
             ],
             temperature=0,
@@ -1736,8 +1736,13 @@ def main():
     elif option == "Audio":
         audio_input = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
-        process_audio(audio_input)
     elif option == "Video":
         video_input = st.file_uploader("Upload a video file", type=["mp4"])
         process_audio_and_video(video_input)

     return image_input.name
+def process_audio(audio_input, text_input):
     if audio_input:
         transcription = client.audio.transcriptions.create(
             model="whisper-1",
         response = client.chat.completions.create(
             model=MODEL,
             messages=[
+            {"role": "system", "content":{text_input}},
             {"role": "user", "content": [{"type": "text", "text": f"The audio transcription is: {transcription.text}"}],}
             ],
             temperature=0,
     elif option == "Audio":
+        text = "Transcribe and answer questions as a helpful audio music and speech assistant.  "
+        #text = "You are generating a transcript summary. Create a summary of the provided transcription. Respond in Markdown."
+        text_input = st.text_input(label="Enter text prompt to use with Audio context.", value=text)
         audio_input = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
+        audio_response = process_audio(audio_input, text_input)
     elif option == "Video":
         video_input = st.file_uploader("Upload a video file", type=["mp4"])
         process_audio_and_video(video_input)