Spaces:
Sleeping
Sleeping
import requests | |
from smolagents import tool | |
import openai | |
import base64 | |
# Only audio hosted on the course scoring space may be fetched.
_ALLOWED_SCHEME = "https"
_ALLOWED_HOST = "agents-course-unit4-scoring.hf.space"

_NOT_WHITELISTED = "the requested URL is not whitelisted, refusing to fetch data"

# Seconds to wait for the audio download before giving up.
_FETCH_TIMEOUT = 30

# Content-Type media types mapped to the format names passed as
# `input_audio.format`; anything else falls back to "mp3".
_MIME_TO_FORMAT = {
    "audio/mpeg": "mp3",
    "audio/mp3": "mp3",
    "audio/wav": "wav",
    "audio/x-wav": "wav",
    "audio/wave": "wav",
    "audio/vnd.wave": "wav",
}

_ANALYSIS_PROMPT = (
    "Please analyze the contents of this audio file. "
    "Provide a short (two sentence) description of the contents, "
    "and then output your analysis. "
    "The analysis should be in the most appropriate format. "
    "e.g. if the audio is a conversation, a transcription "
    "(indicating who says what) is best, for a monologue, "
    "maybe a simple transcription is best. "
    "if it's nature noises, describe what they are, "
    "the likely locations, etc."
)


def _guess_audio_format(content_type, url_path):
    """Pick "wav" or "mp3" from the Content-Type header, then the URL path."""
    # Drop parameters such as "; charset=..." before looking the type up.
    media_type = (content_type or "").split(";", 1)[0].strip().lower()
    known = _MIME_TO_FORMAT.get(media_type)
    if known is not None:
        return known
    if (url_path or "").lower().endswith(".wav"):
        return "wav"
    # Preserve the previous behaviour when nothing better is known.
    return "mp3"


def analyse_audio(audio_url: str) -> str:
    """
    analyse the provided audio file, and return a description or transcription of the contents.

    Args:
        audio_url (str): The URL of the audio file to be analysed. Usually with an audio extension like mp3, aac, etc.

    Returns:
        str: description or transcription of the contents of the provided audio
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urlparse

    # some security: compare the parsed scheme and host exactly. A substring
    # test would also accept URLs that merely mention the allowed origin in
    # their query string, userinfo, or as a prefix of a longer hostname.
    try:
        parsed = urlparse(audio_url)
        scheme, host, path = parsed.scheme, parsed.hostname, parsed.path
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. broken IPv6 literals).
        return _NOT_WHITELISTED
    if scheme != _ALLOWED_SCHEME or host != _ALLOWED_HOST:
        return _NOT_WHITELISTED

    # Errors are reported as strings, matching the other failure paths, so
    # a network failure does not propagate out of the function.
    try:
        resp = requests.get(audio_url, timeout=_FETCH_TIMEOUT)
    except requests.RequestException as exc:
        return f"failed to fetch the requested audio file: {exc}"
    if resp.status_code != 200:
        return f"failed to fetch the requested audio file: (status={resp.status_code})\n{resp.text}"

    audio_format = _guess_audio_format(resp.headers.get("content-type"), path)
    # The API expects the audio payload as a base64 string.
    audio_b64 = base64.b64encode(resp.content).decode("utf-8")

    # Create the message to the GPT-4o audio model.
    response = openai.chat.completions.create(
        model="gpt-4o-audio-preview",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": _ANALYSIS_PROMPT},
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": audio_b64,
                            "format": audio_format,
                        },
                    },
                ],
            }
        ],
        max_tokens=500,
    )

    content = response.choices[0].message.content
    if content is None:
        # Keep the declared `str` return type even without a text reply.
        return "the model returned no text content for the provided audio"
    return content