import json
import os
import subprocess
import threading
import time
from typing import Iterator

import gradio as gr
from elevenlabs import play, stream
from elevenlabs.client import AsyncElevenLabs, ElevenLabs
from groq import Groq

from utils.mapper import AVAILABLE_FUNCTIONS, COMMON_VARS, TOOLS
|
|
| |
# API clients. Keys are read from the environment; the in-source values are
# kept only as fallbacks for backward compatibility.
# SECURITY NOTE(review): these keys are committed to the repository — rotate
# them and delete the fallback literals.
client = Groq(
    api_key=os.environ.get(
        "GROQ_API_KEY",
        "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",
    )
)
elevenlabs_client = ElevenLabs(
    api_key=os.environ.get(
        "ELEVENLABS_API_KEY",
        "sk_16d08614d675e9de0a89bdbff094c6332fceaafbb280f4b3",
    )
)
| |
| |
| |
|
|
|
|
def text_to_speech_file(text: str, play_audio: bool) -> Iterator[bytes]:
    """Synthesize *text* with ElevenLabs and optionally play it immediately.

    Args:
        text: The text to convert to speech.
        play_audio: When True, the audio is streamed to the speakers as it
            is generated.

    Returns:
        An iterator over MP3 byte chunks. The original implementation
        returned the raw stream after ``stream()`` had already consumed it,
        so callers always got an exhausted iterator; chunks are now buffered
        during playback so the returned iterator is usable either way.
    """
    audio = elevenlabs_client.generate(
        text=text,
        voice="Adam",
        model="eleven_turbo_v2_5",
        stream=True,
        optimize_streaming_latency=3,
    )
    if not play_audio:
        return audio

    print("streaming")
    # Tee chunks into a buffer while streaming so the return value still
    # yields the audio after playback has drained the source generator.
    chunks: list[bytes] = []

    def _tee() -> Iterator[bytes]:
        for chunk in audio:
            chunks.append(chunk)
            yield chunk

    stream(_tee())
    return iter(chunks)
|
|
|
|
def create_content(result):
    """Build the system prompt for the tool-selection chat completion.

    Args:
        result: The combined transcription/context text. Kept for interface
            compatibility; the caller sends it separately as the user
            message, so it is not embedded here.

    Returns:
        The system-prompt string, including the step-by-step instructions
        and the default parameter values from ``COMMON_VARS``.
    """
    additional_text = """Based on the user's request, follow these steps:
1. **Understand the Request:** Read the user's request carefully to determine the specific needs or actions
required.
2. **Match with Functions:** Compare the user's request with the available functions in the tools. Identify
which function aligns with the user's needs.
3. **Select the Best Function:** Choose the function that best matches the user's request.
4. **Call the Function:** Use the selected function from the tools to fulfill the user's request.
"""
    content = (
        f"""You are an AI assistant that will suggest and call the functions provided in the tools based on the
user's request. You need to analyze the user's request and select the function from the provided tools that
best matches the request and provide the results by calling the appropriate function.

Expect all parameters from user request, Consider the dates according to the user query for example if user
asking some operations for today, it should be understood to get today's date in YYYY-MM-DD format for date
parameters.

If required parameters are not in the user request these default can be used {COMMON_VARS}

"""
        # Fix: the step-by-step instructions were built but never appended,
        # so they never reached the model.
        + additional_text
    )
    return content
|
|
|
|
def background_task(audio_file_path, language, additional_text):
    """Transcribe audio, run the tool-calling pipeline, and speak the answer.

    Pipeline: Whisper transcription -> tool-selection chat completion ->
    execute each requested tool function -> second completion to phrase the
    final answer -> synthesize and play it via ElevenLabs.

    Args:
        audio_file_path: Path to the audio file to transcribe.
        language: ISO language code passed to the Whisper model.
        additional_text: Extra context appended after the transcription.

    Returns:
        The model's final natural-language answer.

    Raises:
        ValueError: If the model produced no tool calls for the query.
    """
    with open(audio_file_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            language=language,
        )

    transcribed_text = response.text

    result = f"Transcription: {transcribed_text}\n\nAdditional Context: {additional_text}\n\n"

    content = create_content(result)
    model = "llama3-groq-70b-8192-tool-use-preview"
    messages = [
        {
            "role": "system",
            "content": content
        },
        {
            "role": "user",
            "content": result
        }
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        tools=TOOLS,
        tool_choice="auto",
        model=model,
        temperature=0.5,
        max_tokens=500,
    )

    response_message = chat_completion.choices[0].message
    tool_calls = response_message.tool_calls
    if not tool_calls:
        # ValueError subclasses Exception, so existing broad handlers still catch it.
        raise ValueError(f"No Tool Found associated with query: {transcribed_text}")
    messages.append(response_message)
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        function_to_call = AVAILABLE_FUNCTIONS[function_name]
        function_args = json.loads(tool_call.function.arguments)
        # NOTE(review): assumes each tool returns a string suitable as
        # message content — confirm against utils.mapper implementations.
        function_response = function_to_call(**function_args)
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": function_response,
            }
        )
    # Second round-trip lets the model phrase the tool output as an answer.
    second_response = client.chat.completions.create(
        model=model,
        messages=messages
    )
    final_response = second_response.choices[0].message.content
    print("final: ", final_response)

    # Speak the answer; the returned audio iterator was unused, so the
    # dead `audio =` assignment has been dropped.
    text_to_speech_file(final_response, True)

    return final_response
|
|
|
|
def play_audio():
    """Play the pre-recorded greeting clip through the ElevenLabs player."""
    # Context manager closes the handle promptly (the original leaked it).
    with open('greetings.mp3', 'rb') as audio_file:
        bytes_data = audio_file.read()
    play(bytes_data)
|
|
|
|
def transcribe_audio(audio_file_path, language, additional_text):
    """Gradio handler: greet, transcribe, run the tool pipeline, speak the result.

    Plays a greeting clip, transcribes the uploaded audio with Whisper,
    asks the LLM to pick and call a tool, asks it again to phrase the tool
    output, and kicks off text-to-speech playback in a background thread so
    the UI gets its text response without waiting for audio.

    Args:
        audio_file_path: Path to the uploaded audio file.
        language: ISO language code passed to the Whisper model.
        additional_text: Extra context appended after the transcription.

    Returns:
        The model's final answer, or an error message string (Gradio
        displays whatever is returned, so errors are reported in-band).
    """
    try:
        # Immediate audible feedback while the pipeline runs.
        with open('greetings.mp3', 'rb') as greeting_file:
            play(greeting_file.read())

        with open(audio_file_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(
                file=audio_file,
                model="whisper-large-v3",
                language=language,
            )

        transcribed_text = response.text

        result = f"Transcription: {transcribed_text}\n\nAdditional Context: {additional_text}\n\n"

        content = create_content(result)
        model = "llama3-groq-70b-8192-tool-use-preview"
        messages = [
            {
                "role": "system",
                "content": content
            },
            {
                "role": "user",
                "content": result
            }
        ]
        chat_completion = client.chat.completions.create(
            messages=messages,
            tools=TOOLS,
            tool_choice="auto",
            model=model,
            temperature=0.5,
            max_tokens=500,
        )

        response_message = chat_completion.choices[0].message
        tool_calls = response_message.tool_calls
        if not tool_calls:
            # Caught by the boundary handler below and shown to the user.
            raise ValueError(f"No Tool Found associated with query: {transcribed_text}")
        messages.append(response_message)
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_to_call = AVAILABLE_FUNCTIONS[function_name]
            function_args = json.loads(tool_call.function.arguments)
            # NOTE(review): assumes each tool returns a string suitable as
            # message content — confirm against utils.mapper implementations.
            function_response = function_to_call(**function_args)
            messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": function_response,
                }
            )
        # Second round-trip lets the model phrase the tool output as an answer.
        second_response = client.chat.completions.create(
            model=model,
            messages=messages
        )
        final_response = second_response.choices[0].message.content
        print("final: ", final_response)

        # Speak the answer without blocking the Gradio response.
        thread = threading.Thread(target=text_to_speech_file, args=(final_response, True,))
        thread.start()

        return final_response

    except Exception as e:
        # UI boundary: report the failure in-band rather than crashing Gradio.
        return f"An error occurred: {str(e)}"
|
|
|
|
def speach_to_text():
    """Build and launch the Gradio transcription UI (blocks until closed)."""
    supported_languages = ["en", "ba", "ms", "is", "no", "id"]

    audio_input = gr.Audio(type="filepath", label="Upload Audio File")
    language_input = gr.Dropdown(choices=supported_languages, label="Select Language", value="en")
    context_input = gr.Textbox(
        label="Additional Text",
        placeholder="Enter any additional context or instructions here...",
    )

    demo = gr.Interface(
        fn=transcribe_audio,
        inputs=[audio_input, language_input, context_input],
        outputs="text",
        title="Groq Speech-to-Text Transcription",
        description=(
            "Upload an audio file, set parameters, and provide additional text for context in the "
            "transcription process."
        ),
    )
    demo.launch()
|
|
|
|
| |
if __name__ == '__main__':
    import platform
    print(f"platform: {platform.platform()}")
    if "Linux" in platform.platform():
        # freedesktop_os_release() raises OSError on systems without
        # /etc/os-release; the original called it unconditionally and
        # crashed on macOS/Windows before reaching the Linux check.
        try:
            print(f"platform: {platform.freedesktop_os_release()}")
        except OSError:
            pass
        # mpv is required by elevenlabs' `stream()` playback helper.
        # NOTE(review): running apt non-interactively assumes root (e.g. a
        # container) — confirm deployment environment.
        subprocess.run(['apt', 'install', '-y', 'mpv'])
    speach_to_text()
|
|