# Abc / app.py
# grayphite's picture
# minor change
# 7033d59
import json
import os
import subprocess
import threading
import time
from typing import Iterator

import gradio as gr
from elevenlabs import play, stream
from elevenlabs.client import AsyncElevenLabs, ElevenLabs
from groq import Groq

from utils.mapper import AVAILABLE_FUNCTIONS, COMMON_VARS, TOOLS
# Initialize API clients.
# SECURITY: these API keys were previously hard-coded in source control and must
# be treated as compromised — revoke them and supply GROQ_API_KEY /
# ELEVENLABS_API_KEY via the environment. The literals remain only as a
# backward-compatible fallback until the keys are rotated.
client = Groq(
    api_key=os.environ.get(
        "GROQ_API_KEY", "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"
    )
)
elevenlabs_client = ElevenLabs(
    api_key=os.environ.get(
        "ELEVENLABS_API_KEY", "sk_16d08614d675e9de0a89bdbff094c6332fceaafbb280f4b3"
    )
)
# elevenlabs_client = AsyncElevenLabs(
#     api_key="73d9a4f6d777e9224641e79aeb39dc50"
# )
def text_to_speech_file(text: str, play_audio: bool) -> Iterator[bytes]:
    """Convert *text* to speech with ElevenLabs and optionally stream it aloud.

    Parameters
    ----------
    text : str
        The text to synthesize.
    play_audio : bool
        When True, the audio stream is played through `elevenlabs.stream`
        (which consumes the iterator as it plays).

    Returns
    -------
    Iterator[bytes]
        The audio chunk iterator returned by the ElevenLabs client
        (possibly already consumed if it was played).
    """
    audio_chunks = elevenlabs_client.generate(
        text=text,
        voice="Adam",
        model="eleven_turbo_v2_5",
        stream=True,
        optimize_streaming_latency=3,
    )
    if play_audio:
        print("streaming")
        stream(audio_chunks)
    return audio_chunks
def create_content(result):
    """Build the system prompt for the tool-selection chat completion.

    Parameters
    ----------
    result : str
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    str
        A system prompt instructing the model to pick and call one of the
        provided tools, with default parameter values interpolated from
        ``COMMON_VARS`` (imported from ``utils.mapper``).
    """
    # NOTE(review): a step-by-step "additional_text" string was previously
    # built here but never used anywhere; that dead code has been removed.
    content = (
        f"""You are an AI assistant that will suggest and call the functions provided in the tools based on the
user's request. You need to analyze the user's request and select the function from the provided tools that
best matches the request and provide the results by calling the appropriate function.
Expect all parameters from user request, Consider the dates according to the user query for example if user
asking some operations for today, it should be understood to get today's date in YYYY-MM-DD format for date
parameters.
If required parameters are not in the user request these default can be used {COMMON_VARS}
"""
    )
    return content
def background_task(audio_file_path, language, additional_text):
    """Transcribe audio, answer via LLM tool-calling, and speak the reply.

    Mirrors ``transcribe_audio`` but speaks the final answer synchronously
    and raises instead of returning an error string.

    Parameters
    ----------
    audio_file_path : str
        Path to the audio file to transcribe.
    language : str
        Whisper language code for the transcription.
    additional_text : str
        Extra context appended to the transcription in the user message.

    Returns
    -------
    str
        The model's final natural-language answer.

    Raises
    ------
    Exception
        If the model does not select any tool for the query.
    """
    model = "llama3-groq-70b-8192-tool-use-preview"

    # Speech -> text via Whisper on Groq.
    with open(audio_file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            language=language,
        )
    transcribed_text = transcription.text

    user_message = (
        f"Transcription: {transcribed_text}\n\n"
        f"Additional Context: {additional_text}\n\n"
    )
    messages = [
        {"role": "system", "content": create_content(user_message)},
        {"role": "user", "content": user_message},
    ]

    # First pass: let the model pick a tool for the request.
    first_pass = client.chat.completions.create(
        messages=messages,
        tools=TOOLS,
        tool_choice="auto",
        model=model,
        temperature=0.5,
        max_tokens=500,
    )
    assistant_message = first_pass.choices[0].message
    if not assistant_message.tool_calls:
        raise Exception(f"No Tool Found associated with query: {transcribed_text}")
    messages.append(assistant_message)

    # Execute every requested tool and feed each result back to the model.
    for call in assistant_message.tool_calls:
        handler = AVAILABLE_FUNCTIONS[call.function.name]
        outcome = handler(**json.loads(call.function.arguments))
        messages.append(
            {
                "tool_call_id": call.id,
                "role": "tool",
                "name": call.function.name,
                "content": outcome,
            }
        )

    # Second pass: turn the tool results into a natural-language answer.
    follow_up = client.chat.completions.create(model=model, messages=messages)
    final_response = follow_up.choices[0].message.content
    print("final: ", final_response)

    # Speak the answer (blocking) before returning it.
    text_to_speech_file(final_response, True)
    return final_response
def play_audio():
    """Play the canned greeting clip (``greetings.mp3``) aloud.

    Fix: the file handle was previously leaked (``open(...).read()`` with no
    close); a context manager now guarantees it is closed.
    """
    with open('greetings.mp3', 'rb') as audio_file:
        play(audio_file.read())
def transcribe_audio(audio_file_path, language, additional_text):
    """Gradio handler: transcribe audio, answer via tool-calling, speak reply.

    Flow: play a canned greeting, transcribe the uploaded audio with Whisper
    on Groq, ask the LLM to select and call one of the configured tools, then
    request a final natural-language answer, which is spoken on a background
    thread so the UI response is not blocked.

    Parameters
    ----------
    audio_file_path : str
        Path to the uploaded audio file (Gradio ``filepath`` input).
    language : str
        Whisper language code selected in the UI.
    additional_text : str
        Extra context appended to the transcription in the user message.

    Returns
    -------
    str
        The model's final answer, or an ``"An error occurred: ..."`` string
        that Gradio displays to the user.
    """
    try:
        # Play a canned greeting while the real work happens.
        # Fix: use a context manager so the file handle is closed.
        with open('greetings.mp3', 'rb') as greeting_file:
            play(greeting_file.read())
        # Speech -> text via Whisper on Groq.
        with open(audio_file_path, "rb") as audio_file:
            response = client.audio.transcriptions.create(
                file=audio_file,
                model="whisper-large-v3",
                language=language
            )
        transcribed_text = response.text
        result = f"Transcription: {transcribed_text}\n\nAdditional Context: {additional_text}\n\n"
        content = create_content(result)
        model = "llama3-groq-70b-8192-tool-use-preview"
        messages = [
            {
                "role": "system",
                "content": content
            },
            {
                "role": "user",
                "content": result
            }
        ]
        # First pass: let the model pick a tool for the request.
        chat_completion = client.chat.completions.create(
            messages=messages,
            tools=TOOLS,
            tool_choice="auto",
            model=model,
            temperature=0.5,
            max_tokens=500,
        )
        response_message = chat_completion.choices[0].message
        tool_calls = response_message.tool_calls
        if not tool_calls:
            raise Exception(f"No Tool Found associated with query: {transcribed_text}")
        messages.append(response_message)
        # Execute every requested tool and feed each result back to the model.
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_to_call = AVAILABLE_FUNCTIONS[function_name]
            function_args = json.loads(tool_call.function.arguments)
            function_response = function_to_call(**function_args)
            messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": function_response,
                }
            )
        # Second pass: turn the tool results into a natural-language answer.
        second_response = client.chat.completions.create(
            model=model,
            messages=messages
        )
        final_response = second_response.choices[0].message.content
        print("final: ", final_response)
        # Speak the answer without blocking the Gradio response.
        thread = threading.Thread(target=text_to_speech_file, args=(final_response, True))
        thread.start()
        return final_response
    except Exception as e:
        # Boundary handler: Gradio shows the returned string to the user.
        return f"An error occurred: {str(e)}"
def speach_to_text():
    """Build and launch the Gradio speech-to-text demo UI.

    (The misspelled name is kept as-is — it is the public entry point
    referenced by the ``__main__`` block.)
    """
    # Whisper language codes offered in the dropdown
    # (example subset; adjust to Groq's actual supported languages).
    supported_languages = ["en", "ba", "ms", "is", "no", "id"]
    demo = gr.Interface(
        fn=transcribe_audio,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File"),
            gr.Dropdown(choices=supported_languages, label="Select Language", value="en"),
            gr.Textbox(label="Additional Text", placeholder="Enter any additional context or instructions here...")
        ],
        outputs="text",
        title="Groq Speech-to-Text Transcription",
        description="Upload an audio file, set parameters, and provide additional text for context in the "
                    "transcription process."
    )
    demo.launch()
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    import platform
    print(f"platform: {platform.platform()}")
    if "Linux" in platform.platform():
        # Fix: platform.freedesktop_os_release() raises OSError on non-Linux
        # systems (and only exists on Python 3.10+), so it must only be
        # attempted once we know we are on Linux — previously it ran
        # unconditionally and crashed the script on macOS/Windows.
        try:
            print(f"platform: {platform.freedesktop_os_release()}")
        except (OSError, AttributeError):
            pass
        # mpv is needed by elevenlabs' stream()/play() audio helpers.
        subprocess.run(['apt', 'install', '-y', 'mpv'])
    speach_to_text()