KITT / space.py
sasan's picture
chore: Update requirements.txt and add MeloTTS dependency
dca05b7
import os
import subprocess

# Runtime dependency bootstrap: these commands run at import time, before the
# heavier imports below. Return codes are not checked, so a failed install
# does not stop the app from starting (best-effort, as in the original).
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    # `env=` replaces the child's environment instead of extending it, so the
    # flag is merged into the inherited environment; otherwise PATH, HOME,
    # proxy settings, etc. would be missing for the pip process.
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
subprocess.run(
    "python -m unidic download",
    shell=True,
)
import gradio as gr
import spaces
from langchain.tools import tool
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.utils.function_calling import convert_to_openai_tool
from loguru import logger
from kitt.core import tts_gradio
from kitt.core import utils as kitt_utils
from kitt.core import voice_options
from kitt.core.model import generate_function_call as process_query
from kitt.core.stt import transcribe_audio
from kitt.core.tts import prep_for_tts, run_melo_tts, run_tts_replicate
from kitt.skills import (
code_interpreter,
date_time_info,
do_anything_else,
extract_func_args,
find_route,
get_forecast,
get_weather,
get_weather_current_location,
search_along_route_w_coordinates,
search_points_of_interest,
set_vehicle_destination,
set_vehicle_speed,
)
from kitt.skills.common import config, vehicle
from kitt.skills.routing import calculate_route, find_address
# Defaults used to seed both the shared context below and the per-session
# Gradio state built in create_demo().
ORIGIN = "Luxembourg, Luxembourg"
DESTINATION = "Paris, France"
DEFAULT_LLM_BACKEND = "local"
ENABLE_HISTORY = True
ENABLE_TTS = True
TTS_BACKEND = "local"
USER_PREFERENCES = "User prefers italian food."

# Module-level mutable state shared by the callbacks in this file.
global_context = dict(
    vehicle=vehicle,
    query="How is the weather?",
    route_points=[],
    origin=ORIGIN,
    destination=DESTINATION,
    enable_history=ENABLE_HISTORY,
    tts_enabled=ENABLE_TTS,
    tts_backend=TTS_BACKEND,
    llm_backend=DEFAULT_LLM_BACKEND,
    map_origin=ORIGIN,
    map_destination=DESTINATION,
    update_proxy=0,
    map=None,
)

# Passed to tts_gradio() on every call.
speaker_embedding_cache = dict()

# Conversation history handed to the LLM call.
history = ChatMessageHistory()

# Choices for the time dropdown: one entry per full hour, "00:00:00".."23:00:00".
hour_options = ["{:02d}:00:00".format(hour) for hour in range(24)]
@tool
def search_along_route(query=""):
    """Search for points of interest along the route/way to the destination.

    Args:
        query (str, optional): The type of point of interest to search for. Defaults to "".
    """
    # NOTE(review): this function is converted with convert_to_openai_tool()
    # below, so the docstring is presumably what the LLM sees as the tool
    # description. It previously advertised a default of "restaurant" while
    # the real default is the empty string; the text now matches the signature.
    points = global_context["route_points"]
    # Route points are forwarded unchanged (the original carried a
    # "maybe reshape" reminder here).
    return search_along_route_w_coordinates(points, query)
def set_time(time_picker):
    """Store the selected time on the shared vehicle and return its status.

    Args:
        time_picker: Value chosen in the time dropdown (one of `hour_options`).

    Returns:
        The vehicle status as produced by `vehicle.model_dump()`, which is the
        same form the other handlers feed into the "Vehicle status" JSON
        component.
    """
    vehicle.time = time_picker
    # Return the dumped form rather than the model object itself, matching
    # run_model() and calculate_route_gradio(), which target the same output.
    return vehicle.model_dump()
# Skills handed to the LLM call as callable functions.
functions = [
    # set_vehicle_speed,
    set_vehicle_destination,
    get_weather,
    find_route,
    search_points_of_interest,
    search_along_route,
]

# The same skills converted with convert_to_openai_tool, in the same order.
openai_tools = list(map(convert_to_openai_tool, functions))
def clear_history() -> None:
    """Log the request and empty the shared conversation history."""
    logger.info("Clearing the conversation history...")
    history.clear()
@spaces.GPU
def run_llama3_model(query, voice_character, state):
    """Answer `query` with the LLM and optionally synthesise the spoken reply.

    Args:
        query: User request text.
        voice_character: Selected voice; a value containing "Fast" selects the
            MeloTTS path.
        state: Per-session settings dict. Reads "user_preferences" and
            "llm_backend", plus "tts_enabled" / "tts_backend" when present.

    Returns:
        A tuple `(output_text, voice_out)`; `voice_out` is None when TTS is
        disabled.
    """
    assert len(functions) > 0, "No functions to call"
    assert len(openai_tools) > 0, "No openai tools to call"

    output_text = process_query(
        query,
        history=history,
        user_preferences=state["user_preferences"],
        tools=openai_tools,
        functions=functions,
        backend=state["llm_backend"],
    )
    gr.Info(f"Output text: {output_text}\nGenerating voice output...")

    # Read the TTS settings from the session state, like the LLM settings
    # above, so one session's toggles are not overridden by the module-wide
    # context. Fall back to the shared context if the keys are missing.
    tts_enabled = state.get("tts_enabled", global_context["tts_enabled"])
    tts_backend = state.get("tts_backend", global_context["tts_backend"])

    voice_out = None
    if tts_enabled:
        # Only prepare the text for speech when it is actually going to be spoken.
        output_text_tts = prep_for_tts(output_text)
        if "Fast" in voice_character:
            voice_out = run_melo_tts(output_text_tts, voice_character)
        elif tts_backend == "replicate":
            voice_out = run_tts_replicate(output_text_tts, voice_character)
        else:
            # tts_gradio returns a sequence; only its first item is used here.
            voice_out = tts_gradio(
                output_text_tts, voice_character, speaker_embedding_cache
            )[0]
    return (
        output_text,
        voice_out,
    )
def run_model(query, voice_character, state):
    """Run one text query through the assistant and collect the UI outputs.

    Args:
        query: Raw user text; surrounding whitespace and single quotes are
            removed before use.
        voice_character: Voice selection forwarded to the model runner.
        state: Per-session settings dict; reads "model" (optional),
            "llm_backend", "tts_enabled" and "enable_history".

    Returns:
        A 5-tuple: reply text, voice output, dumped vehicle status, the state
        dict, and a dict carrying the incremented "update_proxy" counter.
    """
    model = state.get("model", "llama3")
    query = query.strip().replace("'", "")
    logger.info(
        f"Running model: {model} with query: {query}, voice_character: {voice_character} and llm_backend: {state['llm_backend']}, tts_enabled: {state['tts_enabled']}"
    )
    global_context["query"] = query

    text, voice = run_llama3_model(query, voice_character, state)

    # With history disabled, forget the exchange that was just recorded.
    if not state["enable_history"]:
        history.clear()

    # Bump the counter; the UI listens for changes on this value.
    global_context["update_proxy"] += 1

    vehicle_snapshot = vehicle.model_dump()
    proxy_payload = dict(update_proxy=global_context["update_proxy"])
    return text, voice, vehicle_snapshot, state, proxy_payload
def calculate_route_gradio(origin, destination):
    """Compute a route, move the vehicle to its start and plot the result.

    Args:
        origin: Start of the route, passed to `calculate_route`.
        destination: End of the route, passed to `calculate_route`.

    Returns:
        A tuple `(plot, vehicle_status, 0)`: the route plot, the dumped vehicle
        status, and 0 to reset the trip-progress slider.

    Raises:
        IndexError: If `calculate_route` yields no route points.
    """
    _, points = calculate_route(origin, destination)

    # Place the vehicle at the first route point *before* plotting, so the
    # marker on the map agrees with the vehicle status returned below.
    # Previously the plot was drawn with the old coordinates.
    start = points[0]
    vehicle.location_coordinates = start["latitude"], start["longitude"]

    plot = kitt_utils.plot_route(points, vehicle=vehicle.location_coordinates)
    global_context["map"] = plot
    global_context["route_points"] = points
    return plot, vehicle.model_dump(), 0
def update_vehicle_status(trip_progress, origin, destination, state):
    """Move the vehicle along the current route according to the slider.

    Args:
        trip_progress: Slider value in percent (0-100).
        origin: Route start, used only if no route has been computed yet.
        destination: Route end, used only if no route has been computed yet.
        state: Per-session settings dict, returned unchanged.

    Returns:
        A tuple `(vehicle_status, plot, state)` where `vehicle_status` is the
        dumped vehicle model and `plot` is the route with the vehicle at its
        new position.
    """
    # Compute the route lazily the first time it is needed.
    if not global_context["route_points"]:
        _, points = calculate_route(origin, destination)
        global_context["route_points"] = points
        global_context["destination"] = destination

    route_points = global_context["route_points"]
    n_points = len(route_points)
    # Map the percentage onto a point index; clamp so 100% hits the last point.
    index = min(int(trip_progress / 100 * n_points), n_points - 1)
    logger.info(f"Trip progress: {trip_progress} len: {n_points}, index: {index}")

    new_coords = route_points[index]
    new_coords = new_coords["latitude"], new_coords["longitude"]
    logger.info(
        f"Trip progress: {trip_progress}, len: {n_points}, new_coords: {new_coords}"
    )

    vehicle.location_coordinates = new_coords
    new_vehicle_location = find_address(new_coords[0], new_coords[1])
    vehicle.location = new_vehicle_location

    plot = kitt_utils.plot_route(route_points, vehicle=vehicle.location_coordinates)
    # Return the dumped status, as run_model() and calculate_route_gradio() do
    # for the same JSON component, instead of the raw model object.
    return vehicle.model_dump(), plot, state
@spaces.GPU
def save_and_transcribe_run_model(audio, voice_character, state):
    """Transcribe recorded audio and run the transcript through the assistant.

    Args:
        audio: Recorded audio passed to `transcribe_audio`.
        voice_character: Voice selection forwarded to `run_model`.
        state: Per-session settings dict forwarded to `run_model`.

    Returns:
        A 7-tuple: None (clears the audio input), the transcript, then the
        five values returned by `run_model`.
    """
    transcript = transcribe_audio(audio)
    model_outputs = run_model(transcript, voice_character, state)
    reply_text, reply_voice, vehicle_status, state, update_proxy = model_outputs
    return (
        None,
        transcript,
        reply_text,
        reply_voice,
        vehicle_status,
        state,
        update_proxy,
    )
def set_tts_enabled(tts_enabled, state):
    """Apply the "Enable TTS" radio choice.

    Args:
        tts_enabled: Radio value; only the exact string "Yes" enables TTS.
        state: Per-session settings dict, updated in place.

    Returns:
        The updated state dict.
    """
    new_tts_enabled = tts_enabled == "Yes"
    logger.info(
        f"TTS enabled was {state['tts_enabled']} and changed to {new_tts_enabled}"
    )
    # Keep the session state and the shared context in step.
    for settings in (state, global_context):
        settings["tts_enabled"] = new_tts_enabled
    return state
def set_llm_backend(llm_backend, state):
    """Apply the "LLM Backend" radio choice.

    Args:
        llm_backend: One of "Ollama", "Replicate" or "Local".
        state: Per-session settings dict, updated in place.

    Returns:
        The updated state dict, with the backend stored in lower case.
    """
    assert llm_backend in ("Ollama", "Replicate", "Local"), "Invalid LLM backend"
    new_llm_backend = llm_backend.lower()
    logger.info(
        f"LLM backend was {state['llm_backend']} and changed to {new_llm_backend}"
    )
    # Keep the session state and the shared context in step.
    for settings in (state, global_context):
        settings["llm_backend"] = new_llm_backend
    return state
def set_user_preferences(preferences, state):
    """Store the free-text user preferences.

    Args:
        preferences: Text from the "User preferences" box.
        state: Per-session settings dict, updated in place.

    Returns:
        The updated state dict.
    """
    new_preferences = preferences
    logger.info(f"User preferences changed to: {new_preferences}")
    # Keep the session state and the shared context in step.
    for settings in (state, global_context):
        settings["user_preferences"] = new_preferences
    return state
def set_enable_history(enable_history, state):
    """Apply the "Maintain the conversation history?" radio choice.

    Args:
        enable_history: Radio value; only the exact string "Yes" enables it.
        state: Per-session settings dict, updated in place.

    Returns:
        The updated state dict.
    """
    new_enable_history = enable_history == "Yes"
    logger.info(
        f"Enable history was {state['enable_history']} and changed to {new_enable_history}"
    )
    # Keep the session state and the shared context in step.
    for settings in (state, global_context):
        settings["enable_history"] = new_enable_history
    return state
def set_tts_backend(tts_backend, state):
    """Apply the "TTS Backend" radio choice.

    Args:
        tts_backend: Radio value; stored in lower case.
        state: Per-session settings dict, updated in place.

    Returns:
        The updated state dict.
    """
    new_tts_backend = tts_backend.lower()
    logger.info(
        f"TTS backend was {state['tts_backend']} and changed to {new_tts_backend}"
    )
    # Keep the session state and the shared context in step.
    for settings in (state, global_context):
        settings["tts_backend"] = new_tts_backend
    return state
def conditional_update():
    """Refresh the route map if the vehicle's location or destination changed.

    Copies the vehicle's current location and destination into the shared
    context, then recomputes the route plot only when the map was drawn for a
    different origin/destination pair (or the update counter is still 0).

    Returns:
        The plot stored under `global_context["map"]`.
    """
    current_origin = vehicle.location
    current_destination = vehicle.destination

    global_context["destination"] = current_destination
    global_context["origin"] = current_origin

    map_is_stale = (
        global_context["map_origin"] != current_origin
        or global_context["map_destination"] != current_destination
        or global_context["update_proxy"] == 0
    )
    if map_is_stale:
        logger.info("Updating the map plot... in conditional_update")
        map_plot, _, _ = calculate_route_gradio(current_origin, current_destination)
        global_context["map"] = map_plot
        # Remember what the map was drawn for. These keys were never updated
        # before, so once the vehicle had moved every call recomputed the route.
        global_context["map_origin"] = current_origin
        global_context["map_destination"] = current_destination
    return global_context["map"]
# To use the microphone in Chrome, open chrome://flags/#unsafely-treat-insecure-origin-as-secure and enter http://10.186.115.21:7860/
# under "Insecure origins treated as secure", then enable the flag and relaunch Chrome.
# example question:
# what's the weather like outside?
# What's the closest restaurant from here?
def create_demo(tts_server: bool = False, model="llama3"):
    """Build the KITT Gradio interface and wire up its event handlers.

    Args:
        tts_server: Only echoed in the startup message; not otherwise used here.
        model: Model name stored in the session state under "model".

    Returns:
        The constructed `gr.Blocks` app (not yet launched).
    """
    print(f"Running the demo with model: {model} and TTSServer: {tts_server}")
    with gr.Blocks(theme=gr.themes.Default(), title="KITT") as demo:
        # Per-session settings, seeded from the module-level defaults.
        state = gr.State(
            value={
                # "context": initial_context,
                "query": "",
                "route_points": [],
                "model": model,
                "tts_enabled": ENABLE_TTS,
                "llm_backend": DEFAULT_LLM_BACKEND,
                "user_preferences": USER_PREFERENCES,
                "enable_history": ENABLE_HISTORY,
                "tts_backend": TTS_BACKEND,
                "destination": DESTINATION,
            }
        )

        # The default route is computed while the UI is being built so the
        # map has something to show on first load.
        plot, _, _ = calculate_route_gradio(ORIGIN, DESTINATION)
        global_context["map"] = plot

        # --- Top row: vehicle/trip settings on the left, map on the right ---
        with gr.Row():
            # with gr.Row():
            #     gr.Text("KITT", interactive=False)
            with gr.Column(scale=1, min_width=300):
                vehicle_status = gr.JSON(
                    value=vehicle.model_dump(), label="Vehicle status"
                )
                time_picker = gr.Dropdown(
                    choices=hour_options,
                    label="What time is it? (HH:MM)",
                    value="08:00:00",
                    interactive=True,
                )
                voice_character = gr.Radio(
                    choices=voice_options,
                    label="Choose a voice",
                    value=voice_options[0],
                    show_label=True,
                )
                # voice_character = gr.Textbox(
                #     label="Choose a voice",
                #     value="freeman",
                #     show_label=True,
                # )
                origin = gr.Textbox(
                    value=ORIGIN,
                    label="Origin",
                    interactive=True,
                )
                destination = gr.Textbox(
                    value=DESTINATION,
                    label="Destination",
                    interactive=True,
                )
                preferences = gr.Textbox(
                    value=USER_PREFERENCES,
                    label="User preferences",
                    lines=3,
                    interactive=True,
                )

            with gr.Column(scale=2, min_width=600):
                map_plot = gr.Plot(value=plot, label="Map")
                trip_progress = gr.Slider(
                    0, 100, step=5, label="Trip progress", interactive=True
                )

            # with gr.Column(scale=1, min_width=300):
            #     gr.Image("linkedin-1.png", label="Linkedin - Sasan Jafarnejad")
            #     gr.Image(
            #         "team-ubix.png",
            #         label="Research Team - UBIX - University of Luxembourg",
            #     )

        # --- Bottom row: inputs/config on the left, outputs on the right ---
        with gr.Row():
            with gr.Column():
                input_audio = gr.Audio(
                    type="numpy",
                    sources=["microphone"],
                    label="Input audio",
                    elem_id="input_audio",
                )
                input_text = gr.Textbox(
                    value="How is the weather?", label="Input text", interactive=True
                )
                with gr.Accordion("Debug"):
                    input_audio_debug = gr.Audio(
                        type="numpy",
                        sources=["microphone"],
                        label="Input audio",
                        # Distinct id: this component previously reused
                        # "input_audio", giving two elements the same DOM id.
                        elem_id="input_audio_debug",
                    )
                    input_text_debug = gr.Textbox(
                        value="How is the weather?",
                        label="Input text",
                        interactive=True,
                    )
                    # Counter bumped by run_model(); its change event drives
                    # the map refresh wired up at the end of this function.
                    update_proxy = gr.JSON(
                        value=dict(update_proxy=0),
                        label="Global context",
                    )
                with gr.Accordion("Config"):
                    tts_enabled = gr.Radio(
                        ["Yes", "No"],
                        label="Enable TTS",
                        value="Yes" if ENABLE_TTS else "No",
                        interactive=True,
                    )
                    tts_backend = gr.Radio(
                        ["Local"],
                        label="TTS Backend",
                        value=TTS_BACKEND.title(),
                        interactive=True,
                    )
                    llm_backend = gr.Radio(
                        choices=["Ollama", "Local"],
                        label="LLM Backend",
                        value=DEFAULT_LLM_BACKEND.title(),
                        interactive=True,
                    )
                    enable_history = gr.Radio(
                        ["Yes", "No"],
                        label="Maintain the conversation history?",
                        value="Yes" if ENABLE_HISTORY else "No",
                        interactive=True,
                    )
                    # Push button
                    clear_history_btn = gr.Button(value="Clear History")
            with gr.Column():
                output_audio = gr.Audio(label="output audio", autoplay=True)
                output_text = gr.TextArea(
                    value="", label="Output text", interactive=False
                )

        # Update plot based on the origin and destination
        # Sets the current location and destination
        origin.submit(
            fn=calculate_route_gradio,
            inputs=[origin, destination],
            outputs=[map_plot, vehicle_status, trip_progress],
        )
        destination.submit(
            fn=calculate_route_gradio,
            inputs=[origin, destination],
            outputs=[map_plot, vehicle_status, trip_progress],
        )
        preferences.submit(
            fn=set_user_preferences, inputs=[preferences, state], outputs=[state]
        )

        # Update time based on the time picker
        time_picker.select(fn=set_time, inputs=[time_picker], outputs=[vehicle_status])

        # Run the model when the input text is submitted
        input_text.submit(
            fn=run_model,
            inputs=[input_text, voice_character, state],
            outputs=[output_text, output_audio, vehicle_status, state, update_proxy],
        )
        input_text_debug.submit(
            fn=run_model,
            inputs=[input_text_debug, voice_character, state],
            outputs=[output_text, output_audio, vehicle_status, state, update_proxy],
        )

        # Set the vehicle status based on the trip progress
        trip_progress.release(
            fn=update_vehicle_status,
            inputs=[trip_progress, origin, destination, state],
            outputs=[vehicle_status, map_plot, state],
        )

        # Save and transcribe the audio
        input_audio.stop_recording(
            fn=save_and_transcribe_run_model,
            inputs=[input_audio, voice_character, state],
            outputs=[
                input_audio,
                input_text,
                output_text,
                output_audio,
                vehicle_status,
                state,
                update_proxy,
            ],
        )
        # The debug recorder only transcribes; it does not run the model.
        input_audio_debug.stop_recording(
            fn=transcribe_audio,
            inputs=[input_audio_debug],
            outputs=[input_text_debug],
        )

        # Clear the history
        clear_history_btn.click(fn=clear_history, inputs=[], outputs=[])

        # Config
        tts_enabled.change(
            fn=set_tts_enabled, inputs=[tts_enabled, state], outputs=[state]
        )
        tts_backend.change(
            fn=set_tts_backend, inputs=[tts_backend, state], outputs=[state]
        )
        llm_backend.change(
            fn=set_llm_backend, inputs=[llm_backend, state], outputs=[state]
        )
        enable_history.change(
            fn=set_enable_history, inputs=[enable_history, state], outputs=[state]
        )

        # Redraw the map whenever run_model() bumps the update counter.
        update_proxy.change(fn=conditional_update, inputs=[], outputs=[map_plot])

    return demo
# Close any interface that is still open so the port becomes available.
gr.close_all()

demo = create_demo(tts_server=False, model="llama3")

# Serve on all interfaces at port 7860 without a public share link.
demo.launch(
    share=False,
    ssl_verify=False,
    server_port=7860,
    server_name="0.0.0.0",
    debug=True,
)