Spaces:

sasan
/

KITT

Build error

App Files Files Community

sasan committed on May 22, 2024

Commit

78e760c

1 Parent(s): 0f04201

chore: Update vehicle speed and destination handling functions

Browse files

Files changed (3) hide show

kitt/core/model.py +34 -1
kitt/core/tts.py +103 -0
main.py +30 -15

kitt/core/model.py CHANGED Viewed

@@ -84,6 +84,7 @@ Don't make assumptions about tool results if <tool_response> XML tags are not pr
 Analyze the data once you get the results and call another function.
 At each iteration please continue adding the your analysis to previous summary.
 Your final response should directly answer the user query. Don't tell what you are doing, just do it.
 Tools:
@@ -92,13 +93,45 @@ Here are the available tools:
 Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree.
 When using tools, ensure to only use the tools provided and not make up any data and do not provide any explanation as to which tool you are using and why.
-When asked for the weather or points of interest, use the appropriate tool with the current location of the car. Unless the user provides a location, then use that location.
 Always assume user wants to travel by car.
 Schema:
 Use the following pydantic model json schema for each tool call you will make:
 {schema}
 Instructions:
 At the very first turn you don't have <tool_results> so you shouldn't not make up the results.
 Please keep a running summary with analysis of previous function results and summaries from previous iterations.

 Analyze the data once you get the results and call another function.
 At each iteration please continue adding the your analysis to previous summary.
 Your final response should directly answer the user query. Don't tell what you are doing, just do it.
+Keep your responses very concise and to the point. Don't provide any unnecessary information. Don't refer to user preferences as <user_preferences>.
 Tools:
 Make sure that the json object above with code markdown block is parseable with json.loads() and the XML block with XML ElementTree.
 When using tools, ensure to only use the tools provided and not make up any data and do not provide any explanation as to which tool you are using and why.
+When asked for the weather or points of interest, use the appropriate tool with the current location from <car_status>. If user provides a location, use that location.
 Always assume user wants to travel by car.
 Schema:
 Use the following pydantic model json schema for each tool call you will make:
 {schema}
+Examples:
+Example 1:
+User: How is the weather?
+Assistant:
+<tool_call>
+{{"arguments": {{"location": ""}}, "name": "get_weather"}}
+</tool_call>
+Example 2:
+User: Is there a Spa nearby?
+Assistant:
+<tool_call>
+{{"arguments": {{"search_query": "Spa"}}, "name": "search_points_of_interest"}}
+</tool_call>
+Example 3:
+User: How long will it take to get to the destination?
+Assistant:
+<tool_call>
+{{"arguments": {{"destination": ""}}, "name": "calculate_route"}}
+</tool_call>
+Example 4:
+User: Set the destination to Paris.
+Assistant:
+<tool_call>
+{{"arguments": {{"destination": "Paris"}}, "name": "set_vehicle_destination"}}
+</tool_call>
 Instructions:
 At the very first turn you don't have <tool_results> so you shouldn't not make up the results.
 Please keep a running summary with analysis of previous function results and summaries from previous iterations.

kitt/core/tts.py ADDED Viewed

	@@ -0,0 +1,103 @@

+from collections import namedtuple
+from replicate import Client
+from loguru import logger
+from kitt.skills.common import config
+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer, set_seed
+import soundfile as sf
+# Replicate API client, authenticated with the key held in the shared skills config.
+replicate = Client(api_token=config.REPLICATE_API_KEY)
+# Record describing one selectable voice: display name, neutral reference
+# sample, angry reference sample (None when the voice has none) and a speed value.
+Voice = namedtuple("voice", ["name", "neutral", "angry", "speed"])
+# Voices available to run_tts_replicate; the sample URLs are resolved by
+# voice_from_text and sent as the "speaker" input.
+voices_replicate = [
+    Voice(
+        "Attenborough",
+        neutral="https://zebel.ams3.digitaloceanspaces.com/xtts/short/attenborough-neutral.wav",
+        angry=None,
+        speed=1.2,
+    ),
+    Voice(
+        "Rick",
+        neutral="https://zebel.ams3.digitaloceanspaces.com/xtts/short/rick-neutral.wav",
+        angry=None,
+        speed=1.2,
+    ),
+    # Only entry in this table that ships an angry sample.
+    Voice(
+        "Freeman",
+        neutral="https://zebel.ams3.digitaloceanspaces.com/xtts/short/freeman-neutral.wav",
+        angry="https://zebel.ams3.digitaloceanspaces.com/xtts/short/freeman-angry.wav",
+        speed=1.1,
+    ),
+    Voice(
+        "Walken",
+        neutral="https://zebel.ams3.digitaloceanspaces.com/xtts/short/walken-neutral.wav",
+        angry=None,
+        speed=1.1,
+    ),
+    # NOTE(review): the name is spelled "Darth Wader" here; labels built from it
+    # must use the same spelling to match — confirm against the UI voice options.
+    Voice(
+        "Darth Wader",
+        neutral="https://zebel.ams3.digitaloceanspaces.com/xtts/short/darth-neutral.wav",
+        angry=None,
+        speed=1.15,
+    ),
+]
+def voice_from_text(voice, voices):
+    """Resolve a voice label to the reference sample stored for it.
+
+    Args:
+        voice: Label of the form "<name> - Neutral" or "<name> - Angry".
+        voices: Iterable of records exposing ``name``, ``neutral`` and ``angry``.
+
+    Returns:
+        The ``neutral`` or ``angry`` value of the first matching record. The
+        ``angry`` value is ``None`` for records that define no angry sample.
+
+    Raises:
+        ValueError: If no record matches ``voice``.
+    """
+    # (label suffix, record attribute) pairs, checked in this order per record.
+    moods = (("Neutral", "neutral"), ("Angry", "angry"))
+    for candidate in voices:
+        for mood_label, field in moods:
+            if voice == f"{candidate.name} - {mood_label}":
+                return getattr(candidate, field)
+    raise ValueError(f"Voice {voice} not found.")
+def speed_from_text(voice, voices):
+    """Return the speed value stored for the voice named by a label.
+
+    Both moods of a voice share one speed, so "<name> - Neutral" and
+    "<name> - Angry" resolve to the same value.
+
+    Args:
+        voice: Label of the form "<name> - Neutral" or "<name> - Angry".
+        voices: Iterable of records exposing ``name`` and ``speed``.
+
+    Returns:
+        The ``speed`` of the first record whose label matches ``voice``.
+
+    Raises:
+        ValueError: If no record matches ``voice``. This mirrors
+            ``voice_from_text`` instead of silently returning ``None``.
+    """
+    for v in voices:
+        if voice in (f"{v.name} - Neutral", f"{v.name} - Angry"):
+            return v.speed
+    raise ValueError(f"Voice {voice} not found.")
+def run_tts_replicate(text: str, voice_character: str):
+    """Synthesize ``text`` through the xtts-v2 model hosted on Replicate.
+
+    Args:
+        text: Sentence(s) to speak.
+        voice_character: Voice label resolved via ``voice_from_text`` against
+            ``voices_replicate`` (e.g. "<name> - Neutral").
+
+    Returns:
+        Whatever ``replicate.run`` returns for the model; it is logged and
+        passed through unchanged.
+
+    Raises:
+        ValueError: Propagated from ``voice_from_text`` for an unknown label.
+    """
+    speaker_sample = voice_from_text(voice_character, voices_replicate)
+    payload = dict(text=text, speaker=speaker_sample, cleanup_voice=True)
+    # Alternative model kept for reference:
+    # "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71"
+    model_ref = "lucataco/xtts-v2:684bc3855b37866c0c65add2ff39c78f3dea3f4ff103a436465326e0f438d55e"
+    output = replicate.run(model_ref, input=payload)
+    logger.info(f"sound output: {output}")
+    return output
+def get_fast_tts():
+    """Load the Parler-TTS model and its tokenizer for local synthesis.
+
+    Returns:
+        A ``(model, tokenizer, device)`` tuple, where ``device`` is "cuda:0"
+        when CUDA is available and "cpu" otherwise; the model is moved to it.
+    """
+    checkpoint = "parler-tts/parler-tts-mini-expresso"
+    if torch.cuda.is_available():
+        device = "cuda:0"
+    else:
+        device = "cpu"
+    model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint)
+    model = model.to(device)
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    return model, tokenizer, device
+# Loaded once when this module is imported; run_tts_fast unpacks this tuple.
+fast_tts = get_fast_tts()
+def run_tts_fast(text: str):
+    """Synthesize ``text`` with the preloaded ``fast_tts`` model.
+
+    The speaking style is fixed by a hard-coded description prompt.
+
+    Args:
+        text: Sentence(s) to speak.
+
+    Returns:
+        A 3-tuple ``(sampling_rate, audio_array, metadata)``: the sampling
+        rate from the model config, the generated audio moved to CPU,
+        converted to NumPy and squeezed, and a dict with the keys ``text``
+        and ``voice``.
+    """
+    tts_model, tts_tokenizer, target_device = fast_tts
+    style_prompt = "Thomas speaks moderately slowly in a sad tone with emphasis and high quality audio."
+    def _encode(sentence):
+        # Token ids as a PyTorch tensor placed on the model's device.
+        return tts_tokenizer(sentence, return_tensors="pt").input_ids.to(target_device)
+    waveform = tts_model.generate(
+        input_ids=_encode(style_prompt),
+        prompt_input_ids=_encode(text),
+    )
+    samples = waveform.cpu().numpy().squeeze()
+    metadata = {"text": text, "voice": "Thomas"}
+    return tts_model.config.sampling_rate, samples, metadata

main.py CHANGED Viewed

@@ -8,6 +8,7 @@ import typer
 from kitt.skills.common import config, vehicle
 from kitt.skills.routing import calculate_route
 import ollama
 from langchain.tools.base import StructuredTool
@@ -33,6 +34,7 @@ from kitt.skills import (
 )
 from kitt.skills import extract_func_args
 from kitt.core import voice_options, tts_gradio
 # from kitt.core.model import process_query
 from kitt.core.model import generate_function_call as process_query
 from kitt.core import utils as kitt_utils
@@ -144,7 +146,7 @@ functions = [
     get_weather,
     find_route,
     search_points_of_interest,
-    search_along_route
 ]
 openai_tools = [convert_to_openai_tool(tool) for tool in functions]
@@ -203,8 +205,8 @@ def run_nexusraven_model(query, voice_character, state):
 def run_llama3_model(query, voice_character, state):
-    assert len (functions) > 0, "No functions to call"
-    assert len (openai_tools) > 0, "No openai tools to call"
     output_text = process_query(
         query,
@@ -217,7 +219,9 @@ def run_llama3_model(query, voice_character, state):
     gr.Info(f"Output text: {output_text}, generating voice output...")
     voice_out = None
     if state["tts_enabled"]:
-        voice_out = tts_gradio(output_text, voice_character, speaker_embedding_cache)[0]
     return (
         output_text,
         voice_out,
@@ -340,10 +344,13 @@ def set_user_preferences(preferences, state):
 def set_enable_history(enable_history, state):
     new_enable_history = enable_history == "Yes"
-    logger.info(f"Enable history was {state['enable_history']} and changed to {new_enable_history}")
     state["enable_history"] = new_enable_history
     return state
 # To use the microphone in Chrome, go to chrome://flags/#unsafely-treat-insecure-origin-as-secure, enter http://10.186.115.21:7860/
 # in "Insecure origins treated as secure", enable the flag, and relaunch Chrome.
@@ -354,9 +361,12 @@ def set_enable_history(enable_history, state):
 ORIGIN = "Mondorf-les-Bains, Luxembourg"
 DESTINATION = "Rue Alphonse Weicker, Luxembourg"
-def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = True):
     print(f"Running the demo with model: {model} and TTSServer: {tts_server}")
     with gr.Blocks(theme=gr.themes.Default()) as demo:
         state = gr.State(
@@ -365,10 +375,10 @@ def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = Tr
                 "query": "",
                 "route_points": [],
                 "model": model,
-                "tts_enabled": tts_enabled,
-                "llm_backend": "ollama",
                 "user_preferences": USER_PREFERENCES,
-                "enable_history": False,
             }
         )
         trip_points = gr.State(value=[])
@@ -388,6 +398,11 @@ def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = Tr
                     value=voice_options[0],
                     show_label=True,
                 )
                 origin = gr.Textbox(
                     value=ORIGIN,
                     label="Origin",
@@ -441,21 +456,21 @@ def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = Tr
                 )
                 with gr.Accordion("Config"):
                     tts_enabled = gr.Radio(
-                        choices=["Yes", "No"],
                         label="Enable TTS",
-                        value="No",
                         interactive=True,
                     )
                     llm_backend = gr.Radio(
                         choices=["Ollama", "Replicate"],
                         label="LLM Backend",
-                        value="Ollama",
                         interactive=True,
                     )
                     enable_history = gr.Radio(
                         ["Yes", "No"],
                         label="Maintain the conversation history?",
-                        value="No",
                         interactive=True,
                     )
                 # Push button
@@ -529,7 +544,7 @@ def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = Tr
         enable_history.change(
             fn=set_enable_history, inputs=[enable_history, state], outputs=[state]
         )
     return demo
@@ -537,7 +552,7 @@ def create_demo(tts_server: bool = False, model="llama3", tts_enabled: bool = Tr
 gr.close_all()
-demo = create_demo(False, "llama3", tts_enabled=False)
 demo.launch(
     debug=True,
     server_name="0.0.0.0",

 from kitt.skills.common import config, vehicle
 from kitt.skills.routing import calculate_route
+from kitt.core.tts import run_tts_replicate, run_tts_fast
 import ollama
 from langchain.tools.base import StructuredTool
 )
 from kitt.skills import extract_func_args
 from kitt.core import voice_options, tts_gradio
 # from kitt.core.model import process_query
 from kitt.core.model import generate_function_call as process_query
 from kitt.core import utils as kitt_utils
     get_weather,
     find_route,
     search_points_of_interest,
+    search_along_route,
 ]
 openai_tools = [convert_to_openai_tool(tool) for tool in functions]
 def run_llama3_model(query, voice_character, state):
+    assert len(functions) > 0, "No functions to call"
+    assert len(openai_tools) > 0, "No openai tools to call"
     output_text = process_query(
         query,
     gr.Info(f"Output text: {output_text}, generating voice output...")
     voice_out = None
     if state["tts_enabled"]:
+        # voice_out = run_tts_replicate(output_text, voice_character)
+        voice_out = run_tts_fast(output_text)[0]
+        # voice_out = tts_gradio(output_text, voice_character, speaker_embedding_cache)[0]
     return (
         output_text,
         voice_out,
 def set_enable_history(enable_history, state):
     new_enable_history = enable_history == "Yes"
+    logger.info(
+        f"Enable history was {state['enable_history']} and changed to {new_enable_history}"
+    )
     state["enable_history"] = new_enable_history
     return state
 # To use the microphone in Chrome, go to chrome://flags/#unsafely-treat-insecure-origin-as-secure, enter http://10.186.115.21:7860/
 # in "Insecure origins treated as secure", enable the flag, and relaunch Chrome.
 ORIGIN = "Mondorf-les-Bains, Luxembourg"
 DESTINATION = "Rue Alphonse Weicker, Luxembourg"
+DEFAULT_LLM_BACKEND = "ollama"
+ENABLE_HISTORY = True
+ENABLE_TTS = True
+def create_demo(tts_server: bool = False, model="llama3"):
     print(f"Running the demo with model: {model} and TTSServer: {tts_server}")
     with gr.Blocks(theme=gr.themes.Default()) as demo:
         state = gr.State(
                 "query": "",
                 "route_points": [],
                 "model": model,
+                "tts_enabled": ENABLE_TTS,
+                "llm_backend": DEFAULT_LLM_BACKEND,
                 "user_preferences": USER_PREFERENCES,
+                "enable_history": ENABLE_HISTORY,
             }
         )
         trip_points = gr.State(value=[])
                     value=voice_options[0],
                     show_label=True,
                 )
+                # voice_character = gr.Textbox(
+                #     label="Choose a voice",
+                #     value="freeman",
+                #     show_label=True,
+                # )
                 origin = gr.Textbox(
                     value=ORIGIN,
                     label="Origin",
                 )
                 with gr.Accordion("Config"):
                     tts_enabled = gr.Radio(
+                        ["Yes", "No"],
                         label="Enable TTS",
+                        value="Yes" if ENABLE_TTS else "No",
                         interactive=True,
                     )
                     llm_backend = gr.Radio(
                         choices=["Ollama", "Replicate"],
                         label="LLM Backend",
+                        value=DEFAULT_LLM_BACKEND.title(),
                         interactive=True,
                     )
                     enable_history = gr.Radio(
                         ["Yes", "No"],
                         label="Maintain the conversation history?",
+                        value="Yes" if ENABLE_HISTORY else "No",
                         interactive=True,
                     )
                 # Push button
         enable_history.change(
             fn=set_enable_history, inputs=[enable_history, state], outputs=[state]
         )
     return demo
 gr.close_all()
+demo = create_demo(False, "llama3")
 demo.launch(
     debug=True,
     server_name="0.0.0.0",