Spaces:

sasan
/

KITT

Build error

App Files Files Community

sasan commited on Jun 18, 2024

Commit

42c9935

1 Parent(s): 65dafa6

remove nexus and refactor stt

Browse files

Files changed (2) hide show

kitt/core/stt.py +39 -0
main.py +9 -195

kitt/core/stt.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import copy
+import time
+import gradio as gr
+import numpy as np
+import torch
+import torchaudio
+from loguru import logger
+from transformers import pipeline
+device = "cuda" if torch.cuda.is_available() else "cpu"
+transcriber = pipeline(
+    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
+)
+def save_audio_as_wav(data, sample_rate, file_path):
+    # make a tensor from the numpy array
+    data = torch.tensor(data).reshape(1, -1)
+    torchaudio.save(
+        file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding="PCM_S"
+    )
+def save_and_transcribe_audio(audio):
+    sample_rate, data = audio
+    try:
+        # add timestamp to file name
+        filename = f"recordings/audio{time.time()}.wav"
+        save_audio_as_wav(data, sample_rate, filename)
+        data = data.astype(np.float32)
+        data /= np.max(np.abs(data))
+        text = transcriber({"sampling_rate": sample_rate, "raw": data})["text"]
+        gr.Info(f"Transcribed text is: {text}\nProcessing the input...")
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        raise Exception("Error transcribing audio.")
+    return text

main.py CHANGED Viewed

@@ -1,24 +1,14 @@
-import time
 import gradio as gr
-import numpy as np
-import ollama
-import torch
-import torchaudio
-import typer
 from langchain.memory import ChatMessageHistory
 from langchain.tools import tool
-from langchain.tools.base import StructuredTool
 from langchain_core.utils.function_calling import convert_to_openai_tool
 from loguru import logger
-from transformers import pipeline
 from kitt.core import tts_gradio
 from kitt.core import utils as kitt_utils
 from kitt.core import voice_options
-# from kitt.core.model import process_query
 from kitt.core.model import generate_function_call as process_query
 from kitt.core.tts import prep_for_tts, run_melo_tts, run_tts_fast, run_tts_replicate
 from kitt.skills import (
     code_interpreter,
@@ -34,7 +24,6 @@ from kitt.skills import (
     set_vehicle_destination,
     set_vehicle_speed,
 )
-from kitt.skills import vehicle_status as vehicle_status_fn
 from kitt.skills.common import config, vehicle
 from kitt.skills.routing import calculate_route, find_address
@@ -65,54 +54,6 @@ global_context = {
 speaker_embedding_cache = {}
 history = ChatMessageHistory()
-MODEL_FUNC = "nexusraven"
-MODEL_GENERAL = "llama3:instruct"
-RAVEN_PROMPT_FUNC = """You are a helpful AI assistant in a car (vehicle), that follows instructions extremely well. \
-Answer questions concisely and do not mention what you base your reply on."
-{raven_tools}
-{history}
-User Query: Question: {input}<human_end>
-"""
-HERMES_PROMPT_FUNC = """
-<|im_start|>system
-You are a helpful AI assistant in a car (vehicle), that follows instructions extremely well. \
-Answer questions concisely and do not mention what you base your reply on.<|im_end|>
-<|im_start|>user
-{{ .Prompt }}<|im_end|>
-<|im_start|>assistant
-"""
-def get_prompt(template, input, history, tools):
-    # "vehicle_status": vehicle_status_fn()[0]
-    kwargs = {"history": history, "input": input}
-    prompt = "<human>:\n"
-    for tool in tools:
-        func_signature, func_docstring = tool.description.split(" - ", 1)
-        prompt += f'Function:\n<func_start>def {func_signature}<func_end>\n<docstring_start>\n"""\n{func_docstring}\n"""\n<docstring_end>\n'
-    kwargs["raven_tools"] = prompt
-    if history:
-        kwargs["history"] = f"Previous conversation history:{history}\n"
-    return template.format(**kwargs).replace("{{", "{").replace("}}", "}")
-def use_tool(func_name, kwargs, tools):
-    for tool in tools:
-        if tool.name == func_name:
-            return tool.invoke(input=kwargs)
-    return None
-# llm = Ollama(model="nexusraven", stop=["\nReflection:", "\nThought:"], keep_alive=60*10)
 # Generate options for hours (00-23)
 hour_options = [f"{i:02d}:00:00" for i in range(24)]
@@ -136,20 +77,6 @@ def set_time(time_picker):
     return vehicle
-tools = [
-    # StructuredTool.from_function(get_weather),
-    # StructuredTool.from_function(find_route),
-    # StructuredTool.from_function(vehicle_status_fn),
-    # StructuredTool.from_function(set_vehicle_speed),
-    # StructuredTool.from_function(set_vehicle_destination),
-    # StructuredTool.from_function(search_points_of_interest),
-    # StructuredTool.from_function(search_along_route),
-    # StructuredTool.from_function(date_time_info),
-    # StructuredTool.from_function(get_weather_current_location),
-    # StructuredTool.from_function(code_interpreter),
-    # StructuredTool.from_function(do_anything_else),
-]
 functions = [
     # set_vehicle_speed,
     set_vehicle_destination,
@@ -161,59 +88,11 @@ functions = [
 openai_tools = [convert_to_openai_tool(tool) for tool in functions]
-def run_generic_model(query):
-    print(f"Running the generic model with query: {query}")
-    data = {
-        "prompt": f"Answer the question below in a short and concise manner.\n{query}",
-        "model": MODEL_GENERAL,
-        "options": {
-            # "temperature": 0.1,
-            # "stop":["\nReflection:", "\nThought:"]
-        },
-    }
-    out = ollama.generate(**data)
-    return out["response"]
 def clear_history():
     logger.info("Clearing the conversation history...")
     history.clear()
-def run_nexusraven_model(query, voice_character, state):
-    global_context["prompt"] = get_prompt(RAVEN_PROMPT_FUNC, query, "", tools)
-    print("Prompt: ", global_context["prompt"])
-    data = {
-        "prompt": global_context["prompt"],
-        # "streaming": False,
-        "model": "nexusraven",
-        # "model": "smangrul/llama-3-8b-instruct-function-calling",
-        "raw": True,
-        "options": {"temperature": 0.5, "stop": ["\nReflection:", "\nThought:"]},
-    }
-    out = ollama.generate(**data)
-    llm_response = out["response"]
-    if "Call: " in llm_response:
-        print(f"llm_response: {llm_response}")
-        llm_response = llm_response.replace("<bot_end>", " ")
-        func_name, kwargs = extract_func_args(llm_response)
-        print(f"Function: {func_name}, Args: {kwargs}")
-        if func_name == "do_anything_else":
-            output_text = run_generic_model(query)
-        else:
-            output_text = use_tool(func_name, kwargs, tools)
-    else:
-        output_text = out["response"]
-    if type(output_text) == tuple:
-        output_text = output_text[0]
-    gr.Info(f"Output text: {output_text}\nGenerating voice output...")
-    return (
-        output_text,
-        tts_gradio(output_text, voice_character, speaker_embedding_cache)[0],
-    )
 def run_llama3_model(query, voice_character, state):
     assert len(functions) > 0, "No functions to call"
@@ -249,18 +128,13 @@ def run_llama3_model(query, voice_character, state):
 def run_model(query, voice_character, state):
-    model = state.get("model", "nexusraven")
     query = query.strip().replace("'", "")
     logger.info(
         f"Running model: {model} with query: {query}, voice_character: {voice_character} and llm_backend: {state['llm_backend']}, tts_enabled: {state['tts_enabled']}"
     )
     global_context["query"] = query
-    if model == "nexusraven":
-        text, voice = run_nexusraven_model(query, voice_character, state)
-    elif model == "llama3":
-        text, voice = run_llama3_model(query, voice_character, state)
-    else:
-        text, voice = "Error running model", None
     if not state["enable_history"]:
         history.clear()
@@ -308,44 +182,6 @@ def update_vehicle_status(trip_progress, origin, destination, state):
     return vehicle, plot, state
-device = "cuda" if torch.cuda.is_available() else "cpu"
-transcriber = pipeline(
-    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
-)
-def save_audio_as_wav(data, sample_rate, file_path):
-    # make a tensor from the numpy array
-    data = torch.tensor(data).reshape(1, -1)
-    torchaudio.save(
-        file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding="PCM_S"
-    )
-def save_and_transcribe_audio(audio):
-    try:
-        # capture the audio and save it to a file as wav or mp3
-        # file_name = save("audioinput.wav")
-        sr, y = audio
-        # y = y.astype(np.float32)
-        # y /= np.max(np.abs(y))
-        # add timestamp to file name
-        filename = f"recordings/audio{time.time()}.wav"
-        save_audio_as_wav(y, sr, filename)
-        sr, y = audio
-        y = y.astype(np.float32)
-        y /= np.max(np.abs(y))
-        text = transcriber({"sampling_rate": sr, "raw": y})["text"]
-        gr.Info(f"Transcribed text is: {text}\nProcessing the input...")
-    except Exception as e:
-        logger.error(f"Error: {e}")
-        raise Exception("Error transcribing audio.")
-    return text
 def save_and_transcribe_run_model(audio, voice_character, state):
     text = save_and_transcribe_audio(audio)
     out_text, out_voice, vehicle_status, state, update_proxy = run_model(
@@ -494,7 +330,12 @@ def create_demo(tts_server: bool = False, model="llama3"):
                     0, 100, step=5, label="Trip progress", interactive=True
                 )
-                # map_if = gr.Interface(fn=plot_map, inputs=year_input, outputs=map_plot)
         with gr.Row():
             with gr.Column():
@@ -647,30 +488,3 @@ demo.launch(
     ssl_verify=False,
     share=False,
 )
-app = typer.Typer()
-@app.command()
-def run(tts_server: bool = False):
-    global demo
-    demo = create_demo(tts_server)
-    demo.launch(
-        debug=True, server_name="0.0.0.0", server_port=7860, ssl_verify=True, share=True
-    )
-@app.command()
-def dev(tts_server: bool = False, model: str = "llama3"):
-    demo = create_demo(tts_server, model)
-    demo.launch(
-        debug=True,
-        server_name="0.0.0.0",
-        server_port=7860,
-        ssl_verify=False,
-        share=False,
-    )
-if __name__ == "__main__":
-    app()

 import gradio as gr
 from langchain.memory import ChatMessageHistory
 from langchain.tools import tool
 from langchain_core.utils.function_calling import convert_to_openai_tool
 from loguru import logger
 from kitt.core import tts_gradio
 from kitt.core import utils as kitt_utils
 from kitt.core import voice_options
 from kitt.core.model import generate_function_call as process_query
+from kitt.core.stt import save_and_transcribe_audio
 from kitt.core.tts import prep_for_tts, run_melo_tts, run_tts_fast, run_tts_replicate
 from kitt.skills import (
     code_interpreter,
     set_vehicle_destination,
     set_vehicle_speed,
 )
 from kitt.skills.common import config, vehicle
 from kitt.skills.routing import calculate_route, find_address
 speaker_embedding_cache = {}
 history = ChatMessageHistory()
 # Generate options for hours (00-23)
 hour_options = [f"{i:02d}:00:00" for i in range(24)]
     return vehicle
 functions = [
     # set_vehicle_speed,
     set_vehicle_destination,
 openai_tools = [convert_to_openai_tool(tool) for tool in functions]
 def clear_history():
     logger.info("Clearing the conversation history...")
     history.clear()
 def run_llama3_model(query, voice_character, state):
     assert len(functions) > 0, "No functions to call"
 def run_model(query, voice_character, state):
+    model = state.get("model", "llama3")
     query = query.strip().replace("'", "")
     logger.info(
         f"Running model: {model} with query: {query}, voice_character: {voice_character} and llm_backend: {state['llm_backend']}, tts_enabled: {state['tts_enabled']}"
     )
     global_context["query"] = query
+    text, voice = run_llama3_model(query, voice_character, state)
     if not state["enable_history"]:
         history.clear()
     return vehicle, plot, state
 def save_and_transcribe_run_model(audio, voice_character, state):
     text = save_and_transcribe_audio(audio)
     out_text, out_voice, vehicle_status, state, update_proxy = run_model(
                     0, 100, step=5, label="Trip progress", interactive=True
                 )
+            # with gr.Column(scale=1, min_width=300):
+            #     gr.Image("linkedin-1.png", label="Linkedin - Sasan Jafarnejad")
+            #     gr.Image(
+            #         "team-ubix.png",
+            #         label="Research Team - UBIX - University of Luxembourg",
+            #     )
         with gr.Row():
             with gr.Column():
     ssl_verify=False,
     share=False,
 )