sasan committed on
Commit 3ae3815
1 Parent(s): 7b96044

Update vehicle destination in calculate_route function

Files changed (5)
  1. core/__init__.py +176 -0
  2. kitt.py +85 -29
  3. skills/poi.py +1 -1
  4. skills/routing.py +1 -1
  5. skills/weather.py +4 -4
core/__init__.py ADDED
@@ -0,0 +1,176 @@
+ import os
+ from collections import namedtuple
+ import time
+ import pathlib
+ from typing import List
+
+ import numpy as np
+ import torch
+ from TTS.api import TTS
+
+ os.environ["COQUI_TOS_AGREED"] = "1"
+
+
+ Voice = namedtuple("voice", ["name", "neutral", "angry"])
+
+ file_full_path = pathlib.Path(os.path.realpath(__file__)).parent
+
+ voices = [
+     Voice(
+         "Attenborough",
+         neutral=f"{file_full_path}/audio/attenborough/neutral.wav",
+         angry=None,
+     ),
+     Voice("Rick", neutral=f"{file_full_path}/audio/rick/neutral.wav", angry=None),
+     Voice(
+         "Freeman",
+         neutral=f"{file_full_path}/audio/freeman/neutral.wav",
+         angry=f"{file_full_path}/audio/freeman/angry.wav",
+     ),
+     Voice("Walken", neutral=f"{file_full_path}/audio/walken/neutral.wav", angry=None),
+     Voice(
+         "Darth Vader", neutral=f"{file_full_path}/audio/darth/neutral.wav", angry=None
+     ),
+ ]
+
+
+ def load_tts_pipeline():
+     # load model for text to speech
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     # device = "mps"
+     tts_pipeline = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+     return tts_pipeline
+
+
+ def compute_speaker_embedding(voice_path: str, config, pipeline, cache):
+     # compute the conditioning latents for a reference wav once, then reuse them
+     if voice_path not in cache:
+         cache[voice_path] = pipeline.synthesizer.tts_model.get_conditioning_latents(
+             audio_path=voice_path,
+             gpt_cond_len=config.gpt_cond_len,
+             gpt_cond_chunk_len=config.gpt_cond_chunk_len,
+             max_ref_length=config.max_ref_len,
+             sound_norm_refs=config.sound_norm_refs,
+         )
+     return cache[voice_path]
+
+
+ voice_options = []
+ for voice in voices:
+     if voice.neutral:
+         voice_options.append(f"{voice.name} - Neutral")
+     if voice.angry:
+         voice_options.append(f"{voice.name} - Angry")
+
+
+ def voice_from_text(voice):
+     for v in voices:
+         if voice == f"{v.name} - Neutral":
+             return v.neutral
+         if voice == f"{v.name} - Angry":
+             return v.angry
+     raise ValueError(f"Voice {voice} not found.")
+
+
+ def tts(
+     self,
+     text: str = "",
+     language_name: str = "",
+     reference_wav=None,
+     gpt_cond_latent=None,
+     speaker_embedding=None,
+     split_sentences: bool = True,
+     **kwargs,
+ ) -> List[int]:
+     """🐸 TTS magic. Run all the models and generate speech.
+
+     Args:
+         text (str): input text.
+         language_name (str, optional): language id for multi-language models. Defaults to "".
+         reference_wav ([type], optional): reference waveform for voice conversion. Defaults to None.
+         gpt_cond_latent ([type], optional): precomputed GPT conditioning latents for the target voice. Defaults to None.
+         speaker_embedding ([type], optional): precomputed speaker embedding for the target voice. Defaults to None.
+         split_sentences (bool, optional): split the input text into sentences. Defaults to True.
+         **kwargs: additional arguments to pass to the TTS model.
+     Returns:
+         List[int]: audio samples of the generated speech.
+     """
+     start_time = time.time()
+     use_gl = self.vocoder_model is None
+     wavs = []
+
+     if not text and not reference_wav:
+         raise ValueError(
+             "You need to define either `text` (for synthesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
+         )
+
+     if text:
+         sens = [text]
+         if split_sentences:
+             print(" > Text split into sentences.")
+             sens = self.split_into_sentences(text)
+         print(sens)
+
+     if not reference_wav:  # not voice conversion
+         for sen in sens:
+             outputs = self.tts_model.inference(
+                 sen,
+                 language_name,
+                 gpt_cond_latent,
+                 speaker_embedding,
+                 # GPT inference
+                 temperature=0.75,
+                 length_penalty=1.0,
+                 repetition_penalty=10.0,
+                 top_k=50,
+                 top_p=0.85,
+                 do_sample=True,
+                 **kwargs,
+             )
+             waveform = outputs["wav"]
+             if (
+                 torch.is_tensor(waveform)
+                 and waveform.device != torch.device("cpu")
+                 and not use_gl
+             ):
+                 waveform = waveform.cpu()
+             if not use_gl:
+                 waveform = waveform.numpy()
+             waveform = waveform.squeeze()
+
+             # # trim silence
+             # if (
+             #     "do_trim_silence" in self.tts_config.audio
+             #     and self.tts_config.audio["do_trim_silence"]
+             # ):
+             #     waveform = trim_silence(waveform, self.tts_model.ap)
+
+             wavs += list(waveform)
+             wavs += [0] * 10000  # short silence between sentences
+
+     # compute stats
+     process_time = time.time() - start_time
+     audio_time = len(wavs) / self.tts_config.audio["sample_rate"]
+     print(f" > Processing time: {process_time}")
+     print(f" > Real-time factor: {process_time / audio_time}")
+     return wavs
+
+
+ def tts_gradio(tts_pipeline, text, voice, cache):
+     voice_path = voice_from_text(voice)
+     (gpt_cond_latent, speaker_embedding) = compute_speaker_embedding(
+         voice_path, tts_pipeline.synthesizer.tts_config, tts_pipeline, cache
+     )
+     out = tts(
+         tts_pipeline.synthesizer,
+         text,
+         language_name="en",
+         speaker=None,
+         gpt_cond_latent=gpt_cond_latent,
+         speaker_embedding=speaker_embedding,
+         speed=1.1,
+         # file_path="out.wav",
+     )
+     return (22050, np.array(out)), dict(text=text, voice=voice)
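For context, core/__init__.py can be exercised outside the app. A minimal sketch, assuming the reference wavs under core/audio/ are present and Coqui TTS is installed (the example text is illustrative):

    from core import load_tts_pipeline, tts_gradio, voice_options

    tts_pipeline = load_tts_pipeline()  # downloads XTTS v2 on first use
    cache = {}  # conditioning latents are cached per reference wav path
    (rate, wav), meta = tts_gradio(tts_pipeline, "Turn left in 200 meters.", voice_options[0], cache)
    print(rate, len(wav), meta)  # 22050, sample count, {'text': ..., 'voice': ...}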
kitt.py CHANGED
@@ -1,5 +1,12 @@
+ import time
  import gradio as gr
+ import numpy as np
  import requests
+ import torch
+ import torchaudio
+ from transformers import pipeline
+
+

  import skills
  from skills.common import config, vehicle
@@ -21,6 +28,7 @@ from skills import (
      date_time_info
  )
  from skills import extract_func_args
+ from core import voice_options, load_tts_pipeline, tts_gradio


  global_context = {
@@ -29,6 +37,7 @@ global_context = {
      "route_points": [],
  }

+ speaker_embedding_cache = {}

  MODEL_FUNC = "nexusraven"
  MODEL_GENERAL = "llama3:instruct"
@@ -102,7 +111,7 @@ def run_generic_model(query):
      return out["response"]


- def run_model(query):
+ def run_model(query, voice_character):
      print("Query: ", query)
      global_context["query"] = query
      global_context["prompt"] = get_prompt(RAVEN_PROMPT_FUNC, query, "", tools)
@@ -124,11 +133,60 @@ def run_model(query):
          func_name, kwargs = extract_func_args(llm_response)
          print(f"Function: {func_name}, Args: {kwargs}")
          if func_name == "do_anything_else":
-             return run_generic_model(query)
-
-         return use_tool(func_name, kwargs, tools)
-     return out["response"]
+             output_text = run_generic_model(query)
+         else:
+             output_text = use_tool(func_name, kwargs, tools)
+     else:
+         output_text = out["response"]
+
+     if isinstance(output_text, tuple):
+         output_text = output_text[0]
+     gr.Info(f"Output text: {output_text}, generating voice output...")
+     return output_text, tts_gradio(tts_pipeline, output_text, voice_character, speaker_embedding_cache)[0]
+
+
+ def calculate_route_gradio(origin, destination):
+     plot, vehicle_status, points = calculate_route(origin, destination)
+     global_context["route_points"] = points
+     vehicle.location_coordinates = points[0]["latitude"], points[0]["longitude"]
+     return plot, vehicle_status
+
+
+ def update_vehicle_status(trip_progress):
+     n_points = len(global_context["route_points"])
+     # clamp with min() so that 100% progress maps to the last route point
+     new_coords = global_context["route_points"][min(int(trip_progress / 100 * n_points), n_points - 1)]
+     new_coords = new_coords["latitude"], new_coords["longitude"]
+     print(f"Trip progress: {trip_progress}, len: {n_points}, new_coords: {new_coords}")
+     vehicle.location_coordinates = new_coords
+     return vehicle.model_dump_json()
+

+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=device)
+
+
+ def save_audio_as_wav(data, sample_rate, file_path):
+     # make a tensor from the numpy array
+     data = torch.tensor(data).reshape(1, -1)
+     torchaudio.save(file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding="PCM_S")
+
+
+ def save_and_transcribe_audio(audio):
+     # capture the audio and save it to a wav file
+     sr, y = audio
+
+     # add timestamp to file name; assumes the recordings/ directory exists
+     filename = f"recordings/audio{time.time()}.wav"
+     save_audio_as_wav(y, sr, filename)
+
+     # normalize to float32 in [-1, 1], as the whisper pipeline expects
+     y = y.astype(np.float32)
+     y /= np.max(np.abs(y))
+     text = transcriber({"sampling_rate": sr, "raw": y})["text"]
+     return text

  # to be able to use the microphone on chrome, you will have to go to chrome://flags/#unsafely-treat-insecure-origin-as-secure and enter http://10.186.115.21:7860/
  # in "Insecure origins treated as secure", enable it and relaunch chrome
@@ -138,6 +196,9 @@ def run_model(query):
  # What's the closest restaurant from here?


+ tts_pipeline = load_tts_pipeline()
+
+
  with gr.Blocks(theme=gr.themes.Default()) as demo:
      state = gr.State(
          value={
@@ -146,6 +207,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
              "route_points": [],
          }
      )
+     trip_points = gr.State(value=[])

      with gr.Row():
          with gr.Column(scale=1, min_width=300):
@@ -161,24 +223,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
                  value="No",
                  interactive=True,
              )
-             voice_character = gr.Radio(
-                 choices=[
-                     "Morgan Freeman",
-                     "Eddie Murphy",
-                     "David Attenborough",
-                     "Rick Sanches",
-                 ],
-                 label="Choose a voice",
-                 value="Morgan Freeman",
-                 show_label=True,
-                 interactive=True,
-             )
-             emotion = gr.Radio(
-                 choices=["Cheerful", "Grumpy"],
-                 label="Choose an emotion",
-                 value="Cheerful",
-                 show_label=True,
-             )
+             voice_character = gr.Radio(choices=voice_options, label="Choose a voice", value=voice_options[0], show_label=True)
              origin = gr.Textbox(
                  value="Luxembourg Gare, Luxembourg", label="Origin", interactive=True
              )
@@ -190,13 +235,14 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:

          with gr.Column(scale=2, min_width=600):
              map_plot = gr.Plot()
+             trip_progress = gr.Slider(0, 100, step=5, label="Trip progress", interactive=True)

      # map_if = gr.Interface(fn=plot_map, inputs=year_input, outputs=map_plot)

      with gr.Row():
          with gr.Column():
-             recorder = gr.Audio(
-                 type="filepath", label="Input audio", elem_id="recorder"
+             input_audio = gr.Audio(
+                 type="numpy", sources=["microphone"], label="Input audio", elem_id="input_audio"
              )
              input_text = gr.Textbox(
                  value="How is the weather?", label="Input text", interactive=True
@@ -205,7 +251,7 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
              value=vehicle.model_dump_json(), label="Vehicle status"
          )
          with gr.Column():
-             output_audio = gr.Audio(label="output audio")
+             output_audio = gr.Audio(label="output audio", autoplay=True)
              output_text = gr.TextArea(value="", label="Output text", interactive=False)
          # iface = gr.Interface(
          #     fn=transcript,
@@ -226,12 +272,12 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
      # Update plot based on the origin and destination
      # Sets the current location and destination
      origin.submit(
-         fn=calculate_route,
+         fn=calculate_route_gradio,
          inputs=[origin, destination],
          outputs=[map_plot, vehicle_status],
      )
      destination.submit(
-         fn=calculate_route,
+         fn=calculate_route_gradio,
          inputs=[origin, destination],
          outputs=[map_plot, vehicle_status],
      )
@@ -240,7 +286,17 @@ with gr.Blocks(theme=gr.themes.Default()) as demo:
      time_picker.select(fn=set_time, inputs=[time_picker], outputs=[vehicle_status])

      # Run the model if the input text is changed
-     input_text.submit(fn=run_model, inputs=[input_text], outputs=[output_text])
+     input_text.submit(fn=run_model, inputs=[input_text, voice_character], outputs=[output_text, output_audio])
+
+     # Set the vehicle status based on the trip progress
+     trip_progress.release(
+         fn=update_vehicle_status, inputs=[trip_progress], outputs=[vehicle_status]
+     )
+
+     # Save and transcribe the audio
+     input_audio.stop_recording(
+         fn=save_and_transcribe_audio, inputs=[input_audio], outputs=[input_text]
+     )

  # close all interfaces open to make the port available
  gr.close_all()
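A note on update_vehicle_status in kitt.py above: the slider percentage is mapped to a route-point index, and the index is clamped with min() so that 100% selects the last point instead of indexing past the end of the list. A worked example with a hypothetical 4-point route:

    n_points = 4
    for trip_progress in (0, 25, 50, 100):
        idx = min(int(trip_progress / 100 * n_points), n_points - 1)
        print(trip_progress, idx)  # 0 -> 0, 25 -> 1, 50 -> 2, 100 -> 3 (clamped)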
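Likewise, save_and_transcribe_audio depends on Gradio's numpy audio format: with type="numpy" the component returns a (sample_rate, int16 array) tuple, while the Whisper pipeline expects float32 samples in [-1, 1]. A small sketch of the conversion, with a synthetic buffer standing in for a real recording:

    import numpy as np

    sr = 48000
    y = (np.random.rand(sr) * 32767).astype(np.int16)  # stand-in for a 1-second clip
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))  # float32 in [-1, 1], ready for transcriber({"sampling_rate": sr, "raw": y})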
skills/poi.py CHANGED
@@ -86,7 +86,7 @@ def find_points_of_interest(lat="0", lon="0", type_of_poi="restaurant"):

      # Format and limit to top 5 results
      formatted_results = [
-         f"The {type_of_poi} {result['poi']['name']} is {int(result['dist'])} meters away"
+         f"The {type_of_poi} {result['poi']['name']}, {int(result['dist'])} meters away"
          for result in sorted_results[:5]
      ]

skills/routing.py CHANGED
@@ -64,7 +64,7 @@ def calculate_route(origin, destination):
      data = response.json()
      points = data["routes"][0]["legs"][0]["points"]

-     return plot_route(points), vehicle.model_dump_json()
+     return plot_route(points), vehicle.model_dump_json(), points


  def find_route_tomtom(
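Since calculate_route now returns three values, any caller still unpacking two will fail at runtime. A sketch of the change needed at a call site (the origin/destination strings are illustrative):

    # plot, status = calculate_route("Luxembourg Gare", "Thionville")  # old 2-tuple unpacking now raises ValueError
    plot, status, points = calculate_route("Luxembourg Gare", "Thionville")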
skills/weather.py CHANGED
@@ -39,12 +39,12 @@ def get_weather(location: str = ""):
      humidity = weather_data['current']['humidity']
      feelslike_c = weather_data['current']['feelslike_c']

-     # Formulate the sentences
+     # Formulate the sentences - {region}, {country}
      weather_sentences = (
-         f"The current weather in {location}, {region}, {country} is {condition_text} "
+         f"The current weather in {location} is {condition_text} "
          f"with a temperature of {temperature_c}°C that feels like {feelslike_c}°C. "
-         f"Humidity is at {humidity}%. "
-         f"Wind speed is {wind_kph} kph." if 'wind_kph' in weather_data['current'] else ""
+         # f"Humidity is at {humidity}%. "
+         # f"Wind speed is {wind_kph} kph." if 'wind_kph' in weather_data['current'] else ""
      )
      return weather_sentences, weather_data
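The humidity and wind fragments are commented out rather than kept conditional for a reason: in an implicit concatenation of adjacent string literals, a trailing ternary binds to the whole parenthesized expression, not just the last fragment, so the old code returned an empty string whenever the condition was false. A minimal illustration with hypothetical values:

    have_wind = False
    s = (
        "Sunny. "
        "Mild. "
        "Windy." if have_wind else ""
    )
    print(repr(s))  # '': the ternary consumed all three fragments, not just the last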