Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,9 @@ from llama_index.llms.text_generation_inference import TextGenerationInference
|
|
8 |
import whisper
|
9 |
import gradio as gr
|
10 |
from gtts import gTTS
|
11 |
-
|
|
|
|
|
12 |
model = whisper.load_model("base")
|
13 |
HF_API_TOKEN = os.getenv("HF_TOKEN")
|
14 |
|
@@ -26,11 +28,27 @@ def translate_audio(audio):
|
|
26 |
result = whisper.decode(model, mel, options)
|
27 |
return result.text
|
28 |
|
29 |
-
def audio_response(
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
def messages_to_prompt(messages):
|
36 |
# Default system message for a chatbot
|
|
|
8 |
import whisper
|
9 |
import gradio as gr
|
10 |
from gtts import gTTS
|
11 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
12 |
+
import soundfile as sf
|
13 |
+
from datasets import load_dataset
|
14 |
model = whisper.load_model("base")
|
15 |
HF_API_TOKEN = os.getenv("HF_TOKEN")
|
16 |
|
|
|
28 |
result = whisper.decode(model, mel, options)
|
29 |
return result.text
|
30 |
|
31 |
+
# Text-to-speech assets, populated on first use by _load_tts_assets().
_TTS_ASSETS = None

# Sample rate handed to soundfile when writing the generated waveform.
_TTS_SAMPLE_RATE = 16000


def _load_tts_assets():
    """Return the cached (processor, tts_model, vocoder, speaker_embeddings) tuple.

    The pretrained checkpoints and the speaker x-vector are loaded the first
    time this is called and reused afterwards, so repeated calls to
    ``audio_response`` do not reload them.
    """
    global _TTS_ASSETS
    if _TTS_ASSETS is None:
        # Function-scope import keeps this block self-contained; the file-level
        # import of torch is not visible from this part of the file.
        import torch

        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        # Load xvector containing speaker's voice characteristics
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        # Assign only after everything loaded, so a failure part-way through
        # leaves the cache empty and the next call retries from scratch.
        _TTS_ASSETS = (processor, tts_model, vocoder, speaker_embeddings)
    return _TTS_ASSETS


def audio_response(text, output_path="speech.wav"):
    """Synthesize *text* to speech with SpeechT5 and write it to *output_path*.

    Args:
        text: The text to convert to speech.
        output_path: Destination audio file path; defaults to "speech.wav".

    Returns:
        The ``output_path`` that the audio was written to.
    """
    import torch  # see note in _load_tts_assets

    # Named tts_model (not model) so it does not shadow the module-level
    # Whisper ``model`` used for transcription.
    processor, tts_model, vocoder, speaker_embeddings = _load_tts_assets()

    # Process the input text
    # NOTE(review): very long text may exceed the model's input limit and need
    # chunking before synthesis — confirm against the SpeechT5 documentation.
    inputs = processor(text=text, return_tensors="pt")

    # Generate speech
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

    # Save the audio to a file
    sf.write(output_path, speech.numpy(), samplerate=_TTS_SAMPLE_RATE)

    return output_path
|
52 |
|
53 |
def messages_to_prompt(messages):
|
54 |
# Default system message for a chatbot
|