Spaces:
Sleeping
Sleeping
jmd-pintor
committed on
Commit
•
e12e2bc
1
Parent(s):
670d9ee
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from transformers import AutoTokenizer, VitsModel
|
2 |
-
from transformers import pipeline
|
3 |
import gradio as gr
|
4 |
import torch
|
5 |
import numpy as np
|
@@ -10,10 +10,10 @@ def talk_to_alexa(audio):
|
|
10 |
transcribed_text = convert_speech_to_text(audio)
|
11 |
|
12 |
# Get Alexa's response
|
13 |
-
|
14 |
|
15 |
# Convert text to speech
|
16 |
-
audio_output = convert_text_to_speech(
|
17 |
|
18 |
return audio_output
|
19 |
|
@@ -26,9 +26,22 @@ def convert_speech_to_text(speech_inputs):
|
|
26 |
|
27 |
return transcribed_text
|
28 |
|
29 |
-
def
|
30 |
-
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
def convert_text_to_speech(text):
|
34 |
inputs = tts_tokenizer(text, return_tensors="pt")
|
@@ -40,15 +53,30 @@ def convert_text_to_speech(text):
|
|
40 |
|
41 |
|
42 |
# Speech To Text
|
43 |
-
# TODO: Replace with whisper-large-v3 once deployed
|
44 |
pipe = pipeline("automatic-speech-recognition",
|
45 |
"openai/whisper-large-v3",
|
46 |
-
torch_dtype=torch.
|
47 |
-
|
48 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
|
50 |
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
51 |
|
|
|
52 |
iface = gr.Interface(fn=talk_to_alexa,
|
53 |
inputs=[gr.Audio(sources=["microphone", "upload"], type="filepath"),],
|
54 |
outputs="audio",
|
|
|
1 |
from transformers import AutoTokenizer, VitsModel
|
2 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
3 |
import gradio as gr
|
4 |
import torch
|
5 |
import numpy as np
|
|
|
10 |
transcribed_text = convert_speech_to_text(audio)
|
11 |
|
12 |
# Get Alexa's response
|
13 |
+
instruction_response = get_instruction_response(transcribed_text)
|
14 |
|
15 |
# Convert text to speech
|
16 |
+
audio_output = convert_text_to_speech(instruction_response)
|
17 |
|
18 |
return audio_output
|
19 |
|
|
|
26 |
|
27 |
return transcribed_text
|
28 |
|
29 |
+
def get_instruction_response(text) -> str:
    """Generate the assistant's text reply for a transcribed user request.

    The request is wrapped as a single ``user`` chat message and passed to
    the module-level ``instruction_pipe`` text-generation pipeline. The
    generated reply is printed to stdout and then returned.

    Args:
        text: The user's request as text (the caller passes the
            speech-to-text transcription).

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    chat = [{"role": "user", "content": text}]

    # Sampling is disabled, and return_full_text=False asks the pipeline
    # for the newly generated text only rather than prompt + completion.
    result = instruction_pipe(
        chat,
        max_new_tokens=500,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
    )

    reply = result[0]['generated_text']
    print(reply)  # echo the reply to stdout (visible in the app logs)
    return reply
|
45 |
|
46 |
def convert_text_to_speech(text):
|
47 |
inputs = tts_tokenizer(text, return_tensors="pt")
|
|
|
53 |
|
54 |
|
55 |
# Speech To Text
|
|
|
56 |
pipe = pipeline("automatic-speech-recognition",
|
57 |
"openai/whisper-large-v3",
|
58 |
+
torch_dtype=torch.float32)
|
59 |
+
|
60 |
+
# Instruction Response
|
61 |
+
instr_model = AutoModelForCausalLM.from_pretrained(
|
62 |
+
"microsoft/Phi-3-mini-128k-instruct",
|
63 |
+
device_map="cuda",
|
64 |
+
torch_dtype="auto",
|
65 |
+
trust_remote_code=True,
|
66 |
+
)
|
67 |
+
instr_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
|
68 |
+
|
69 |
+
instruction_pipe = pipeline(
|
70 |
+
"text-generation",
|
71 |
+
model=instr_model,
|
72 |
+
tokenizer=instr_tokenizer,
|
73 |
+
)
|
74 |
+
|
75 |
+
# Text to Speech
|
76 |
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
|
77 |
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
|
78 |
|
79 |
+
# Interface
|
80 |
iface = gr.Interface(fn=talk_to_alexa,
|
81 |
inputs=[gr.Audio(sources=["microphone", "upload"], type="filepath"),],
|
82 |
outputs="audio",
|