jmd-pintor committed
Commit
e12e2bc
Parent: 670d9ee

Update app.py

Files changed (1)
  1. app.py +38 -10
app.py CHANGED
@@ -1,5 +1,5 @@
  from transformers import AutoTokenizer, VitsModel
- from transformers import pipeline
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  import gradio as gr
  import torch
  import numpy as np
@@ -10,10 +10,10 @@ def talk_to_alexa(audio):
      transcribed_text = convert_speech_to_text(audio)

      # Get Alexa's response
-     alexa_response = get_alexa_response(transcribed_text)
+     instruction_response = get_instruction_response(transcribed_text)

      # Convert text to speech
-     audio_output = convert_text_to_speech(alexa_response)
+     audio_output = convert_text_to_speech(instruction_response)

      return audio_output

@@ -26,9 +26,22 @@ def convert_speech_to_text(speech_inputs):

      return transcribed_text

- def get_alexa_response(text) -> str:
-     # TODO: Complete this function
-     return text
+ def get_instruction_response(text) -> str:
+     generation_args = {
+         "max_new_tokens": 500,
+         "return_full_text": False,
+         "temperature": 0.0,
+         "do_sample": False,
+     }
+
+     messages = [
+         {"role": "user", "content": text},
+     ]
+
+     output = instruction_pipe(messages, **generation_args)
+     print(output[0]['generated_text'])
+
+     return output[0]['generated_text']

  def convert_text_to_speech(text):
      inputs = tts_tokenizer(text, return_tensors="pt")
@@ -40,15 +53,30 @@ def convert_text_to_speech(text):


  # Speech To Text
- # TODO: Replace with whisper-large-v3 once deployed
  pipe = pipeline("automatic-speech-recognition",
                  "openai/whisper-large-v3",
-                 torch_dtype=torch.float16)
-
- # # Text to Speech
+                 torch_dtype=torch.float32)
+
+ # Instruction Response
+ instr_model = AutoModelForCausalLM.from_pretrained(
+     "microsoft/Phi-3-mini-128k-instruct",
+     device_map="cuda",
+     torch_dtype="auto",
+     trust_remote_code=True,
+ )
+ instr_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
+
+ instruction_pipe = pipeline(
+     "text-generation",
+     model=instr_model,
+     tokenizer=instr_tokenizer,
+ )
+
+ # Text to Speech
  tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
  tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")

+ # Interface
  iface = gr.Interface(fn=talk_to_alexa,
                       inputs=[gr.Audio(sources=["microphone", "upload"], type="filepath"),],
                       outputs="audio",
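
For context, the hunks above call two helpers whose full bodies fall outside the diff: `convert_speech_to_text` and the remainder of `convert_text_to_speech`. Below is a minimal sketch of how they could be written against the objects this commit defines (`pipe`, `tts_tokenizer`, `tts_model`); the committed implementations may differ.

```python
# Illustrative sketch only; the actual bodies in app.py are outside this diff.
import numpy as np
import torch

def convert_speech_to_text(speech_inputs):
    # `pipe` is the whisper-large-v3 ASR pipeline defined in app.py; with
    # type="filepath", Gradio passes the recording in as a file path.
    return pipe(speech_inputs)["text"]

def convert_text_to_speech(text):
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform
    # gr.Audio output accepts a (sample_rate, np.ndarray) tuple.
    audio = waveform.squeeze().cpu().numpy().astype(np.float32)
    return tts_model.config.sampling_rate, audio
```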
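The new text-generation path can also be exercised on its own. Note that with `do_sample=False` decoding is greedy, so the `temperature: 0.0` entry in `generation_args` has no effect. A hypothetical smoke test, assuming the objects above are loaded:

```python
# Hypothetical smoke test for the instruction pipeline defined above.
# A chat-style messages list makes the pipeline apply the model's chat
# template; return_full_text=False strips the prompt from the output.
messages = [{"role": "user", "content": "Name three primary colors."}]
out = instruction_pipe(messages, max_new_tokens=50,
                       return_full_text=False, do_sample=False)
print(out[0]["generated_text"])
```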