gdnartea committed
Commit c27a4b2 · verified · 1 Parent(s): 4c8ddf5

Update app.py

Files changed (1): app.py (+111, -17)
app.py CHANGED
@@ -1,34 +1,128 @@
- from transformers import AutoTokenizer, VitsModel, set_seed
  import gradio as gr
- import torch
  import soundfile as sf


- # Initialize the VITS model, tokenizer, and seed
- vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
- vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
- set_seed(2020)


- def text_to_speech(text_response):
-     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
-     with torch.no_grad():
-         outputs = vits_model(**inputs)
-     waveform = outputs.waveform[0]
-     sf.write('output.wav', waveform.numpy(), vits_model.config.sampling_rate)
-     return 'output.wav'


  # Create a Gradio interface
  iface = gr.Interface(
-     fn=text_to_speech,
-     inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
-     outputs=gr.Audio("response.wav")
  )

  # Launch the interface
- iface.launch()
 
+ # imports
  import gradio as gr
+ import json
+ import librosa
+ import os
  import soundfile as sf
+ import tempfile
+ import uuid
+ import torch
+ from transformers import AutoTokenizer, VitsModel, set_seed, AutoModelForCausalLM, pipeline

+ from nemo.collections.asr.models import ASRModel
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+ from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED


+ # text-generation model (Phi-3 mini) used to answer the transcribed query
+ torch.random.manual_seed(0)
+ proc_model_name = "SanctumAI/Phi-3-mini-4k-instruct-GGUF"
+ proc_model = AutoModelForCausalLM.from_pretrained(proc_model_name)
+ proc_model.to("cpu")
+ proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)


+ SAMPLE_RATE = 16000  # Hz
+ MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
+
+ # speech-to-text model (NVIDIA Canary 1B)
+ model = ASRModel.from_pretrained("nvidia/canary-1b")
+ model.eval()
+
+ # make sure beam size is always 1 for consistency
+ model.change_decoding_strategy(None)
+ decoding_cfg = model.cfg.decoding
+ decoding_cfg.beam.beam_size = 1
+ model.change_decoding_strategy(decoding_cfg)
+
+ amp_dtype = torch.float16
+
+
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     # load as mono at the native sample rate, then resample to 16 kHz if needed
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+     duration = librosa.get_duration(y=data, sr=sr)
+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+     out_filename = os.path.join(tmpdir, utt_id + '.wav')
+     # save output audio
+     sf.write(out_filename, data, SAMPLE_RATE)
+     return out_filename, duration
+
+
+ def transcribe(audio_filepath):
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+         # make manifest file and save
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": "en",
+             "target_lang": "en",
+             "taskname": "asr",
+             "pnc": "no",
+             "answer": "predict",
+             "duration": str(duration),
+         }
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+         with open(manifest_filepath, 'w') as fout:
+             line = json.dumps(manifest_data)
+             fout.write(line + '\n')
+
+         output_text = model.transcribe(manifest_filepath)[0]
+
+     return output_text
+
+
+ start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}
+
+
+ def generate_response(user_input):
+     messages = [start, {"role": "user", "content": user_input}]
+     inputs = proc_tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
+     with torch.no_grad():
+         outputs = proc_model.generate(
+             inputs,
+             max_new_tokens=48,
+         )
+     response = proc_tokenizer.batch_decode(
+         outputs,
+         skip_special_tokens=True,
+         clean_up_tokenization_spaces=False,
+     )[0]
+     return response
+
+
+ # chain the two models: speech -> transcript -> generated reply
+ def CanaryPhi(audio_filepath):
+     user_input = transcribe(audio_filepath)
+     response = generate_response(user_input)
+     return response


  # Create a Gradio interface
  iface = gr.Interface(
+     fn=CanaryPhi,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
  )

  # Launch the interface
+ iface.launch()
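
As committed, the Phi-3 checkpoint is a GGUF repository, and AutoModelForCausalLM.from_pretrained() on a GGUF repo generally needs a gguf_file argument naming one specific quantization file (supported in transformers >= 4.41; AutoTokenizer.from_pretrained takes the same argument). A hedged sketch, where the filename is an unverified assumption rather than part of this commit:

    # Assumes transformers >= 4.41; the exact .gguf filename inside the
    # SanctumAI repo is a guess and must be checked against the repo listing.
    proc_model = AutoModelForCausalLM.from_pretrained(
        proc_model_name,
        gguf_file="phi-3-mini-4k-instruct.Q4_K_M.gguf",  # hypothetical
    )

For a quick check of the new speech -> text -> reply chain without the UI, a minimal sketch, assuming a local recording sample.wav (an illustrative filename, not a file in this repo); since iface.launch() blocks, run it before the launch call or in an interactive session:

    user_text = transcribe("sample.wav")    # Canary ASR via a temporary manifest
    reply = generate_response(user_text)    # Phi-3 answers the transcript
    print(user_text, "->", reply)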