gdnartea committed on
Commit 4c8ddf5 · verified · 1 Parent(s): b3025ce

Update app.py

Files changed (1):
  1. app.py +17 -116
app.py CHANGED
@@ -1,133 +1,34 @@
-# imports
 import gradio as gr
-import json
-import librosa
-import os
-import soundfile as sf
-import tempfile
-import uuid
 import torch
-from transformers import AutoTokenizer, VitsModel, set_seed, AutoModelForCausalLM, AutoTokenizer, pipeline
-
-from nemo.collections.asr.models import ASRModel
-from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
-from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
-
-torch.random.manual_seed(0)
-proc_model_name = "SanctumAI/Phi-3-mini-4k-instruct-GGUF"
-proc_model = AutoModelForCausalLM.from_pretrained(
-    proc_model_name,
-    trust_remote_code=True,
-    low_cpu_mem_usage=True,
-)
-
-proc_model.to("cpu")
-proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)
-
-SAMPLE_RATE = 16000  # Hz
-MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
-
-model = ASRModel.from_pretrained("nvidia/canary-1b")
-model.eval()
-
-# make sure beam size is always 1 for consistency
-model.change_decoding_strategy(None)
-decoding_cfg = model.cfg.decoding
-decoding_cfg.beam.beam_size = 1
-model.change_decoding_strategy(decoding_cfg)
-
-amp_dtype = torch.float16
-
-def convert_audio(audio_filepath, tmpdir, utt_id):
-    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-    duration = librosa.get_duration(y=data, sr=sr)
-    if sr != SAMPLE_RATE:
-        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
-    out_filename = os.path.join(tmpdir, utt_id + '.wav')
-    # save output audio
-    sf.write(out_filename, data, SAMPLE_RATE)
-    return out_filename, duration
-
-def transcribe(audio_filepath):
-    if audio_filepath is None:
-        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
-    utt_id = uuid.uuid4()
-    with tempfile.TemporaryDirectory() as tmpdir:
-        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
-        # make manifest file and save
-        manifest_data = {
-            "audio_filepath": converted_audio_filepath,
-            "source_lang": "en",
-            "target_lang": "en",
-            "taskname": "asr",
-            "pnc": "no",
-            "answer": "predict",
-            "duration": str(duration),
-        }
-        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
-        with open(manifest_filepath, 'w') as fout:
-            line = json.dumps(manifest_data)
-            fout.write(line + '\n')
-        output_text = model.transcribe(manifest_filepath)[0]
-
-    return output_text

-start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}

-def generate_response(user_input):
-    messages = [start, {"role": "user", "content": user_input}]
-    inputs = proc_tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
     with torch.no_grad():
-        outputs = proc_model.generate(
-            inputs,
-            max_new_tokens=48,
-        )
-
-    response = proc_tokenizer.batch_decode(
-        outputs,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False,
-    )[0]
-
-    return response
-
-def CanaryPhi(audio_filepath):
-    user_input = transcribe(audio_filepath)
-    response = generate_response(user_input)
-    return response

 # Create a Gradio interface
 iface = gr.Interface(
-    fn=CanaryPhi,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(),
 )

 # Launch the interface
-iface.launch()
 
+from transformers import AutoTokenizer, VitsModel, set_seed
 import gradio as gr
 import torch
+import soundfile as sf

+# Initialize the VITS model, tokenizer, and seed
+vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
+vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
+set_seed(2020)

+def text_to_speech(text_response):
+    inputs = vits_tokenizer(text=text_response, return_tensors="pt")
     with torch.no_grad():
+        outputs = vits_model(**inputs)
+    waveform = outputs.waveform[0]
+    sf.write('output.wav', waveform.numpy(), vits_model.config.sampling_rate)
+
+    return 'output.wav'

 # Create a Gradio interface
 iface = gr.Interface(
+    fn=text_to_speech,
+    inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
+    outputs=gr.Audio("response.wav")
 )

 # Launch the interface
+iface.launch()
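
Note on the new version: text_to_speech writes and returns 'output.wav', while the output component is declared as gr.Audio("response.wav"). In Gradio the positional argument only sets the component's initial value, so the app should still play the returned file, but the placeholder points at a file this Space never writes. A minimal sketch of a tighter interface follows; it reuses the same function as above, and type="filepath" is standard Gradio, not part of this commit:

    # Sketch only, not part of the commit: declare that the function returns a
    # filepath, and drop the unused "response.wav" placeholder value.
    iface = gr.Interface(
        fn=text_to_speech,
        inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
        outputs=gr.Audio(type="filepath"),  # plays the returned 'output.wav'
    )

    iface.launch()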