camanalo1 committed on
Commit
4ba3d10
1 Parent(s): 581c54f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -6
app.py CHANGED
@@ -1,15 +1,20 @@
1
  import gradio as gr
2
- from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
3
  import numpy as np
4
  import torch
5
  import io
6
  import soundfile as sf
 
7
 
8
- # Initialize ASR pipeline
9
- transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")
 
 
 
 
 
10
 
11
  # Initialize LLM pipeline
12
- generator = pipeline("text-generation", model="microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
13
 
14
  # Initialize TTS tokenizer and model
15
  tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
@@ -21,10 +26,10 @@ def transcribe_generate_and_speak(audio):
21
  y /= np.max(np.abs(y))
22
 
23
  # Transcribe audio
24
- asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
25
 
26
  # Generate text based on ASR output
27
- generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]['generated_text']
28
 
29
  # Generate audio from text
30
  inputs = tokenizer(text=generated_text, return_tensors="pt")
 
1
import gradio as gr
import io

import numpy as np
import soundfile as sf
import torch
# BUG FIX: this commit deleted the transformers import while the code below
# still calls pipeline() and VitsTokenizer/VitsModel — restore it to avoid a
# NameError at import time.
from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
from nemo.collections.asr.models import EncDecMultiTaskModel

# Load the ASR model (NVIDIA Canary multi-task speech model).
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Update decoding parameters: beam size 1 == greedy decoding.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)

# Initialize LLM pipeline
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)

# Initialize TTS tokenizer and model
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
 
26
  y /= np.max(np.abs(y))
27
 
28
  # Transcribe audio
29
+ asr_output = canary_model.transcribe([y], [sr])
30
 
31
  # Generate text based on ASR output
32
+ generated_text = generator(asr_output[0])[0]['generated_text']
33
 
34
  # Generate audio from text
35
  inputs = tokenizer(text=generated_text, return_tensors="pt")