camanalo1 committed on
Commit e0fe085
1 Parent(s): 6ab814a

Update app.py

Files changed (1)
  1. app.py +5 -11
app.py CHANGED
@@ -1,18 +1,12 @@
  import gradio as gr
- import numpy as np
  from transformers import pipeline, VitsTokenizer, VitsModel, set_seed
+ import numpy as np
  import torch
  import io
  import soundfile as sf
- from nemo.collections.asr.models import EncDecMultiTaskModel
-
- # Load the ASR model
- canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
-
- # Update decoding parameters
- decode_cfg = canary_model.cfg.decoding
- decode_cfg.beam.beam_size = 1
- canary_model.change_decoding_strategy(decode_cfg)
+
+ # Initialize ASR pipeline
+ transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")

  # Initialize LLM pipeline
  generator = pipeline("text-generation", model="microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
@@ -27,10 +21,10 @@ def transcribe_generate_and_speak(audio):
      y /= np.max(np.abs(y))

      # Transcribe audio
-     asr_output = canary_model.transcribe([y], [sr])
+     asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]

      # Generate text based on ASR output
-     generated_text = generator(asr_output[0])[0]['generated_text']
+     generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]['generated_text']

      # Generate audio from text
      inputs = tokenizer(text=generated_text, return_tensors="pt")
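
For context, a minimal standalone sketch of the transformers-only ASR-to-LLM path this commit switches to, using the two pipeline calls exactly as they appear in the updated app.py. The synthetic 16 kHz sine-wave input is a placeholder assumption for a smoke test, and the VITS text-to-speech step (whose setup sits outside these hunks) is omitted.

# Sketch only, not part of the commit: smoke-test the new ASR -> LLM pipelines.
import numpy as np
from transformers import pipeline

# Same models as in the updated app.py
transcriber = pipeline("automatic-speech-recognition", model="facebook/s2t-small-librispeech-asr")
generator = pipeline("text-generation", model="microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)

# Placeholder audio: one second of a 440 Hz tone at 16 kHz (an assumption; any mono float32 array works)
sr = 16000
y = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
y /= np.max(np.abs(y))

# Transcribe, then feed the transcript to the text-generation pipeline
asr_output = transcriber({"sampling_rate": sr, "raw": y})["text"]
generated_text = generator(asr_output, max_length=100, num_return_sequences=1)[0]["generated_text"]
print(generated_text)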