Shanuka01 committed on
Commit
468bbaf
·
1 Parent(s): 82874d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -5
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import torchaudio
4
  import time
5
  from datetime import datetime
 
6
  from transformers import pipeline
7
  from tortoise.api import TextToSpeech
8
  from tortoise.utils.text import split_and_recombine_text
@@ -14,7 +15,7 @@ pipe = pipeline("automatic-speech-recognition", model=model_id)
14
 
15
  # TTS Initialization
16
  VOICE_OPTIONS = [
17
- "indian_f_1", "indian_F_2", "indian_F_3",
18
  "indian_M_1", "indian_M_2", "indian_M_3"
19
  ]
20
  tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
@@ -47,19 +48,19 @@ def convert_audio(filepath, voice="indian_F_1"):
47
  ):
48
  audio_frames.append(audio_frame.cpu().detach().numpy())
49
 
50
- # Joining the audio frames for output
51
- final_audio = torch.cat(audio_frames, axis=0)
52
  return (24000, final_audio)
53
 
54
  interface = gr.Interface(
55
  fn=convert_audio,
56
  inputs=[
57
  gr.Audio(source="upload", type="filepath"),
58
- gr.Dropdown(VOICE_OPTIONS, value="indian_f_1", label="Select voice:", type="value")
59
  ],
60
  outputs=gr.Audio(label="streaming audio:", streaming=True, autoplay=True),
61
  title="STT to TTS",
62
  description="Convert spoken words into a different voice"
63
  )
64
 
65
- interface.launch()
 
3
  import torchaudio
4
  import time
5
  from datetime import datetime
6
+ import numpy as np # Add this import for handling numpy arrays
7
  from transformers import pipeline
8
  from tortoise.api import TextToSpeech
9
  from tortoise.utils.text import split_and_recombine_text
 
15
 
16
  # TTS Initialization
17
  VOICE_OPTIONS = [
18
+ "indian_F_1", "indian_F_2", "indian_F_3",
19
  "indian_M_1", "indian_M_2", "indian_M_3"
20
  ]
21
  tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
 
48
  ):
49
  audio_frames.append(audio_frame.cpu().detach().numpy())
50
 
51
+ # Joining the audio frames for output using numpy's concatenate
52
+ final_audio = np.concatenate(audio_frames, axis=0)
53
  return (24000, final_audio)
54
 
55
  interface = gr.Interface(
56
  fn=convert_audio,
57
  inputs=[
58
  gr.Audio(source="upload", type="filepath"),
59
+ gr.Dropdown(VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value")
60
  ],
61
  outputs=gr.Audio(label="streaming audio:", streaming=True, autoplay=True),
62
  title="STT to TTS",
63
  description="Convert spoken words into a different voice"
64
  )
65
 
66
+ interface.launch()