Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -8,32 +8,36 @@ from tortoise.api import TextToSpeech
|
|
8 |
from tortoise.utils.text import split_and_recombine_text
|
9 |
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
10 |
|
11 |
-
# STT
|
12 |
-
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
13 |
-
|
14 |
model_id = "openai/whisper-tiny"
|
15 |
-
pipe = pipeline("automatic-speech-recognition", model=model_id
|
16 |
|
17 |
-
# TTS
|
18 |
-
VOICE_OPTIONS = [
|
|
|
|
|
|
|
19 |
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
|
20 |
|
21 |
-
def
|
22 |
-
#
|
23 |
-
|
24 |
filepath,
|
25 |
max_new_tokens=256,
|
26 |
-
generate_kwargs={
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
29 |
)
|
30 |
-
|
31 |
-
|
32 |
-
#
|
33 |
-
texts = split_and_recombine_text(
|
34 |
voice_samples, conditioning_latents = load_voice(voice)
|
35 |
-
|
36 |
-
for
|
37 |
for audio_frame in tts.tts_with_preset(
|
38 |
text,
|
39 |
voice_samples=voice_samples,
|
@@ -41,32 +45,21 @@ def combined_inference(filepath, voice):
|
|
41 |
preset="ultra_fast",
|
42 |
k=1
|
43 |
):
|
44 |
-
|
45 |
-
|
46 |
-
def main():
|
47 |
-
title = "Combined STT and TTS"
|
48 |
-
description = ""
|
49 |
|
50 |
-
|
51 |
-
|
52 |
-
)
|
53 |
-
|
54 |
-
interface = gr.Interface(
|
55 |
-
fn=combined_inference,
|
56 |
-
inputs=[
|
57 |
-
gr.Audio(source="upload", type="filepath"),
|
58 |
-
voice
|
59 |
-
],
|
60 |
-
title=title,
|
61 |
-
description=description,
|
62 |
-
outputs=[gr.Audio(label="streaming audio:", streaming=True, autoplay=True)],
|
63 |
-
)
|
64 |
-
interface.queue().launch()
|
65 |
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
)
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
-
|
|
|
8 |
from tortoise.utils.text import split_and_recombine_text
|
9 |
from tortoise.utils.audio import load_audio, load_voice, load_voices
|
10 |
|
11 |
+
# STT Initialization
model_id = "openai/whisper-tiny"
pipe = pipeline("automatic-speech-recognition", model=model_id)

# TTS Initialization: voice names follow the "indian_<gender>_<index>"
# convention — presumably matching bundled Tortoise voice folders (verify).
VOICE_OPTIONS = [
    f"indian_{gender}_{idx}" for gender in ("F", "M") for idx in (1, 2, 3)
]
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
|
21 |
|
22 |
+
def convert_audio(filepath, voice="indian_F_1"):
    """Transcribe the audio file at *filepath* with Whisper, then re-speak
    the transcription in *voice* using Tortoise TTS.

    Parameters:
        filepath: path to the uploaded audio file (gr.Audio type="filepath").
        voice: one of VOICE_OPTIONS, passed to load_voice().

    Returns:
        (sample_rate, waveform) tuple for gr.Audio — 24000 Hz, presumably
        Tortoise's native output rate (confirm against tortoise docs).

    Raises:
        gr.Error: if no audio frames were synthesized.
    """
    # Transcribe audio to text using STT
    transcription_output = pipe(
        filepath,
        max_new_tokens=256,
        generate_kwargs={
            "task": "transcribe",
            "language": "english",
        },
        chunk_length_s=30,
        batch_size=8,
    )
    transcribed_text = transcription_output["text"]

    # Use the transcribed text for TTS
    texts = split_and_recombine_text(transcribed_text)
    voice_samples, conditioning_latents = load_voice(voice)
    audio_frames = []
    for text in texts:
        for audio_frame in tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            # Pass the latents returned by load_voice alongside the samples;
            # they were loaded above and are otherwise unused.
            conditioning_latents=conditioning_latents,
            preset="ultra_fast",
            k=1,
        ):
            # BUG FIX: keep frames as CPU tensors here. The previous code
            # appended .numpy() arrays and then called torch.cat on them,
            # which raises TypeError (torch.cat requires tensors).
            audio_frames.append(audio_frame.cpu().detach())

    if not audio_frames:
        # Empty transcription / no synthesized audio — surface a clear UI error
        # instead of letting torch.cat fail on an empty list.
        raise gr.Error("No speech was synthesized from the input audio.")

    # Join the audio frames into one waveform, converting to numpy only once
    # at the end for the gr.Audio output.
    final_audio = torch.cat(audio_frames, dim=0).numpy()
    return (24000, final_audio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
+
# Gradio UI: upload an audio clip, pick a target voice, hear it re-spoken.
audio_input = gr.Audio(source="upload", type="filepath")
voice_input = gr.Dropdown(
    VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value"
)
audio_output = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)

interface = gr.Interface(
    fn=convert_audio,
    inputs=[audio_input, voice_input],
    outputs=audio_output,
    title="STT to TTS",
    description="Convert spoken words into a different voice",
)

interface.launch()
|