Shanuka01 committed
Commit f7135c5 · 1 Parent(s): f3b2f43

Update app.py

Files changed (1)
  1. app.py +30 -36
app.py CHANGED
@@ -1,33 +1,37 @@
-import os
 import torch
 import gradio as gr
 import torchaudio
 import time
 from datetime import datetime
+from transformers import pipeline
 from tortoise.api import TextToSpeech
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.audio import load_audio, load_voice, load_voices

-VOICE_OPTIONS = [
-    "kasuri",
-    "shanuka",
-    "indian_f_1",
-    "kushan",
-    "afshak",
-]
+# STT Setup
+device = 'cuda' if torch.cuda.is_available() else 'cpu'

-def inference(
-    text,
-    voice
-):
-    if text is None or text.strip() == "":
-        raise gr.Error("Please provide text.")
+model_id = "openai/whisper-tiny"
+pipe = pipeline("automatic-speech-recognition", model=model_id, device=0)

-    texts = split_and_recombine_text(text)
+# TTS Setup
+VOICE_OPTIONS = ["indian_F_1", "indian_F_2", "indian_F_3", "indian_M_1", "indian_M_2", "indian_M_3"]
+tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)

-    voice_samples, conditioning_latents = load_voice(voice)
+def combined_inference(filepath, voice):
+    # STT: Convert audio to text
+    output = pipe(
+        filepath,
+        max_new_tokens=256,
+        generate_kwargs={"task": "transcribe", "language": "english"},
+        chunk_length_s=15,
+        batch_size=16,
+    )
+    text = output["text"]

-    start_time = time.time()
+    # TTS: Convert text back to audio
+    texts = split_and_recombine_text(text)
+    voice_samples, conditioning_latents = load_voice(voice)

     for j, text in enumerate(texts):
         for audio_frame in tts.tts_with_preset(
@@ -40,39 +44,29 @@ def inference(
             yield (24000, audio_frame.cpu().detach().numpy())

 def main():
-    title = "Tortoise TTS"
-    description = """
-
-    """
-    text = gr.Textbox(
-        lines=4,
-        label="Text:",
-    )
+    title = "Combined STT and TTS"
+    description = ""

     voice = gr.Dropdown(
-        VOICE_OPTIONS, value="kasuri", label="Select voice:", type="value"
+        VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value"
     )

-    output_audio = gr.Audio(label="streaming audio:", streaming=True, autoplay=True)
-
     interface = gr.Interface(
-        fn=inference,
+        fn=combined_inference,
         inputs=[
-            text,
+            gr.Audio(source="upload", type="filepath"),
             voice
         ],
         title=title,
         description=description,
-        outputs=[output_audio],
+        outputs=[gr.Audio(label="streaming audio:", streaming=True, autoplay=True)],
     )
     interface.queue().launch()

 if __name__ == "__main__":
-    tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
-
-    with open("Tortoise_TTS_Runs_Scripts.log", "a") as f:
+    with open("Combined_STT_TTS_Runs_Scripts.log", "a") as f:
         f.write(
-            f"\n\n-------------------------Tortoise TTS Scripts Logs, {datetime.now()}-------------------------\n"
+            f"\n\n-------------------------Combined STT TTS Scripts Logs, {datetime.now()}-------------------------\n"
         )

-    main()
+    main()
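Review note on the new STT setup: `device` is computed from `torch.cuda.is_available()` but never used, because the pipeline is pinned to GPU 0 via `device=0`; on CPU-only hardware the Space would crash at startup. A minimal sketch of wiring the computed value through, assuming a transformers version whose `pipeline()` accepts a device string (recent releases do; older ones need an index, 0 for GPU or -1 for CPU):

import torch
from transformers import pipeline

# Reuse the computed device instead of hard-coding GPU 0, so the app
# falls back to CPU when no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny",
    device=device,
)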
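The whole STT-to-TTS round trip can also be sanity-checked without launching the Gradio UI by draining the generator directly. A sketch under stated assumptions: it runs in the app's process where `pipe` and `tts` are already initialized, `sample.wav` is a stand-in input path, and each yielded frame flattens to a 1-D float array at 24 kHz (none of these names are part of the commit):

import numpy as np
import torch
import torchaudio

# Drain the generator: each item is (sample_rate, numpy_frame).
frames = [frame for _, frame in combined_inference("sample.wav", "indian_F_1")]
# Flatten and concatenate; cast to float32 since half=True may yield float16.
wav = np.concatenate([f.reshape(-1) for f in frames]).astype(np.float32)
# torchaudio.save expects a (channels, samples) tensor.
torchaudio.save("roundtrip.wav", torch.from_numpy(wav).unsqueeze(0), 24000)
print(f"wrote {wav.shape[0] / 24000:.1f}s of audio")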
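One compatibility note: `gr.Audio(source="upload", type="filepath")` is the Gradio 3.x signature; Gradio 4.x renamed the parameter to a list-valued `sources`. If the Space's `gradio` pin is ever bumped, the input would become something like:

gr.Audio(sources=["upload"], type="filepath")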