Shanuka01 committed on
Commit
96b28b8
·
1 Parent(s): f7135c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -44
app.py CHANGED
@@ -8,32 +8,36 @@ from tortoise.api import TextToSpeech
8
  from tortoise.utils.text import split_and_recombine_text
9
  from tortoise.utils.audio import load_audio, load_voice, load_voices
10
 
11
# STT Setup
# NOTE(review): `device` is computed here but never used below — the
# pipeline call hard-codes device=0 (first CUDA device) regardless, so
# this would fail on a CPU-only host; confirm whether a CPU fallback
# was intended.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_id = "openai/whisper-tiny"  # Whisper "tiny" checkpoint for speech-to-text
pipe = pipeline("automatic-speech-recognition", model=model_id, device=0)

# TTS Setup
# Tortoise voice presets offered in the UI dropdown.
VOICE_OPTIONS = ["indian_F_1", "indian_F_2", "indian_F_3", "indian_M_1", "indian_M_2", "indian_M_3"]

# kv cache + DeepSpeed + half precision to speed up Tortoise inference.
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
20
 
21
- def combined_inference(filepath, voice):
22
- # STT: Convert audio to text
23
- output = pipe(
24
  filepath,
25
  max_new_tokens=256,
26
- generate_kwargs={"task": "transcribe", "language": "english"},
27
- chunk_length_s=15,
28
- batch_size=16,
 
 
 
29
  )
30
- text = output["text"]
31
-
32
- # TTS: Convert text back to audio
33
- texts = split_and_recombine_text(text)
34
  voice_samples, conditioning_latents = load_voice(voice)
35
-
36
- for j, text in enumerate(texts):
37
  for audio_frame in tts.tts_with_preset(
38
  text,
39
  voice_samples=voice_samples,
@@ -41,32 +45,21 @@ def combined_inference(filepath, voice):
41
  preset="ultra_fast",
42
  k=1
43
  ):
44
- yield (24000, audio_frame.cpu().detach().numpy())
45
-
46
def main():
    """Build the Gradio UI for the combined STT -> TTS demo and launch it.

    Wires `combined_inference` (a generator) to a streaming audio output,
    so synthesized frames play as they are produced.
    """
    # Voice picker backed by the module-level Tortoise preset list.
    voice_selector = gr.Dropdown(
        VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value"
    )

    demo = gr.Interface(
        fn=combined_inference,
        inputs=[gr.Audio(source="upload", type="filepath"), voice_selector],
        title="Combined STT and TTS",
        description="",
        outputs=[gr.Audio(label="streaming audio:", streaming=True, autoplay=True)],
    )
    # queue() is required so the generator fn can stream frames to the client.
    demo.queue().launch()
65
 
66
- if __name__ == "__main__":
67
- with open("Combined_STT_TTS_Runs_Scripts.log", "a") as f:
68
- f.write(
69
- f"\n\n-------------------------Combined STT TTS Scripts Logs, {datetime.now()}-------------------------\n"
70
- )
 
 
 
 
 
71
 
72
- main()
 
8
  from tortoise.utils.text import split_and_recombine_text
9
  from tortoise.utils.audio import load_audio, load_voice, load_voices
10
 
11
# STT Initialization
model_id = "openai/whisper-tiny"  # Whisper "tiny" checkpoint for speech recognition
# NOTE(review): no `device=` argument here, so the pipeline runs on CPU
# by default even when CUDA is available (the previous version passed
# device=0) — confirm this is intentional.
pipe = pipeline("automatic-speech-recognition", model=model_id)

# TTS Initialization
# Tortoise voice presets offered in the UI dropdown.
VOICE_OPTIONS = [
    "indian_F_1", "indian_F_2", "indian_F_3",
    "indian_M_1", "indian_M_2", "indian_M_3"
]
# kv cache + DeepSpeed + half precision to speed up Tortoise inference.
tts = TextToSpeech(kv_cache=True, use_deepspeed=True, half=True)
21
 
22
+ def convert_audio(filepath, voice="indian_F_1"):
23
+ # Transcribe audio to text using STT
24
+ transcription_output = pipe(
25
  filepath,
26
  max_new_tokens=256,
27
+ generate_kwargs={
28
+ "task": "transcribe",
29
+ "language": "english",
30
+ },
31
+ chunk_length_s=30,
32
+ batch_size=8
33
  )
34
+ transcribed_text = transcription_output["text"]
35
+
36
+ # Use the transcribed text for TTS
37
+ texts = split_and_recombine_text(transcribed_text)
38
  voice_samples, conditioning_latents = load_voice(voice)
39
+ audio_frames = []
40
+ for text in texts:
41
  for audio_frame in tts.tts_with_preset(
42
  text,
43
  voice_samples=voice_samples,
 
45
  preset="ultra_fast",
46
  k=1
47
  ):
48
+ audio_frames.append(audio_frame.cpu().detach().numpy())
 
 
 
 
49
 
50
+ # Joining the audio frames for output
51
+ final_audio = torch.cat(audio_frames, axis=0)
52
+ return (24000, final_audio)
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
# Gradio UI: upload an audio clip, pick a target voice, and play back the
# re-synthesized speech.
# NOTE(review): the output is declared streaming=True, but convert_audio
# returns a single (rate, array) tuple rather than yielding frames, and the
# previous version also called interface.queue() before launch() — confirm
# whether streaming/queueing is still wanted here.
interface = gr.Interface(
    fn=convert_audio,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(VOICE_OPTIONS, value="indian_F_1", label="Select voice:", type="value")
    ],
    outputs=gr.Audio(label="streaming audio:", streaming=True, autoplay=True),
    title="STT to TTS",
    description="Convert spoken words into a different voice"
)

interface.launch()