osanseviero HF staff committed on
Commit
1a8cc73
1 Parent(s): ec81f5a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -14
app.py CHANGED
@@ -12,6 +12,7 @@ from tortoise_tts.api import TextToSpeech
12
  from tortoise_tts.utils.audio import load_audio, get_voices
13
  import torch
14
  import torchaudio
 
15
  import gradio as gr
16
 
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -50,8 +51,47 @@ def inference(text, voice):
50
  print("gen")
51
  torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
52
  return "generated.wav"
 
 
 
 
 
 
 
 
 
53
 
 
 
 
 
 
 
 
 
 
 
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
57
  examples = [
@@ -60,18 +100,33 @@ examples = [
60
  ["how are you doing this day", "freeman"]
61
  ]
62
 
63
- iface = gr.Interface(
64
- inference,
65
- inputs=[
66
- gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
67
- gr.inputs.Dropdown(voices),
68
- ],
69
- outputs="audio",
70
- title="TorToiSe",
71
- description="A multi-voice TTS system trained with an emphasis on quality",
72
- article="This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",
73
- enable_queue=True,
74
- examples=examples,
75
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
- iface.launch(cache_examples=True)
 
12
  from tortoise_tts.utils.audio import load_audio, get_voices
13
  import torch
14
  import torchaudio
15
+ import numpy as np
16
  import gradio as gr
17
 
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
51
  print("gen")
52
  torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
53
  return "generated.wav"
54
+
55
def load_audio_special(sr, data, sampling_rate=22050):
    """Convert a gradio microphone recording into a TorToiSe conditioning clip.

    Args:
        sr: sample rate of the recording, as reported by gradio.
        data: raw numpy audio samples (int16/int32 PCM or float16/float32).
        sampling_rate: target rate the model expects; defaults to 22050
            (NOTE(review): presumably the tortoise conditioning rate — confirm).

    Returns:
        A float32 torch tensor of shape (1, num_samples), mono, clipped to [-1, 1].

    Raises:
        ValueError: if the incoming dtype is not a supported PCM/float format.
    """
    # Normalize integer PCM to [-1, 1]; floats are assumed already normalized.
    if data.dtype == np.int32:
        norm_fix = 2 ** 31
    elif data.dtype == np.int16:
        norm_fix = 2 ** 15
    elif data.dtype == np.float16 or data.dtype == np.float32:
        norm_fix = 1.
    else:
        # Previously fell through with norm_fix unbound; fail loudly instead.
        raise ValueError(f"Unsupported audio dtype: {data.dtype}")
    # Fixed: original line had a stray ", sampling_rate)" causing a SyntaxError.
    audio = torch.FloatTensor(data.astype(np.float32)) / norm_fix

    # Remove any channel data: keep only the first channel of multi-channel audio.
    # "< 5" heuristically distinguishes channels-first from samples-first layout.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        else:
            assert audio.shape[1] < 5
            audio = audio[:, 0]

    # Resample to the model's expected rate when the microphone rate differs.
    if sr != sampling_rate:
        audio = torchaudio.functional.resample(audio, sr, sampling_rate)

    # Check some assumptions about audio range. This should be automatically
    # fixed upstream, but might not be in some edge cases, where we should squawk.
    # '2' is arbitrarily chosen since audio will often "overdrive" the [-1,1] bounds.
    if torch.any(audio > 2) or not torch.any(audio < 0):
        # Fixed: original referenced undefined "audiopath" (NameError on trigger).
        print(f"Error with audio data. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)
    return audio.unsqueeze(0)
81
+
82
def inference_own_voice(text, voice_1, voice_2, voice_3):
    """Synthesize `text` in a voice cloned from three microphone recordings.

    Args:
        text: prompt to speak; truncated to 256 characters to bound generation time.
        voice_1, voice_2, voice_3: gradio `type="numpy"` audio inputs, each a
            (sample_rate, numpy data) tuple.

    Returns:
        Path to the generated wav file.

    NOTE(review): relies on module-level `tts`, `preset` and `torchaudio`.
    """
    text = text[:256]
    print(voice_1)
    # load_audio_special takes (sr, data); unpack each gradio tuple.
    # Fixed: original passed the tuple as a single argument and referenced
    # the undefined name `voice_1_3` (NameError) for the third clip.
    conds = [
        load_audio_special(voice_1[0], voice_1[1]),
        load_audio_special(voice_2[0], voice_2[1]),
        load_audio_special(voice_3[0], voice_3[1]),
    ]
    print(text, conds, preset)
    gen = tts.tts_with_preset(text, conds, preset)
    print("gen")
    # Model output is written at 24 kHz, matching the pre-recorded-voice path.
    torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
    return "generated.wav"
95
 
96
  text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
97
  examples = [
 
100
  ["how are you doing this day", "freeman"]
101
  ]
102
 
103
# Demo UI: a two-tab gradio Blocks app sharing the same TTS pipeline —
# one tab for the bundled voices, one for cloning the user's own voice.
block = gr.Blocks()
with block:
    gr.Markdown("# TorToiSe")
    gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
    with gr.Tabs():
        with gr.TabItem("Pre-recorded voices"):
            # Synthesize with one of the voices shipped with the model.
            iface = gr.Interface(
                inference,
                inputs=[
                    gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                    gr.inputs.Dropdown(voices),
                ],
                outputs="audio",
                examples=examples,
            )
        with gr.TabItem("Record your voice"):
            # Clone a voice from three microphone clips; type="numpy" delivers
            # each recording as a (sample_rate, data) tuple to the handler.
            iface = gr.Interface(
                inference_own_voice,
                inputs=[
                    gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
                    gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 1)", type="numpy"),
                    gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 2)", type="numpy"),
                    gr.inputs.Audio(source="microphone", label="Record yourself reading something out loud (audio 3)", type="numpy"),
                ],
                outputs="audio"
            )

    gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",)

block.launch()