sanchit-gandhi HF staff commited on
Commit
dc780c5
1 Parent(s): aa4ea6e
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +42 -5
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Parler Tts Mini
3
  emoji: 🐠
4
  colorFrom: green
5
  colorTo: pink
 
1
  ---
2
+ title: Parler-TTS Mini
3
  emoji: 🐠
4
  colorFrom: green
5
  colorTo: pink
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import torch
3
 
@@ -18,9 +19,6 @@ SAMPLE_RATE = feature_extractor.sampling_rate
18
  SEED = 41
19
 
20
  default_text = "Please surprise me and speak in whatever voice you enjoy."
21
-
22
- title = "# Parler-TTS </div>"
23
-
24
  examples = [
25
  [
26
  "'This is the best time of my life, Bartley,' she said happily.",
@@ -37,6 +35,7 @@ examples = [
37
  ]
38
 
39
 
 
40
  def gen_tts(text, description):
41
  inputs = tokenizer(description, return_tensors="pt").to(device)
42
  prompt = tokenizer(text, return_tensors="pt").to(device)
@@ -47,7 +46,7 @@ def gen_tts(text, description):
47
  )
48
  audio_arr = generation.cpu().numpy().squeeze()
49
 
50
- return (SAMPLE_RATE, audio_arr)
51
 
52
 
53
  css = """
@@ -87,7 +86,44 @@ css = """
87
  }
88
  """
89
  with gr.Blocks(css=css) as block:
90
- gr.Markdown(title)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  with gr.Row():
92
  with gr.Column():
93
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
@@ -100,6 +136,7 @@ with gr.Blocks(css=css) as block:
100
  outputs = [audio_out]
101
  gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
102
  run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
 
103
 
104
  block.queue()
105
  block.launch(share=True)
 
1
+ import spaces
2
  import gradio as gr
3
  import torch
4
 
 
19
  SEED = 41
20
 
21
  default_text = "Please surprise me and speak in whatever voice you enjoy."
 
 
 
22
  examples = [
23
  [
24
  "'This is the best time of my life, Bartley,' she said happily.",
 
35
  ]
36
 
37
 
38
+ @spaces.GPU
39
  def gen_tts(text, description):
40
  inputs = tokenizer(description, return_tensors="pt").to(device)
41
  prompt = tokenizer(text, return_tensors="pt").to(device)
 
46
  )
47
  audio_arr = generation.cpu().numpy().squeeze()
48
 
49
+ return SAMPLE_RATE, audio_arr
50
 
51
 
52
  css = """
 
86
  }
87
  """
88
  with gr.Blocks(css=css) as block:
89
+ gr.HTML(
90
+ """
91
+ <div style="text-align: center; max-width: 700px; margin: 0 auto;">
92
+ <div
93
+ style="
94
+ display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
95
+ "
96
+ >
97
+ <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
98
+ Parler-TTS 🗣️
99
+ </h1>
100
+ </div>
101
+ </div>
102
+ """
103
+ )
104
+ gr.HTML(
105
+ f"""
106
+ <p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
107
+ high-fidelity text-to-speech (TTS) models. The model demonstrated here, <a href="https://huggingface.co/parler-tts/parler_tts_300M_v0.1"> Parler-TTS Mini v0.1</a>,
108
+ is the first iteration model trained using 10k hours of narrated audiobooks. It generates high-quality speech
109
+ with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
110
+
111
+ <p>Tips for ensuring good generation:
112
+ <ul>
113
+ <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
114
+ <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
115
+ <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
116
+ </ul>
117
+ </p>
118
+
119
+ <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
120
+ The v1 release of the model will be trained on this data, and will include inference optimisations, such as flash attention
121
+ and torch compile, that will improve the latency by 2-4x.</p>
122
+
123
+ <p>If you want to find out more about how this model was trained and even fine-tune it yourself, check out the
124
+ <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub.</p>
125
+ """
126
+ )
127
  with gr.Row():
128
  with gr.Column():
129
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
 
136
  outputs = [audio_out]
137
  gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
138
  run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
139
+ gr.HTML("The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.")
140
 
141
  block.queue()
142
  block.launch(share=True)