sanchit-gandhi committed on
Commit
efcdb1c
·
1 Parent(s): ab3a30c

for parler

Browse files
Files changed (2) hide show
  1. app.py +16 -13
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from queue import Queue
2
  from threading import Thread
3
  from typing import Optional
@@ -11,12 +12,14 @@ from parler_tts import ParlerTTSForConditionalGeneration
11
  from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
12
  from transformers.generation.streamers import BaseStreamer
13
 
14
- device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
15
  torch_dtype = torch.float16 if device != "cpu" else torch.float32
16
 
17
  repo_id = "parler-tts/parler_tts_mini_v0.1"
18
 
19
- model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device)
 
 
20
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
21
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
22
 
@@ -83,7 +86,7 @@ class ParlerTTSStreamer(BaseStreamer):
83
  if stride is not None:
84
  self.stride = stride
85
  else:
86
- hop_length = np.prod(self.audio_encoder.config.upsampling_ratios)
87
  self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
88
  self.token_cache = None
89
  self.to_yield = 0
@@ -95,19 +98,18 @@ class ParlerTTSStreamer(BaseStreamer):
95
 
96
  def apply_delay_pattern_mask(self, input_ids):
97
  # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
98
- _, decoder_delay_pattern_mask = self.decoder.build_delay_pattern_mask(
99
  input_ids[:, :1],
 
100
  pad_token_id=self.generation_config.decoder_start_token_id,
101
  max_length=input_ids.shape[-1],
102
  )
103
  # apply the pattern mask to the input ids
104
- input_ids = self.decoder.apply_delay_pattern_mask(input_ids, decoder_delay_pattern_mask)
105
 
106
  # revert the pattern delay mask by filtering the pad token id
107
- input_ids = input_ids[input_ids != self.generation_config.pad_token_id].reshape(
108
- 1, self.decoder.num_codebooks, -1
109
- )
110
-
111
  # append the frame dimension back to the audio codes
112
  input_ids = input_ids[None, ...]
113
 
@@ -169,7 +171,7 @@ target_dtype = np.int16
169
  max_range = np.iinfo(target_dtype).max
170
 
171
  @spaces.GPU
172
- def gen_tts(text, description, play_steps_in_s=2.0):
173
  play_steps = int(frame_rate * play_steps_in_s)
174
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
175
 
@@ -182,6 +184,7 @@ def gen_tts(text, description, play_steps_in_s=2.0):
182
  streamer=streamer,
183
  do_sample=True,
184
  temperature=1.0,
 
185
  )
186
 
187
  set_seed(SEED)
@@ -267,12 +270,12 @@ with gr.Blocks(css=css) as block:
267
  description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
268
  run_button = gr.Button("Generate Audio", variant="primary")
269
  with gr.Column():
270
- audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
271
 
272
  inputs = [input_text, description]
273
  outputs = [audio_out]
274
- gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
275
- run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
276
  gr.HTML(
277
  """
278
  <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
 
1
+ import math
2
  from queue import Queue
3
  from threading import Thread
4
  from typing import Optional
 
12
  from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
13
  from transformers.generation.streamers import BaseStreamer
14
 
15
+ device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
16
  torch_dtype = torch.float16 if device != "cpu" else torch.float32
17
 
18
  repo_id = "parler-tts/parler_tts_mini_v0.1"
19
 
20
+ model = ParlerTTSForConditionalGeneration.from_pretrained(
21
+ repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
22
+ ).to(device)
23
  tokenizer = AutoTokenizer.from_pretrained(repo_id)
24
  feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
25
 
 
86
  if stride is not None:
87
  self.stride = stride
88
  else:
89
+ hop_length = math.floor(self.audio_encoder.config.sampling_rate / self.audio_encoder.config.frame_rate)
90
  self.stride = hop_length * (play_steps - self.decoder.num_codebooks) // 6
91
  self.token_cache = None
92
  self.to_yield = 0
 
98
 
99
  def apply_delay_pattern_mask(self, input_ids):
100
  # build the delay pattern mask for offsetting each codebook prediction by 1 (this behaviour is specific to MusicGen)
101
+ _, delay_pattern_mask = self.decoder.build_delay_pattern_mask(
102
  input_ids[:, :1],
103
+ bos_token_id=self.generation_config.bos_token_id,
104
  pad_token_id=self.generation_config.decoder_start_token_id,
105
  max_length=input_ids.shape[-1],
106
  )
107
  # apply the pattern mask to the input ids
108
+ input_ids = self.decoder.apply_delay_pattern_mask(input_ids, delay_pattern_mask)
109
 
110
  # revert the pattern delay mask by filtering the pad token id
111
+ mask = (delay_pattern_mask != self.generation_config.bos_token_id) & (delay_pattern_mask != self.generation_config.pad_token_id)
112
+ input_ids = input_ids[mask].reshape(1, self.decoder.num_codebooks, -1)
 
 
113
  # append the frame dimension back to the audio codes
114
  input_ids = input_ids[None, ...]
115
 
 
171
  max_range = np.iinfo(target_dtype).max
172
 
173
  @spaces.GPU
174
+ def generate_tts(text, description, play_steps_in_s=2.0):
175
  play_steps = int(frame_rate * play_steps_in_s)
176
  streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
177
 
 
184
  streamer=streamer,
185
  do_sample=True,
186
  temperature=1.0,
187
+ min_new_tokens=10,
188
  )
189
 
190
  set_seed(SEED)
 
270
  description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
271
  run_button = gr.Button("Generate Audio", variant="primary")
272
  with gr.Column():
273
+ audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out", streaming=True, autoplay=True)
274
 
275
  inputs = [input_text, description]
276
  outputs = [audio_out]
277
+ gr.Examples(examples=examples, fn=generate_tts, inputs=inputs, outputs=outputs, cache_examples=False)
278
+ run_button.click(fn=generate_tts, inputs=inputs, outputs=outputs, queue=True)
279
  gr.HTML(
280
  """
281
  <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- git+https://github.com/huggingface/parler-tts.git
 
 
1
+ git+https://github.com/huggingface/parler-tts.git
2
+ accelerate