Dionyssos committed on
Commit
1544bea
·
1 Parent(s): 77cae30
Files changed (3) hide show
  1. app.py +5 -8
  2. audiocraft.py +11 -5
  3. requirements.txt +1 -1
app.py CHANGED
@@ -38,7 +38,7 @@ language_names = ['Ancient greek',
38
 
39
  def audionar_tts(text=None,
40
  lang='Romanian',
41
- soundscape='',
42
  cache_lim=24):
43
 
44
  # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
@@ -62,9 +62,7 @@ def audionar_tts(text=None,
62
 
63
  x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
64
 
65
- else:
66
-
67
- if lang not in language_names: # StyleTTS2
68
 
69
  text = only_greek_or_only_latin(text, lang='eng')
70
 
@@ -77,7 +75,7 @@ def audionar_tts(text=None,
77
  original_rate=24000,
78
  target_rate=16000)[0, :] # 16 KHz
79
 
80
- else: # VITS
81
 
82
  lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
83
 
@@ -254,7 +252,6 @@ VOICES = [
254
 
255
  _tts = StyleTTS2().to('cpu')
256
 
257
-
258
  with gr.Blocks(theme='huggingface') as demo:
259
  with gr.Row():
260
  text_input = gr.Textbox(
@@ -264,9 +261,9 @@ with gr.Blocks(theme='huggingface') as demo:
264
  value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
265
  )
266
  choice_dropdown = gr.Dropdown(
267
- choices=language_names + VOICES,
268
  label="Vox",
269
- value=language_names[0]
270
  )
271
  soundscape_input = gr.Textbox(
272
  lines=1,
 
38
 
39
  def audionar_tts(text=None,
40
  lang='Romanian',
41
+ soundscape='frogs',
42
  cache_lim=24):
43
 
44
  # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
 
62
 
63
  x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
64
 
65
+ elif lang not in language_names: # text exists / StyleTTS2
 
 
66
 
67
  text = only_greek_or_only_latin(text, lang='eng')
68
 
 
75
  original_rate=24000,
76
  target_rate=16000)[0, :] # 16 KHz
77
 
78
+ else: # VITS
79
 
80
  lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
81
 
 
252
 
253
  _tts = StyleTTS2().to('cpu')
254
 
 
255
  with gr.Blocks(theme='huggingface') as demo:
256
  with gr.Row():
257
  text_input = gr.Textbox(
 
261
  value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
262
  )
263
  choice_dropdown = gr.Dropdown(
264
+ choices=VOICES + language_names,
265
  label="Vox",
266
+ value=VOICES[0]
267
  )
268
  soundscape_input = gr.Textbox(
269
  lines=1,
audiocraft.py CHANGED
@@ -459,10 +459,16 @@ class LMModel(nn.Module):
459
  logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
460
  logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
461
 
462
- #bs, n_q, n_draw, vocab = logits.shape
463
- tokens = torch.multinomial(torch.softmax(logits.view(bs * self.n_draw * n_q, 2048), dim=1),
464
- num_samples=1)
465
- return tokens.view(bs, n_q, self.n_draw).transpose(1, 2)
 
 
 
 
 
 
466
 
467
  @torch.no_grad()
468
  def generate(self,
@@ -718,7 +724,7 @@ class StreamingTransformer(nn.Module):
718
 
719
  if __name__ == '__main__':
720
 
721
- import audiofile
722
  model = AudioGen().to('cpu')
723
  x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
724
  audiofile.write('_sound_.wav', x, 16000)
 
459
  logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
460
  logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
461
 
462
+ k = 24
463
+ logits = torch.softmax(logits / 1.0, dim=3) # [bs, 4, 1, 2048]
464
+ p, ix = torch.topk(logits, k, dim=3) # p = [bs, 4, 1, 24], ix = [bs, 4, 1, 24]
465
+ # Exponential Distribution
466
+ deflation = torch.empty_like(p).exponential_(lambd=1)
467
+ p = p / deflation
468
+ # dividing each top-k prob by independent Exp(1) noise (values in (0, Inf)) and taking the argmax picks index i with probability p[i] / sum(p), i.e. a multinomial draw restricted to the top-k tokens
469
+ p = p.argmax(dim=3, keepdim=True) # [bs, 4, n_draw, 24]
470
+ tok = ix.gather(dim=3, index=p).to(torch.int64) # [bs, 4, n_draw, 1]
471
+ return tok[:, :, :, 0].transpose(1, 2) # [bs, n_draw, 4]
472
 
473
  @torch.no_grad()
474
  def generate(self,
 
724
 
725
  if __name__ == '__main__':
726
 
727
+ import audiofile # pip uninstall flash-attn
728
  model = AudioGen().to('cpu')
729
  x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
730
  audiofile.write('_sound_.wav', x, 16000)
requirements.txt CHANGED
@@ -9,7 +9,7 @@ numpy<2.0.0
9
  gradio==5.27.0
10
  Numbers2Words-Greek
11
  einops
12
- torch
13
  pydantic==2.10.6
14
  transformers==4.49.0
15
  sentencepiece
 
9
  gradio==5.27.0
10
  Numbers2Words-Greek
11
  einops
12
+ torch==2.1.0
13
  pydantic==2.10.6
14
  transformers==4.49.0
15
  sentencepiece