Spaces:

Dionyssos
/

speech-analysis2

Running

Dionyssos committed 18 days ago

Commit

1544bea

1 Parent(s): 77cae30

expo

Files changed (3) hide show

app.py CHANGED Viewed

@@ -38,7 +38,7 @@ language_names = ['Ancient greek',
 def audionar_tts(text=None,
                  lang='Romanian',
-                 soundscape='',
                  cache_lim=24):
     # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
@@ -62,9 +62,7 @@ def audionar_tts(text=None,
         x = np.zeros(4 * 16000, dtype=np.float32)  # If no txt 4s of audiogen
-    else:
-        if lang not in language_names:  # StyleTTS2
             text = only_greek_or_only_latin(text, lang='eng')
@@ -77,7 +75,7 @@ def audionar_tts(text=None,
                                         original_rate=24000,
                                         target_rate=16000)[0, :]   # 16 KHz
-        else:  # VITS
             lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
@@ -254,7 +252,6 @@ VOICES = [
 _tts = StyleTTS2().to('cpu')
 with gr.Blocks(theme='huggingface') as demo:
     with gr.Row():
         text_input = gr.Textbox(
@@ -264,9 +261,9 @@ with gr.Blocks(theme='huggingface') as demo:
             value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
         )
         choice_dropdown = gr.Dropdown(
-            choices=language_names + VOICES,
             label="Vox",
-            value=language_names[0]
         )
         soundscape_input = gr.Textbox(
             lines=1,

 def audionar_tts(text=None,
                  lang='Romanian',
+                 soundscape='frogs',
                  cache_lim=24):
     # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
         x = np.zeros(4 * 16000, dtype=np.float32)  # If no txt 4s of audiogen
+    elif lang not in language_names:  # text exists / StyleTTS2
             text = only_greek_or_only_latin(text, lang='eng')
                                         original_rate=24000,
                                         target_rate=16000)[0, :]   # 16 KHz
+    else:  # VITS
             lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
 _tts = StyleTTS2().to('cpu')
 with gr.Blocks(theme='huggingface') as demo:
     with gr.Row():
         text_input = gr.Textbox(
             value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
         )
         choice_dropdown = gr.Dropdown(
+            choices=VOICES + language_names,
             label="Vox",
+            value=VOICES[0]
         )
         soundscape_input = gr.Textbox(
             lines=1,

audiocraft.py CHANGED Viewed

@@ -459,10 +459,16 @@ class LMModel(nn.Module):
         logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1,      2048]
         logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :]                 # [  bs, 4, n_draw, 2048]
-        #bs, n_q, n_draw, vocab = logits.shape
-        tokens = torch.multinomial(torch.softmax(logits.view(bs * self.n_draw * n_q, 2048), dim=1),
-                                   num_samples=1)
-        return tokens.view(bs, n_q, self.n_draw).transpose(1, 2)
     @torch.no_grad()
     def generate(self,
@@ -718,7 +724,7 @@ class StreamingTransformer(nn.Module):
 if __name__ == '__main__':
-    import audiofile
     model = AudioGen().to('cpu')
     x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)

         logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1,      2048]
         logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :]                 # [  bs, 4, n_draw, 2048]
+        k = 24
+        logits = torch.softmax(logits / 1.0, dim=3)  # [bs, 4, 1, 2048]
+        p, ix = torch.topk(logits, k, dim=3)  # p = [bs, 4, 1, 24], ix = [bs, 4, 1, 24]
+        # Exponential Distribution
+        deflation = torch.empty_like(p).exponential_(lambd=1)
+        p = p / deflation
+        # divide large probs with exp(prob) If prob=.001 then 1/exp(1*.001) -> almost by 0  --> exp doesnt really produce (0, Inf)
+        p = p.argmax(dim=3, keepdim=True)  # [bs, 4, n_draw, 1]
+        tok = ix.gather(dim=3, index=p).to(torch.int64)  # [bs, 4, n_draw, 1]
+        return tok[:, :, :, 0].transpose(1, 2)  # [bs, n_draw, 4]
     @torch.no_grad()
     def generate(self,
 if __name__ == '__main__':
+    import audiofile  # pip uninstall flash-attn
     model = AudioGen().to('cpu')
     x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
     audiofile.write('_sound_.wav', x, 16000)

requirements.txt CHANGED Viewed

@@ -9,7 +9,7 @@ numpy<2.0.0
 gradio==5.27.0
 Numbers2Words-Greek
 einops
-torch
 pydantic==2.10.6
 transformers==4.49.0
 sentencepiece

 gradio==5.27.0
 Numbers2Words-Greek
 einops
+torch==2.1.0
 pydantic==2.10.6
 transformers==4.49.0
 sentencepiece