Plachta committed on
Commit
f330917
1 Parent(s): b97852e
Files changed (5) hide show
  1. app.py +2 -2
  2. descriptions.py +7 -1
  3. models/vallex.py +12 -0
  4. requirements.txt +1 -0
  5. utils/generation.py +2 -1
app.py CHANGED
@@ -44,8 +44,8 @@ text_tokenizer = PhonemeBpeTokenizer(tokenizer_path="./utils/g2p/bpe_69.json")
44
  text_collater = get_text_token_collater()
45
 
46
  device = torch.device("cpu")
47
- if torch.cuda.is_available():
48
- device = torch.device("cuda", 0)
49
 
50
  # VALL-E-X model
51
  model = VALLE(
 
44
  text_collater = get_text_token_collater()
45
 
46
  device = torch.device("cpu")
47
+ # if torch.cuda.is_available():
48
+ # device = torch.device("cuda", 0)
49
 
50
  # VALL-E-X model
51
  model = VALLE(
descriptions.py CHANGED
@@ -1,6 +1,12 @@
1
  top_md = """
2
  # VALL-E X
3
- [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing)
 
 
 
 
 
 
4
  VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
5
  an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
6
  This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
 
1
  top_md = """
2
  # VALL-E X
3
+ <a href="https://huggingface.co/spaces/facebook/MusicGen?duplicate=true"
4
+ style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
5
+ <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
6
+ src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> or <a href="https://colab.research.google.com/drive/1yyD_sz531QntLKowMHo-XxorsFBCfKul?usp=sharing"
7
+ style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
8
+ <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;"
9
+ src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"></a>to skip the queue.</p>
10
  VALL-E X can synthesize high-quality personalized speech with only a 3-second enrolled recording of
11
  an unseen speaker as an acoustic prompt, even in another language for a monolingual speaker.<br>
12
  This implementation supports zero-shot, mono-lingual/cross-lingual text-to-speech functionality of three languages (English, Chinese, Japanese)<br>
models/vallex.py CHANGED
@@ -33,6 +33,15 @@ from modules.transformer import (
33
 
34
  from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
35
 
 
 
 
 
 
 
 
 
 
36
 
37
  class Transpose(nn.Identity):
38
  """(N, T, D) -> (N, D, T)"""
@@ -572,6 +581,9 @@ class VALLE(VALLF):
572
  )
573
 
574
  print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
 
 
 
575
  break
576
 
577
  y = torch.concat([y, samples], dim=1)
 
33
 
34
  from .macros import NUM_AUDIO_TOKENS, NUM_TEXT_TOKENS
35
 
36
+ import psutil
37
+ def get_memory_usage():
38
+ process = psutil.Process()
39
+ memory_info = process.memory_info()
40
+
41
+ memory_used = memory_info.rss
42
+ memory_used_mb = memory_used / (1024 * 1024)
43
+
44
+ return memory_used_mb
45
 
46
  class Transpose(nn.Identity):
47
  """(N, T, D) -> (N, D, T)"""
 
581
  )
582
 
583
  print(f"VALL-E EOS [{prompts.shape[1]} -> {y.shape[1]}]")
584
+
585
+ memory_used = get_memory_usage()
586
+ print(f"Current memory used: {memory_used:.2f} MB")
587
  break
588
 
589
  y = torch.concat([y, samples], dim=1)
requirements.txt CHANGED
@@ -18,4 +18,5 @@ nltk
18
  openai-whisper
19
  phonemizer
20
  matplotlib
 
21
  gradio
 
18
  openai-whisper
19
  phonemizer
20
  matplotlib
21
+ psutil
22
  gradio
utils/generation.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import torch
3
  import gdown
4
  import logging
 
5
  import langid
6
  langid.set_languages(['en', 'zh', 'ja'])
7
 
@@ -253,4 +254,4 @@ def generate_audio_from_long_text(text, prompt=None, language='auto', accent='no
253
  )
254
  return samples[0][0].cpu().numpy()
255
  else:
256
- raise ValueError(f"No such mode {mode}")
 
2
  import torch
3
  import gdown
4
  import logging
5
+ import psutil
6
  import langid
7
  langid.set_languages(['en', 'zh', 'ja'])
8
 
 
254
  )
255
  return samples[0][0].cpu().numpy()
256
  else:
257
+ raise ValueError(f"No such mode {mode}")