mrq committed
Commit 54036fd · 1 Parent(s): 92a05d3
models/{.template.yaml → .template.dlas.yaml} RENAMED
File without changes
models/.template.valle.yaml ADDED
@@ -0,0 +1,9 @@
+ data_dirs: [./training/${voice}/valle/]
+ spkr_name_getter: "lambda p: p.parts[-3]"
+
+ model: ${model_name}
+ batch_size: ${batch_size}
+ eval_batch_size: ${validation_batch_size}
+ eval_every: ${validation_rate}
+
+ sampling_temperature: 1.0
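For reference, `spkr_name_getter` is how the VALL-E data loader recovers the voice name from each file path under `data_dirs`. A minimal sketch of what that lambda evaluates to, assuming the ./training/${voice}/valle/ layout the template points at (the voice and clip names below are hypothetical):

from pathlib import Path

# same expression as the template's spkr_name_getter
spkr_name_getter = lambda p: p.parts[-3]

clip = Path("./training/myvoice/valle/0001.qnt.pt")  # hypothetical voice and clip name
print(spkr_name_getter(clip))  # -> "myvoice", the third-from-last path component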
src/utils.py CHANGED
@@ -20,6 +20,8 @@ import subprocess
 import psutil
 import yaml
 import hashlib
+ import io
+ import gzip
 
 import tqdm
 import torch

@@ -45,6 +47,7 @@ WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
+ TTSES = ['tortoise'] # + ['vall-e']
 
 GENERATE_SETTINGS_ARGS = None
 

@@ -56,6 +59,16 @@ RESAMPLERS = {}
 MIN_TRAINING_DURATION = 0.6
 MAX_TRAINING_DURATION = 11.6097505669
 
+ VALLE_ENABLED = False
+
+ try:
+ from vall_e.emb.qnt import encode as quantize
+ from vall_e.emb.g2p import encode as phonemize
+
+ VALLE_ENABLED = True
+ except Exception as e:
+ pass
+
 args = None
 tts = None
 tts_loading = False

@@ -1195,7 +1208,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
 messages.append(f"Sliced segments: {files} => {segments}.")
 return "\n".join(messages)
 
- def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=False ):
+ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=True ):
 indir = f'./training/{voice}/'
 infile = f'{indir}/whisper.json'
 messages = []

@@ -1273,6 +1286,8 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 continue
 
 waveform, sample_rate = torchaudio.load(path)
+ num_channels, num_frames = waveform.shape
+ duration = num_frames / sample_rate
 
 error = validate_waveform( waveform, sample_rate )
 if error:

@@ -1281,21 +1296,43 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 messages.append(message)
 errored += 1
 continue
+
 
 culled = len(text) < text_length
 if not culled and audio_length > 0:
- num_channels, num_frames = waveform.shape
- duration = num_frames / sample_rate
 culled = duration < audio_length
 
 # for when i add in a little treat ;), as it requires normalized text
- if normalize and length(normalized_text) < 200:
+ if normalize and len(normalized_text) < 200:
 line = f'audio/{file}|{text}|{normalized_text}'
 else:
 line = f'audio/{file}|{text}'
 
 lines['training' if not culled else 'validation'].append(line)
 
+ if culled or not VALLE_ENABLED:
+ continue
+
+ # VALL-E dataset
+ os.makedirs(f'{indir}/valle/', exist_ok=True)
+
+ try:
+ from vall_e.emb.qnt import encode as quantize
+ from vall_e.emb.g2p import encode as phonemize
+
+ if waveform.shape[0] == 2:
+ waveform = wav[:1]
+
+ quantized = quantize( waveform, sample_rate ).cpu()
+ torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
+
+ phonemes = phonemize(normalized_text)
+ open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemes))
+
+ except Exception as e:
+ print(e)
+ pass
+
 training_joined = "\n".join(lines['training'])
 validation_joined = "\n".join(lines['validation'])
 
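The new VALL-E branch above writes, for every non-culled clip, a quantized audio tensor and a phoneme transcript alongside the DLAS text lists, which is the layout the new .template.valle.yaml points its `data_dirs` at. One caveat: the stereo-downmix line assigns from `wav`, which reads like it should be `waveform`. A small sketch of the output paths the two `file.replace(...)` calls produce, with a hypothetical voice and clip name:

# hypothetical voice and clip name
indir = './training/myvoice'
file = '0001.wav'

print(f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')   # ./training/myvoice/valle/0001.qnt.pt
print(f'{indir}/valle/{file.replace(".wav",".phn.txt")}')  # ./training/myvoice/valle/0001.phn.txt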
 
 
@@ -1538,21 +1575,27 @@ def save_training_settings( **kwargs ):
 settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'"
 settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'"
 
- with open(f'./models/.template.yaml', 'r', encoding="utf-8") as f:
- yaml = f.read()
+ def use_template(template, out):
+ with open(template, 'r', encoding="utf-8") as f:
+ yaml = f.read()
 
- # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
- for k in settings:
- if settings[k] is None:
- continue
- yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
+ # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
+ for k in settings:
+ if settings[k] is None:
+ continue
+ yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
 
- outyaml = f'./training/{settings["voice"]}/train.yaml'
- with open(outyaml, 'w', encoding="utf-8") as f:
- f.write(yaml)
+ with open(out, 'w', encoding="utf-8") as f:
+ f.write(yaml)
 
+ use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml')
 
- messages.append(f"Saved training output to: {outyaml}")
+ settings['model_name'] = "ar"
+ use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/ar.yaml')
+ settings['model_name'] = "nar"
+ use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/nar.yaml')
+
+ messages.append(f"Saved training output")
 return settings, messages
 
 def import_voices(files, saveAs=None, progress=None):
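The `use_template` helper added above does plain string substitution: inside the loop, `f"${{{k}}}"` renders the literal placeholder `${key}`, which is then replaced by the setting's value, and the VALL-E template is rendered twice, once with `model_name = "ar"` and once with `"nar"`, to produce ar.yaml and nar.yaml. A self-contained illustration of that substitution (the settings values are made up):

settings = {"model_name": "ar", "batch_size": 8}  # made-up values
yaml = "model: ${model_name}\nbatch_size: ${batch_size}\n"

for k in settings:
    if settings[k] is None:
        continue
    # f"${{{k}}}" renders the literal placeholder "${model_name}" / "${batch_size}"
    yaml = yaml.replace(f"${{{k}}}", str(settings[k]))

print(yaml)  # model: ar
             # batch_size: 8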
 
@@ -1743,17 +1786,22 @@ def setup_args():
 'latents-lean-and-mean': True,
 'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
 'voice-fixer-use-cuda': True,
+
 'force-cpu-for-conditioning-latents': False,
 'defer-tts-load': False,
 'device-override': None,
 'prune-nonfinal-outputs': True,
- 'vocoder-model': VOCODERS[-1],
 'concurrency-count': 2,
- 'autocalculate-voice-chunk-duration-size': 0,
+ 'autocalculate-voice-chunk-duration-size': 10,
+
 'output-sample-rate': 44100,
 'output-volume': 1,
 
+ 'tts-backend': TTSES[0],
+
 'autoregressive-model': None,
+ 'vocoder-model': VOCODERS[-1],
+
 'whisper-backend': 'openai/whisper',
 'whisper-model': "base",
 
 
@@ -1792,6 +1840,7 @@ def setup_args():
 parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
 parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
 
+ parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
 parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
 parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
 parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
 
@@ -1828,10 +1877,48 @@ def setup_args():
 
 return args
 
+ def get_default_settings( hypenated=True ):
+ settings = {
+ 'listen': None if not args.listen else args.listen,
+ 'share': args.share,
+ 'low-vram':args.low_vram,
+ 'check-for-updates':args.check_for_updates,
+ 'models-from-local-only':args.models_from_local_only,
+ 'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
+ 'defer-tts-load': args.defer_tts_load,
+ 'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
+ 'device-override': args.device_override,
+ 'sample-batch-size': args.sample_batch_size,
+ 'embed-output-metadata': args.embed_output_metadata,
+ 'latents-lean-and-mean': args.latents_lean_and_mean,
+ 'voice-fixer': args.voice_fixer,
+ 'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
+ 'concurrency-count': args.concurrency_count,
+ 'output-sample-rate': args.output_sample_rate,
+ 'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
+ 'output-volume': args.output_volume,
+
+ 'tts-backend': args.tts_backend,
+
+ 'autoregressive-model': args.autoregressive_model,
+ 'vocoder-model': args.vocoder_model,
+
+ 'whisper-backend': args.whisper_backend,
+ 'whisper-model': args.whisper_model,
+
+ 'training-default-halfp': args.training_default_halfp,
+ 'training-default-bnb': args.training_default_bnb,
+ }
+
+ res = {}
+ for k in settings:
+ res[k.replace("-", "_") if not hypenated else k] = settings[k]
+ return res
+
 def update_args( **kwargs ):
 global args
 
- settings = {}
+ settings = get_default_settings(hypenated=False)
 settings.update(kwargs)
 
 args.listen = settings['listen']
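The new `get_default_settings` centralizes the settings dict that `save_args_settings` previously built inline, and `update_args` now seeds its settings from it instead of an empty dict: hyphenated keys match the on-disk exec.json style, and `hypenated=False` converts them to the underscore names used for `args` attributes. A tiny sketch of that key conversion on a made-up subset (the `convert` name here is only for illustration):

def convert(settings, hypenated=True):  # mirrors the loop at the end of get_default_settings
    res = {}
    for k in settings:
        res[k.replace("-", "_") if not hypenated else k] = settings[k]
    return res

sample = {"sample-batch-size": 16, "whisper-model": "base"}  # made-up subset of settings
print(convert(sample))                   # {'sample-batch-size': 16, 'whisper-model': 'base'}
print(convert(sample, hypenated=False))  # {'sample_batch_size': 16, 'whisper_model': 'base'}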
 
@@ -1853,8 +1940,10 @@ def update_args( **kwargs ):
 args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
 args.output_volume = settings['output_volume']
 
+ args.tts_backend = settings['tts_backend']
 args.autoregressive_model = settings['autoregressive_model']
 args.vocoder_model = settings['vocoder_model']
+
 args.whisper_backend = settings['whisper_backend']
 args.whisper_model = settings['whisper_model']
 
 
@@ -1865,34 +1954,7 @@ def update_args( **kwargs ):
 
 def save_args_settings():
 global args
- settings = {
- 'listen': None if not args.listen else args.listen,
- 'share': args.share,
- 'low-vram':args.low_vram,
- 'check-for-updates':args.check_for_updates,
- 'models-from-local-only':args.models_from_local_only,
- 'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
- 'defer-tts-load': args.defer_tts_load,
- 'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
- 'device-override': args.device_override,
- 'sample-batch-size': args.sample_batch_size,
- 'embed-output-metadata': args.embed_output_metadata,
- 'latents-lean-and-mean': args.latents_lean_and_mean,
- 'voice-fixer': args.voice_fixer,
- 'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
- 'concurrency-count': args.concurrency_count,
- 'output-sample-rate': args.output_sample_rate,
- 'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
- 'output-volume': args.output_volume,
-
- 'autoregressive-model': args.autoregressive_model,
- 'vocoder-model': args.vocoder_model,
- 'whisper-backend': args.whisper_backend,
- 'whisper-model': args.whisper_model,
-
- 'training-default-halfp': args.training_default_halfp,
- 'training-default-bnb': args.training_default_bnb,
- }
+ settings = get_default_settings()
 
 os.makedirs('./config/', exist_ok=True)
 with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
 
@@ -2009,18 +2071,13 @@ def load_tts( restart=False, autoregressive_model=None ):
 if autoregressive_model == "auto":
 autoregressive_model = deduce_autoregressive_model()
 
- print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
 
 if get_device_name() == "cpu":
 print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
 
 tts_loading = True
- try:
- tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
- except Exception as e:
- tts = TextToSpeech(minor_optimizations=not args.low_vram)
- load_autoregressive_model(autoregressive_model)
-
+ print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
+ tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
 tts_loading = False
 
 get_model_path('dvae.pth')
src/webui.py CHANGED
@@ -548,11 +548,11 @@ def setup_gradio():
 EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
 EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)
 
+ # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
+
 EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
-
 EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
 
-
 EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
 EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
 
start.sh CHANGED
@@ -1,4 +1,5 @@
 #!/bin/bash
+ ulimit -Sn `ulimit -Hn` # ROCm is a bitch
 source ./venv/bin/activate
 python3 ./src/main.py "$@"
 deactivate
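The added ulimit line sets the shell's soft limit on open file descriptors to the hard limit before launching, since ROCm setups can exhaust the default soft limit. A rough Python equivalent, purely for illustration and not part of the commit, if one wanted the same effect from inside the script:

import resource

# read the current soft/hard limits on open file descriptors
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)

# raise the soft limit to the hard limit, the same effect as `ulimit -Sn $(ulimit -Hn)`
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))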