Space build status: Build error
Commit 54036fd by mrq
Commit message: ":)"
1 parent: 92a05d3

Files changed:
- models/{.template.yaml → .template.dlas.yaml} (renamed) +0 -0
- models/.template.valle.yaml (added) +9 -0
- src/utils.py +110 -53
- src/webui.py +2 -2
- start.sh +1 -0

models/{.template.yaml → .template.dlas.yaml}
RENAMED
File without changes

models/.template.valle.yaml
ADDED
@@ -0,0 +1,9 @@
+data_dirs: [./training/${voice}/valle/]
+spkr_name_getter: "lambda p: p.parts[-3]"
+
+model: ${model_name}
+batch_size: ${batch_size}
+eval_batch_size: ${validation_batch_size}
+eval_every: ${validation_rate}
+
+sampling_temperature: 1.0
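
Every ${...} token in this template is a placeholder filled in by plain string substitution when training settings are saved (see the use_template() hunk in src/utils.py further down). A small, hedged sketch of checking that a template and a settings dict agree; the settings values below are illustrative, not taken from a real run:

# Hedged sketch: list the ${...} placeholders a template declares and flag any
# that a given settings dict would leave unfilled. Values below are illustrative.
import re

def unfilled_placeholders(template_path, settings):
    with open(template_path, 'r', encoding='utf-8') as f:
        text = f.read()
    declared = set(re.findall(r'\$\{(\w+)\}', text))           # e.g. {'voice', 'model_name', ...}
    provided = {k for k, v in settings.items() if v is not None}
    return declared - provided                                  # empty set means fully renderable

settings = {'voice': 'example', 'model_name': 'ar', 'batch_size': 8,
            'validation_batch_size': 8, 'validation_rate': 25}
print(unfilled_placeholders('./models/.template.valle.yaml', settings))
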
src/utils.py
CHANGED
@@ -20,6 +20,8 @@ import subprocess
 import psutil
 import yaml
 import hashlib
+import io
+import gzip
 
 import tqdm
 import torch
@@ -45,6 +47,7 @@ WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
+TTSES = ['tortoise'] # + ['vall-e']
 
 GENERATE_SETTINGS_ARGS = None
 
@@ -56,6 +59,16 @@ RESAMPLERS = {}
 MIN_TRAINING_DURATION = 0.6
 MAX_TRAINING_DURATION = 11.6097505669
 
+VALLE_ENABLED = False
+
+try:
+	from vall_e.emb.qnt import encode as quantize
+	from vall_e.emb.g2p import encode as phonemize
+
+	VALLE_ENABLED = True
+except Exception as e:
+	pass
+
 args = None
 tts = None
 tts_loading = False
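
The VALLE_ENABLED flag added above is a guarded-import feature gate: the optional vall_e package is probed once at module load, and everything else only branches on the boolean. A minimal, self-contained sketch of the same pattern (the backend_choices helper is illustrative, not something this commit adds):

# Sketch of the optional-backend gate used above; `backend_choices` is illustrative.
VALLE_ENABLED = False
try:
    # only succeeds when the optional `vall_e` package is installed
    from vall_e.emb.qnt import encode as quantize   # audio -> discrete codes
    from vall_e.emb.g2p import encode as phonemize  # text  -> phoneme tokens
    VALLE_ENABLED = True
except ImportError:
    pass  # TorToiSe-only install; the VALL-E dataset steps are skipped later

def backend_choices():
    """Report which TTS backends this install can actually run."""
    return ['tortoise'] + (['vall-e'] if VALLE_ENABLED else [])
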
@@ -1195,7 +1208,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
 	messages.append(f"Sliced segments: {files} => {segments}.")
 	return "\n".join(messages)
 
-def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=False ):
+def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=True ):
 	indir = f'./training/{voice}/'
 	infile = f'{indir}/whisper.json'
 	messages = []
@@ -1273,6 +1286,8 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 			continue
 
 		waveform, sample_rate = torchaudio.load(path)
+		num_channels, num_frames = waveform.shape
+		duration = num_frames / sample_rate
 
 		error = validate_waveform( waveform, sample_rate )
 		if error:
@@ -1281,21 +1296,43 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
 			messages.append(message)
 			errored += 1
 			continue
+
 
 		culled = len(text) < text_length
 		if not culled and audio_length > 0:
-			num_channels, num_frames = waveform.shape
-			duration = num_frames / sample_rate
 			culled = duration < audio_length
 
 		# for when i add in a little treat ;), as it requires normalized text
-		if normalize and
+		if normalize and len(normalized_text) < 200:
 			line = f'audio/{file}|{text}|{normalized_text}'
 		else:
 			line = f'audio/{file}|{text}'
 
 		lines['training' if not culled else 'validation'].append(line)
 
+		if culled or not VALLE_ENABLED:
+			continue
+
+		# VALL-E dataset
+		os.makedirs(f'{indir}/valle/', exist_ok=True)
+
+		try:
+			from vall_e.emb.qnt import encode as quantize
+			from vall_e.emb.g2p import encode as phonemize
+
+			if waveform.shape[0] == 2:
+				waveform = wav[:1]
+
+			quantized = quantize( waveform, sample_rate ).cpu()
+			torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
+
+			phonemes = phonemize(normalized_text)
+			open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemes))
+
+		except Exception as e:
+			print(e)
+			pass
+
 	training_joined = "\n".join(lines['training'])
 	validation_joined = "\n".join(lines['validation'])
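
For clips that survive culling, the new branch also writes a VALL-E-ready dataset alongside the DLAS one: the waveform is quantized into discrete codes (saved as .qnt.pt) and the normalized transcript is converted to phonemes (saved as .phn.txt). A hedged, self-contained sketch of that per-clip step, assuming the optional vall_e package is installed; the helper name and file layout are illustrative:

# Hedged sketch of the per-clip VALL-E preprocessing added above.
# Assumes the optional `vall_e` package is installed; names/paths are illustrative.
import os
import torch
import torchaudio

from vall_e.emb.qnt import encode as quantize    # waveform -> discrete audio codes
from vall_e.emb.g2p import encode as phonemize   # text     -> phoneme sequence

def preprocess_clip(wav_path, normalized_text, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    waveform, sample_rate = torchaudio.load(wav_path)

    # mirror the stereo check above: keep only the first channel
    if waveform.shape[0] == 2:
        waveform = waveform[:1]

    stem = os.path.splitext(os.path.basename(wav_path))[0]

    quantized = quantize(waveform, sample_rate).cpu()
    torch.save(quantized, os.path.join(out_dir, f"{stem}.qnt.pt"))

    phonemes = phonemize(normalized_text)
    with open(os.path.join(out_dir, f"{stem}.phn.txt"), "w", encoding="utf-8") as f:
        f.write(" ".join(phonemes))
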
@@ -1538,21 +1575,27 @@ def save_training_settings( **kwargs ):
 	settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'"
 	settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'"
 
-		f.write(yaml)
+	def use_template(template, out):
+		with open(template, 'r', encoding="utf-8") as f:
+			yaml = f.read()
 
+		# i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
+		for k in settings:
+			if settings[k] is None:
+				continue
+			yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
 
+		with open(out, 'w', encoding="utf-8") as f:
+			f.write(yaml)
 
+	use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml')
 
+	settings['model_name'] = "ar"
+	use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/ar.yaml')
+	settings['model_name'] = "nar"
+	use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/nar.yaml')
+
+	messages.append(f"Saved training output")
 	return settings, messages
 
 def import_voices(files, saveAs=None, progress=None):
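
use_template() is deliberately simple: it reads a template, replaces every ${key} whose setting is not None with str(value), and writes the result; the VALL-E template is rendered twice, once with model_name set to "ar" and once to "nar". A standalone demonstration of the same substitution, with illustrative values:

# Standalone demo of the ${key} substitution use_template() performs; values are illustrative.
settings = {
    'voice': 'example',
    'model_name': 'ar',
    'batch_size': 8,
    'validation_batch_size': 8,
    'validation_rate': 25,
}

template = (
    "data_dirs: [./training/${voice}/valle/]\n"
    "model: ${model_name}\n"
    "batch_size: ${batch_size}\n"
    "eval_batch_size: ${validation_batch_size}\n"
    "eval_every: ${validation_rate}\n"
)

rendered = template
for k, v in settings.items():
    if v is None:
        continue  # unset keys leave their ${...} placeholder untouched
    rendered = rendered.replace(f"${{{k}}}", str(v))

print(rendered)  # plain YAML text with every known placeholder filled in
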
@@ -1743,17 +1786,22 @@ def setup_args():
 		'latents-lean-and-mean': True,
 		'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
 		'voice-fixer-use-cuda': True,
+
 		'force-cpu-for-conditioning-latents': False,
 		'defer-tts-load': False,
 		'device-override': None,
 		'prune-nonfinal-outputs': True,
-		'vocoder-model': VOCODERS[-1],
 		'concurrency-count': 2,
-		'autocalculate-voice-chunk-duration-size':
+		'autocalculate-voice-chunk-duration-size': 10,
+
 		'output-sample-rate': 44100,
 		'output-volume': 1,
 
+		'tts-backend': TTSES[0],
+
 		'autoregressive-model': None,
+		'vocoder-model': VOCODERS[-1],
+
 		'whisper-backend': 'openai/whisper',
 		'whisper-model': "base",
 
@@ -1792,6 +1840,7 @@ def setup_args():
 	parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
 	parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
 
+	parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
 	parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
 	parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
 	parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
@@ -1828,10 +1877,48 @@ def setup_args():
 
 	return args
 
+def get_default_settings( hypenated=True ):
+	settings = {
+		'listen': None if not args.listen else args.listen,
+		'share': args.share,
+		'low-vram':args.low_vram,
+		'check-for-updates':args.check_for_updates,
+		'models-from-local-only':args.models_from_local_only,
+		'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
+		'defer-tts-load': args.defer_tts_load,
+		'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
+		'device-override': args.device_override,
+		'sample-batch-size': args.sample_batch_size,
+		'embed-output-metadata': args.embed_output_metadata,
+		'latents-lean-and-mean': args.latents_lean_and_mean,
+		'voice-fixer': args.voice_fixer,
+		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
+		'concurrency-count': args.concurrency_count,
+		'output-sample-rate': args.output_sample_rate,
+		'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
+		'output-volume': args.output_volume,
+
+		'tts-backend': args.tts_backend,
+
+		'autoregressive-model': args.autoregressive_model,
+		'vocoder-model': args.vocoder_model,
+
+		'whisper-backend': args.whisper_backend,
+		'whisper-model': args.whisper_model,
+
+		'training-default-halfp': args.training_default_halfp,
+		'training-default-bnb': args.training_default_bnb,
+	}
+
+	res = {}
+	for k in settings:
+		res[k.replace("-", "_") if not hypenated else k] = settings[k]
+	return res
+
 def update_args( **kwargs ):
 	global args
 
-	settings =
+	settings = get_default_settings(hypenated=False)
 	settings.update(kwargs)
 
 	args.listen = settings['listen']
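
get_default_settings() consolidates what update_args() and save_args_settings() previously each built by hand: one dict of the current argument values, returned either with hyphenated keys (the on-disk ./config/exec.json style) or with underscored keys (the attribute/kwarg style). A small sketch of that key-style switch, with illustrative keys:

# Hedged sketch of the hyphen/underscore key switch get_default_settings() performs.
def with_key_style(settings, hyphenated=True):
    # hyphenated keys match the on-disk config; underscored keys match attribute names
    return {(k if hyphenated else k.replace("-", "_")): v for k, v in settings.items()}

sample = {'low-vram': False, 'whisper-model': 'base', 'output-volume': 1}
print(with_key_style(sample, hyphenated=False))
# {'low_vram': False, 'whisper_model': 'base', 'output_volume': 1}
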
@@ -1853,8 +1940,10 @@ def update_args( **kwargs ):
 	args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
 	args.output_volume = settings['output_volume']
 
+	args.tts_backend = settings['tts_backend']
 	args.autoregressive_model = settings['autoregressive_model']
 	args.vocoder_model = settings['vocoder_model']
+
 	args.whisper_backend = settings['whisper_backend']
 	args.whisper_model = settings['whisper_model']
 
@@ -1865,34 +1954,7 @@ def update_args( **kwargs ):
 
 def save_args_settings():
 	global args
-	settings = {
-		'listen': None if not args.listen else args.listen,
-		'share': args.share,
-		'low-vram':args.low_vram,
-		'check-for-updates':args.check_for_updates,
-		'models-from-local-only':args.models_from_local_only,
-		'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
-		'defer-tts-load': args.defer_tts_load,
-		'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
-		'device-override': args.device_override,
-		'sample-batch-size': args.sample_batch_size,
-		'embed-output-metadata': args.embed_output_metadata,
-		'latents-lean-and-mean': args.latents_lean_and_mean,
-		'voice-fixer': args.voice_fixer,
-		'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
-		'concurrency-count': args.concurrency_count,
-		'output-sample-rate': args.output_sample_rate,
-		'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
-		'output-volume': args.output_volume,
-
-		'autoregressive-model': args.autoregressive_model,
-		'vocoder-model': args.vocoder_model,
-		'whisper-backend': args.whisper_backend,
-		'whisper-model': args.whisper_model,
-
-		'training-default-halfp': args.training_default_halfp,
-		'training-default-bnb': args.training_default_bnb,
-	}
+	settings = get_default_settings()
 
 	os.makedirs('./config/', exist_ok=True)
 	with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
@@ -2009,18 +2071,13 @@ def load_tts( restart=False, autoregressive_model=None ):
 	if autoregressive_model == "auto":
 		autoregressive_model = deduce_autoregressive_model()
 
-	print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
 
 	if get_device_name() == "cpu":
 		print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
 
 	tts_loading = True
-
-
-	except Exception as e:
-		tts = TextToSpeech(minor_optimizations=not args.low_vram)
-		load_autoregressive_model(autoregressive_model)
-
+	print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
+	tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
 	tts_loading = False
 
 	get_model_path('dvae.pth')
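
load_tts() now hands the autoregressive checkpoint and vocoder choice straight to the TextToSpeech constructor instead of swapping the AR model in after construction, and the previous fallback path (constructing without those arguments and loading the AR model afterwards) is dropped. A hedged usage sketch; the constructor keywords mirror how the diff calls this repo's TorToiSe fork and should be treated as assumptions about that fork:

# Hedged sketch of building the TTS object the way load_tts() now does.
# Assumes this repo's TorToiSe fork, which accepts these constructor kwargs.
from tortoise.api import TextToSpeech

def build_tts(autoregressive_model_path, vocoder_model, low_vram=False):
    print(f"Loading TorToiSe... (AR: {autoregressive_model_path}, vocoder: {vocoder_model})")
    return TextToSpeech(
        minor_optimizations=not low_vram,                      # skip VRAM-saving paths on larger GPUs
        autoregressive_model_path=autoregressive_model_path,   # finetuned AR checkpoint to load
        vocoder_model=vocoder_model,                           # e.g. 'bigvgan_24khz_100band'
    )
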
src/webui.py
CHANGED
@@ -548,11 +548,11 @@ def setup_gradio():
 		EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
 		EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)
 
+		# EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
+
 		EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
-
 		EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
 
-
 		EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
 		EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
 
start.sh
CHANGED
@@ -1,4 +1,5 @@
 #!/bin/bash
+ulimit -Sn `ulimit -Hn` # ROCm is a bitch
 source ./venv/bin/activate
 python3 ./src/main.py "$@"
 deactivate
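
The new first line raises the shell's soft open-file limit to its hard limit before launching, since ROCm setups can exhaust file descriptors. For reference, the in-process equivalent uses the resource module:

# In-process equivalent of `ulimit -Sn $(ulimit -Hn)`: raise the soft RLIMIT_NOFILE to the hard limit.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
print(f"RLIMIT_NOFILE raised from {soft} to {hard}")
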