Rename argument `scene` to `soundscape`
Browse files- api.py +14 -14
- msinference.py +19 -11
- tts.py +16 -10
api.py
CHANGED
@@ -18,7 +18,7 @@ from flask_cors import CORS
|
|
18 |
from moviepy.editor import *
|
19 |
from audiocraft.builders import AudioGen
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
-
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same
|
22 |
|
23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
24 |
|
@@ -82,14 +82,14 @@ def _shift(x):
|
|
82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
83 |
return x #* fade_in # silence this
|
84 |
|
85 |
-
def overlay(x,
|
86 |
|
87 |
-
if
|
88 |
|
89 |
# SOUNDS
|
90 |
-
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {
|
91 |
background = sound_generator.generate(
|
92 |
-
[
|
93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
94 |
|
95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
@@ -113,7 +113,7 @@ def overlay(x, scene=None):
|
|
113 |
# background = _shift(background)
|
114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
116 |
-
x = .
|
117 |
else:
|
118 |
print('sound_background = None')
|
119 |
return x
|
@@ -121,7 +121,7 @@ def overlay(x, scene=None):
|
|
121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
122 |
text=None,
|
123 |
voice=None,
|
124 |
-
|
125 |
speed=None):
|
126 |
'''create 24kHZ np.array with tts
|
127 |
|
@@ -129,7 +129,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
129 |
to perform affective TTS.
|
130 |
text : string
|
131 |
voice : string or None (falls to styleTTS)
|
132 |
-
|
133 |
'''
|
134 |
|
135 |
|
@@ -161,7 +161,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
161 |
|
162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
163 |
|
164 |
-
return overlay(x,
|
165 |
|
166 |
|
167 |
|
@@ -201,7 +201,7 @@ def serve_wav():
|
|
201 |
affective = r.get('affective')[0],
|
202 |
voice = r.get('voice')[0],
|
203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
204 |
-
|
205 |
)
|
206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
207 |
|
@@ -399,7 +399,7 @@ def serve_wav():
|
|
399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
400 |
precomputed_style_vector=precomputed_style_vector,
|
401 |
voice=args.voice,
|
402 |
-
|
403 |
speed=args.speed)
|
404 |
)
|
405 |
total = np.concatenate(pieces, 0)
|
@@ -420,7 +420,7 @@ def serve_wav():
|
|
420 |
x = tts_multi_sentence(text=text,
|
421 |
precomputed_style_vector=precomputed_style_vector,
|
422 |
voice=args.voice,
|
423 |
-
|
424 |
speed=args.speed)
|
425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
426 |
|
@@ -439,7 +439,7 @@ def serve_wav():
|
|
439 |
x = tts_multi_sentence(text=text,
|
440 |
precomputed_style_vector=precomputed_style_vector,
|
441 |
voice=args.voice,
|
442 |
-
|
443 |
speed=args.speed
|
444 |
)
|
445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
@@ -468,7 +468,7 @@ def serve_wav():
|
|
468 |
x = tts_multi_sentence(text=text,
|
469 |
precomputed_style_vector=precomputed_style_vector,
|
470 |
voice=args.voice,
|
471 |
-
|
472 |
speed=args.speed)
|
473 |
OUT_FILE = 'tmp.wav'
|
474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
|
|
18 |
from moviepy.editor import *
|
19 |
from audiocraft.builders import AudioGen
|
20 |
CACHE_DIR = 'flask_cache/'
|
21 |
+
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
22 |
|
23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
24 |
|
|
|
82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
83 |
return x #* fade_in # silence this
|
84 |
|
85 |
+
def overlay(x, soundscape=None):
|
86 |
|
87 |
+
if soundscape is not None:
|
88 |
|
89 |
# SOUNDS
|
90 |
+
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
|
91 |
background = sound_generator.generate(
|
92 |
+
[soundscape] * NUM_SOUND_GENERATIONS
|
93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
94 |
|
95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
|
|
113 |
# background = _shift(background)
|
114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
116 |
+
x = .6 * x + .4 * background[:len(x)]
|
117 |
else:
|
118 |
print('sound_background = None')
|
119 |
return x
|
|
|
121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
122 |
text=None,
|
123 |
voice=None,
|
124 |
+
soundscape=None,
|
125 |
speed=None):
|
126 |
'''create 24kHZ np.array with tts
|
127 |
|
|
|
129 |
to perform affective TTS.
|
130 |
text : string
|
131 |
voice : string or None (falls to styleTTS)
|
132 |
+
soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
|
133 |
'''
|
134 |
|
135 |
|
|
|
161 |
|
162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
163 |
|
164 |
+
return overlay(x, soundscape=soundscape)
|
165 |
|
166 |
|
167 |
|
|
|
201 |
affective = r.get('affective')[0],
|
202 |
voice = r.get('voice')[0],
|
203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
204 |
+
soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
|
205 |
)
|
206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
207 |
|
|
|
399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
400 |
precomputed_style_vector=precomputed_style_vector,
|
401 |
voice=args.voice,
|
402 |
+
soundscape=args.soundscape,
|
403 |
speed=args.speed)
|
404 |
)
|
405 |
total = np.concatenate(pieces, 0)
|
|
|
420 |
x = tts_multi_sentence(text=text,
|
421 |
precomputed_style_vector=precomputed_style_vector,
|
422 |
voice=args.voice,
|
423 |
+
soundscape=args.soundscape,
|
424 |
speed=args.speed)
|
425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
426 |
|
|
|
439 |
x = tts_multi_sentence(text=text,
|
440 |
precomputed_style_vector=precomputed_style_vector,
|
441 |
voice=args.voice,
|
442 |
+
soundscape=args.soundscape,
|
443 |
speed=args.speed
|
444 |
)
|
445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
|
|
468 |
x = tts_multi_sentence(text=text,
|
469 |
precomputed_style_vector=precomputed_style_vector,
|
470 |
voice=args.voice,
|
471 |
+
soundscape=args.soundscape,
|
472 |
speed=args.speed)
|
473 |
OUT_FILE = 'tmp.wav'
|
474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
msinference.py
CHANGED
@@ -373,13 +373,16 @@ class TextForeign(object):
|
|
373 |
def foreign(text=None, # list of text
|
374 |
lang='romanian',
|
375 |
speed=None):
|
|
|
|
|
|
|
376 |
# https://huggingface.co/spaces/mms-meta/MMS
|
377 |
|
378 |
-
if 'hun' in lang
|
379 |
|
380 |
lang_code = 'hun'
|
381 |
|
382 |
-
elif 'ser' in lang
|
383 |
|
384 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
385 |
|
@@ -389,14 +392,22 @@ def foreign(text=None, # list of text
|
|
389 |
|
390 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
391 |
|
392 |
-
elif 'rom' in lang
|
393 |
|
394 |
lang_code = 'ron'
|
395 |
speed = 1.24 if speed is None else speed
|
396 |
|
|
|
|
|
|
|
|
|
|
|
397 |
else:
|
|
|
398 |
lang_code = lang.split()[0].strip()
|
399 |
-
|
|
|
|
|
400 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
401 |
vocab_file = hf_hub_download(
|
402 |
repo_id="facebook/mms-tts",
|
@@ -444,8 +455,10 @@ def foreign(text=None, # list of text
|
|
444 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
445 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
446 |
|
447 |
-
_t = _t.lower().replace("ţ", "ț").replace('ț','ts')
|
|
|
448 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
|
|
449 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
450 |
stn_tst = text_mapper.get_text(_t, hps)
|
451 |
with torch.no_grad():
|
@@ -464,16 +477,11 @@ def foreign(text=None, # list of text
|
|
464 |
|
465 |
x /= np.abs(x).max() + 1e-7
|
466 |
|
467 |
-
#
|
468 |
-
# x = hyp #, text
|
469 |
-
print(x.shape, x.min(), x.max(), hps.data.sampling_rate) # (hps.data.sampling_rate,
|
470 |
|
471 |
x = audresample.resample(signal=x.astype(np.float32),
|
472 |
original_rate=16000,
|
473 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
return x
|
478 |
|
479 |
|
|
|
373 |
def foreign(text=None, # list of text
|
374 |
lang='romanian',
|
375 |
speed=None):
|
376 |
+
|
377 |
+
lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
|
378 |
+
|
379 |
# https://huggingface.co/spaces/mms-meta/MMS
|
380 |
|
381 |
+
if 'hun' in lang:
|
382 |
|
383 |
lang_code = 'hun'
|
384 |
|
385 |
+
elif 'ser' in lang:
|
386 |
|
387 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
388 |
|
|
|
392 |
|
393 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
394 |
|
395 |
+
elif 'rom' in lang:
|
396 |
|
397 |
lang_code = 'ron'
|
398 |
speed = 1.24 if speed is None else speed
|
399 |
|
400 |
+
elif 'ger' in lang:
|
401 |
+
|
402 |
+
lang_code = 'deu'
|
403 |
+
speed = 1.14 if speed is None else speed
|
404 |
+
|
405 |
else:
|
406 |
+
|
407 |
lang_code = lang.split()[0].strip()
|
408 |
+
|
409 |
+
# Load VITS
|
410 |
+
|
411 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
412 |
vocab_file = hf_hub_download(
|
413 |
repo_id="facebook/mms-tts",
|
|
|
455 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
456 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
457 |
|
458 |
+
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
459 |
+
|
460 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
461 |
+
|
462 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
463 |
stn_tst = text_mapper.get_text(_t, hps)
|
464 |
with torch.no_grad():
|
|
|
477 |
|
478 |
x /= np.abs(x).max() + 1e-7
|
479 |
|
480 |
+
# print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
|
|
|
|
|
481 |
|
482 |
x = audresample.resample(signal=x.astype(np.float32),
|
483 |
original_rate=16000,
|
484 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
|
|
|
|
|
|
485 |
return x
|
486 |
|
487 |
|
tts.py
CHANGED
@@ -42,6 +42,14 @@ def command_line_args():
|
|
42 |
default='sample.txt',
|
43 |
type=str,
|
44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
parser.add_argument(
|
46 |
'--native',
|
47 |
help="""
|
@@ -83,21 +91,24 @@ def command_line_args():
|
|
83 |
|
84 |
def send_to_server(args):
|
85 |
url = "http://192.168.88.209:5000"
|
|
|
|
|
86 |
|
87 |
payload = {
|
88 |
'affective': args.affective,
|
89 |
'voice': args.voice,
|
|
|
90 |
'native': args.native,
|
91 |
'text': args.text,
|
92 |
'image': args.image,
|
93 |
'video': args.video,
|
94 |
'speed': args.speed,
|
|
|
95 |
# 'out_file': args.out_file # let serve save as temp
|
96 |
}
|
97 |
|
98 |
-
#
|
99 |
-
|
100 |
-
# In files= sent actual files if provided
|
101 |
text_file = open(args.text, 'rb')
|
102 |
|
103 |
image_file, video_file, native_file = None, None, None
|
@@ -107,7 +118,6 @@ def send_to_server(args):
|
|
107 |
image_file = open(args.image, 'rb')
|
108 |
except FileNotFoundError:
|
109 |
pass
|
110 |
-
|
111 |
|
112 |
if args.video is not None:
|
113 |
print('\nLOADING vid\n')
|
@@ -122,14 +132,10 @@ def send_to_server(args):
|
|
122 |
native_file = open(args.native, 'rb')
|
123 |
except FileNotFoundError:
|
124 |
pass
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
# --------------------- send this extra
|
129 |
|
130 |
-
#
|
131 |
|
132 |
-
response = requests.post(url, data=payload,
|
133 |
files=[(args.text, text_file),
|
134 |
(args.image, image_file),
|
135 |
(args.video, video_file),
|
|
|
42 |
default='sample.txt',
|
43 |
type=str,
|
44 |
)
|
45 |
+
parser.add_argument(
|
46 |
+
'--soundscape',
|
47 |
+
help='soundscape - MUST BE IN BRACKETS: \"forest\"',
|
48 |
+
default='wind fjord',
|
49 |
+
nargs='?',
|
50 |
+
type=str,
|
51 |
+
const=None,
|
52 |
+
)
|
53 |
parser.add_argument(
|
54 |
'--native',
|
55 |
help="""
|
|
|
91 |
|
92 |
def send_to_server(args):
|
93 |
url = "http://192.168.88.209:5000"
|
94 |
+
|
95 |
+
# Args
|
96 |
|
97 |
payload = {
|
98 |
'affective': args.affective,
|
99 |
'voice': args.voice,
|
100 |
+
'soundscape': args.soundscape,
|
101 |
'native': args.native,
|
102 |
'text': args.text,
|
103 |
'image': args.image,
|
104 |
'video': args.video,
|
105 |
'speed': args.speed,
|
106 |
+
|
107 |
# 'out_file': args.out_file # let serve save as temp
|
108 |
}
|
109 |
|
110 |
+
# Send Files
|
111 |
+
|
|
|
112 |
text_file = open(args.text, 'rb')
|
113 |
|
114 |
image_file, video_file, native_file = None, None, None
|
|
|
118 |
image_file = open(args.image, 'rb')
|
119 |
except FileNotFoundError:
|
120 |
pass
|
|
|
121 |
|
122 |
if args.video is not None:
|
123 |
print('\nLOADING vid\n')
|
|
|
132 |
native_file = open(args.native, 'rb')
|
133 |
except FileNotFoundError:
|
134 |
pass
|
|
|
|
|
|
|
|
|
135 |
|
136 |
+
#
|
137 |
|
138 |
+
response = requests.post(url, data=payload, # contains str
|
139 |
files=[(args.text, text_file),
|
140 |
(args.image, image_file),
|
141 |
(args.video, video_file),
|