Dionyssos committed on
Commit
560f712
·
1 Parent(s): eb2f44b

rename arg scene to soundscape

Browse files
Files changed (3) hide show
  1. api.py +14 -14
  2. msinference.py +19 -11
  3. tts.py +16 -10
api.py CHANGED
@@ -18,7 +18,7 @@ from flask_cors import CORS
18
  from moviepy.editor import *
19
  from audiocraft.builders import AudioGen
20
  CACHE_DIR = 'flask_cache/'
21
- NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same scene for long video)
22
 
23
  sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
24
 
@@ -82,14 +82,14 @@ def _shift(x):
82
  # fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
83
  return x #* fade_in # silence this
84
 
85
- def overlay(x, scene=None):
86
 
87
- if scene is not None:
88
 
89
  # SOUNDS
90
- print(f'AudioGen {NUM_SOUND_GENERATIONS} x {scene}')
91
  background = sound_generator.generate(
92
- [scene] * NUM_SOUND_GENERATIONS
93
  ).reshape(-1).detach().cpu().numpy() # bs, 11400
94
 
95
  # upsample 16 kHz AudioGen to 24kHZ StyleTTS
@@ -113,7 +113,7 @@ def overlay(x, scene=None):
113
  # background = _shift(background)
114
  print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
115
  f'{np.abs(background.max())=}\n{x.shape=}')
116
- x = .1 * x + .9 * background[:len(x)]
117
  else:
118
  print('sound_background = None')
119
  return x
@@ -121,7 +121,7 @@ def overlay(x, scene=None):
121
  def tts_multi_sentence(precomputed_style_vector=None,
122
  text=None,
123
  voice=None,
124
- scene=None,
125
  speed=None):
126
  '''create 24kHZ np.array with tts
127
 
@@ -129,7 +129,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
129
  to perform affective TTS.
130
  text : string
131
  voice : string or None (falls to styleTTS)
132
- scene : 'A castle in far away lands' -> if passed will generate background sound scene
133
  '''
134
 
135
 
@@ -161,7 +161,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
161
 
162
  x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
163
 
164
- return overlay(x, scene=scene)
165
 
166
 
167
 
@@ -201,7 +201,7 @@ def serve_wav():
201
  affective = r.get('affective')[0],
202
  voice = r.get('voice')[0],
203
  speed = float(r.get('speed')[0]), # For Non-English MMS TTS
204
- scene=r.get('scene')[0] if r.get('scene') is not None else None,
205
  )
206
  # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
207
 
@@ -399,7 +399,7 @@ def serve_wav():
399
  pieces.append(tts_multi_sentence(text=[_text_],
400
  precomputed_style_vector=precomputed_style_vector,
401
  voice=args.voice,
402
- scene=args.scene,
403
  speed=args.speed)
404
  )
405
  total = np.concatenate(pieces, 0)
@@ -420,7 +420,7 @@ def serve_wav():
420
  x = tts_multi_sentence(text=text,
421
  precomputed_style_vector=precomputed_style_vector,
422
  voice=args.voice,
423
- scene=args.scene,
424
  speed=args.speed)
425
  soundfile.write(AUDIO_TRACK, x, 24000)
426
 
@@ -439,7 +439,7 @@ def serve_wav():
439
  x = tts_multi_sentence(text=text,
440
  precomputed_style_vector=precomputed_style_vector,
441
  voice=args.voice,
442
- scene=args.scene,
443
  speed=args.speed
444
  )
445
  soundfile.write(AUDIO_TRACK, x, 24000)
@@ -468,7 +468,7 @@ def serve_wav():
468
  x = tts_multi_sentence(text=text,
469
  precomputed_style_vector=precomputed_style_vector,
470
  voice=args.voice,
471
- scene=args.scene,
472
  speed=args.speed)
473
  OUT_FILE = 'tmp.wav'
474
  soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
18
  from moviepy.editor import *
19
  from audiocraft.builders import AudioGen
20
  CACHE_DIR = 'flask_cache/'
21
+ NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
22
 
23
  sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
24
 
 
82
  # fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
83
  return x #* fade_in # silence this
84
 
85
+ def overlay(x, soundscape=None):
86
 
87
+ if soundscape is not None:
88
 
89
  # SOUNDS
90
+ print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
91
  background = sound_generator.generate(
92
+ [soundscape] * NUM_SOUND_GENERATIONS
93
  ).reshape(-1).detach().cpu().numpy() # bs, 11400
94
 
95
  # upsample 16 kHz AudioGen to 24kHZ StyleTTS
 
113
  # background = _shift(background)
114
  print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
115
  f'{np.abs(background.max())=}\n{x.shape=}')
116
+ x = .6 * x + .4 * background[:len(x)]
117
  else:
118
  print('sound_background = None')
119
  return x
 
121
  def tts_multi_sentence(precomputed_style_vector=None,
122
  text=None,
123
  voice=None,
124
+ soundscape=None,
125
  speed=None):
126
  '''create 24kHZ np.array with tts
127
 
 
129
  to perform affective TTS.
130
  text : string
131
  voice : string or None (falls to styleTTS)
132
+ soundscape : 'A castle in far away lands' -> if passed will generate a background soundscape
133
  '''
134
 
135
 
 
161
 
162
  x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
163
 
164
+ return overlay(x, soundscape=soundscape)
165
 
166
 
167
 
 
201
  affective = r.get('affective')[0],
202
  voice = r.get('voice')[0],
203
  speed = float(r.get('speed')[0]), # For Non-English MMS TTS
204
+ soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
205
  )
206
  # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
207
 
 
399
  pieces.append(tts_multi_sentence(text=[_text_],
400
  precomputed_style_vector=precomputed_style_vector,
401
  voice=args.voice,
402
+ soundscape=args.soundscape,
403
  speed=args.speed)
404
  )
405
  total = np.concatenate(pieces, 0)
 
420
  x = tts_multi_sentence(text=text,
421
  precomputed_style_vector=precomputed_style_vector,
422
  voice=args.voice,
423
+ soundscape=args.soundscape,
424
  speed=args.speed)
425
  soundfile.write(AUDIO_TRACK, x, 24000)
426
 
 
439
  x = tts_multi_sentence(text=text,
440
  precomputed_style_vector=precomputed_style_vector,
441
  voice=args.voice,
442
+ soundscape=args.soundscape,
443
  speed=args.speed
444
  )
445
  soundfile.write(AUDIO_TRACK, x, 24000)
 
468
  x = tts_multi_sentence(text=text,
469
  precomputed_style_vector=precomputed_style_vector,
470
  voice=args.voice,
471
+ soundscape=args.soundscape,
472
  speed=args.speed)
473
  OUT_FILE = 'tmp.wav'
474
  soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
msinference.py CHANGED
@@ -373,13 +373,16 @@ class TextForeign(object):
373
  def foreign(text=None, # list of text
374
  lang='romanian',
375
  speed=None):
 
 
 
376
  # https://huggingface.co/spaces/mms-meta/MMS
377
 
378
- if 'hun' in lang.lower():
379
 
380
  lang_code = 'hun'
381
 
382
- elif 'ser' in lang.lower():
383
 
384
  if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
385
 
@@ -389,14 +392,22 @@ def foreign(text=None, # list of text
389
 
390
  lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
391
 
392
- elif 'rom' in lang.lower():
393
 
394
  lang_code = 'ron'
395
  speed = 1.24 if speed is None else speed
396
 
 
 
 
 
 
397
  else:
 
398
  lang_code = lang.split()[0].strip()
399
- # Decoded Language
 
 
400
  print(f'\n\nLANG {lang_code=}\n_____________________\n')
401
  vocab_file = hf_hub_download(
402
  repo_id="facebook/mms-tts",
@@ -444,8 +455,10 @@ def foreign(text=None, # list of text
444
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
445
  _t = text_mapper.uromanize(_t, uroman_pl)
446
 
447
- _t = _t.lower().replace("ţ", "ț").replace('ț','ts') #.replace('ț', 'ts').replace('Ţ', 'ts').replace('î', 'u').replace('Î', 'u')
 
448
  _t = text_mapper.filter_oov(_t, lang=lang)
 
449
  # print(f'{speed=}\n\n\n\n_______________________________ {_t}')
450
  stn_tst = text_mapper.get_text(_t, hps)
451
  with torch.no_grad():
@@ -464,16 +477,11 @@ def foreign(text=None, # list of text
464
 
465
  x /= np.abs(x).max() + 1e-7
466
 
467
- # hyp = (hyp * 32768).astype(np.int16)
468
- # x = hyp #, text
469
- print(x.shape, x.min(), x.max(), hps.data.sampling_rate) # (hps.data.sampling_rate,
470
 
471
  x = audresample.resample(signal=x.astype(np.float32),
472
  original_rate=16000,
473
  target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
474
-
475
-
476
-
477
  return x
478
 
479
 
 
373
  def foreign(text=None, # list of text
374
  lang='romanian',
375
  speed=None):
376
+
377
+ lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
378
+
379
  # https://huggingface.co/spaces/mms-meta/MMS
380
 
381
+ if 'hun' in lang:
382
 
383
  lang_code = 'hun'
384
 
385
+ elif 'ser' in lang:
386
 
387
  if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
388
 
 
392
 
393
  lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
394
 
395
+ elif 'rom' in lang:
396
 
397
  lang_code = 'ron'
398
  speed = 1.24 if speed is None else speed
399
 
400
+ elif 'ger' in lang:
401
+
402
+ lang_code = 'deu'
403
+ speed = 1.14 if speed is None else speed
404
+
405
  else:
406
+
407
  lang_code = lang.split()[0].strip()
408
+
409
+ # Load VITS
410
+
411
  print(f'\n\nLANG {lang_code=}\n_____________________\n')
412
  vocab_file = hf_hub_download(
413
  repo_id="facebook/mms-tts",
 
455
  uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
456
  _t = text_mapper.uromanize(_t, uroman_pl)
457
 
458
+ _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
459
+
460
  _t = text_mapper.filter_oov(_t, lang=lang)
461
+
462
  # print(f'{speed=}\n\n\n\n_______________________________ {_t}')
463
  stn_tst = text_mapper.get_text(_t, hps)
464
  with torch.no_grad():
 
477
 
478
  x /= np.abs(x).max() + 1e-7
479
 
480
+ # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
 
 
481
 
482
  x = audresample.resample(signal=x.astype(np.float32),
483
  original_rate=16000,
484
  target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
 
 
 
485
  return x
486
 
487
 
tts.py CHANGED
@@ -42,6 +42,14 @@ def command_line_args():
42
  default='sample.txt',
43
  type=str,
44
  )
 
 
 
 
 
 
 
 
45
  parser.add_argument(
46
  '--native',
47
  help="""
@@ -83,21 +91,24 @@ def command_line_args():
83
 
84
  def send_to_server(args):
85
  url = "http://192.168.88.209:5000"
 
 
86
 
87
  payload = {
88
  'affective': args.affective,
89
  'voice': args.voice,
 
90
  'native': args.native,
91
  'text': args.text,
92
  'image': args.image,
93
  'video': args.video,
94
  'speed': args.speed,
 
95
  # 'out_file': args.out_file # let serve save as temp
96
  }
97
 
98
- # In data= we can write args
99
-
100
- # In files= sent actual files if provided
101
  text_file = open(args.text, 'rb')
102
 
103
  image_file, video_file, native_file = None, None, None
@@ -107,7 +118,6 @@ def send_to_server(args):
107
  image_file = open(args.image, 'rb')
108
  except FileNotFoundError:
109
  pass
110
-
111
 
112
  if args.video is not None:
113
  print('\nLOADING vid\n')
@@ -122,14 +132,10 @@ def send_to_server(args):
122
  native_file = open(args.native, 'rb')
123
  except FileNotFoundError:
124
  pass
125
-
126
-
127
-
128
- # --------------------- send this extra
129
 
130
- # print('Sending...\n')
131
 
132
- response = requests.post(url, data=payload,
133
  files=[(args.text, text_file),
134
  (args.image, image_file),
135
  (args.video, video_file),
 
42
  default='sample.txt',
43
  type=str,
44
  )
45
+ parser.add_argument(
46
+ '--soundscape',
47
+ help='soundscape - MUST BE IN BRACKETS: \"forest\"',
48
+ default='wind fjord',
49
+ nargs='?',
50
+ type=str,
51
+ const=None,
52
+ )
53
  parser.add_argument(
54
  '--native',
55
  help="""
 
91
 
92
  def send_to_server(args):
93
  url = "http://192.168.88.209:5000"
94
+
95
+ # Args
96
 
97
  payload = {
98
  'affective': args.affective,
99
  'voice': args.voice,
100
+ 'soundscape': args.soundscape,
101
  'native': args.native,
102
  'text': args.text,
103
  'image': args.image,
104
  'video': args.video,
105
  'speed': args.speed,
106
+
107
  # 'out_file': args.out_file # let serve save as temp
108
  }
109
 
110
+ # Send Files
111
+
 
112
  text_file = open(args.text, 'rb')
113
 
114
  image_file, video_file, native_file = None, None, None
 
118
  image_file = open(args.image, 'rb')
119
  except FileNotFoundError:
120
  pass
 
121
 
122
  if args.video is not None:
123
  print('\nLOADING vid\n')
 
132
  native_file = open(args.native, 'rb')
133
  except FileNotFoundError:
134
  pass
 
 
 
 
135
 
136
+ #
137
 
138
+ response = requests.post(url, data=payload, # contains str
139
  files=[(args.text, text_file),
140
  (args.image, image_file),
141
  (args.video, video_file),