dkounadis
/

artificial-styletts2

audio-generation

Model card Files Files and versions Community

artificial-styletts2 / live_api.py

Dionyssos's picture

debug long sounds

3ac9f34 17 days ago

history blame contribute delete

3.64 kB


	# -*- coding: utf-8 -*-
	import numpy as np
	import soundfile
	import audresample
	import text_utils

	import re
	import subprocess
	import markdown
	import json
	from pathlib import Path
	from types import SimpleNamespace
	from flask import Flask, request, send_from_directory
	from flask_cors import CORS
	from audiocraft.builders import AudioGen #, audio_write
	# How many copies of the prompt are handed to the generator per request.
	# tts_multi_sentence flattens all resulting clips into one waveform, so any
	# value above 1 concatenates independent generations back to back.
	NUM_SOUND_GENERATIONS = 1 # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
	# AudioGen model instance, built once when this module is imported and then
	# shared by every request. It is placed on CUDA device 0 and put in eval mode.
	# NOTE(review): `duration` is presumably the clip length in seconds - confirm
	# against audiocraft.builders.AudioGen.
	sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()


	# ====STYLE VECTOR====



	# AFFECTIVE = True
	# VOICE = 'en_UK/apope_low' # en_US/m-ailabs_low#mary_ann

	# _dir = '/' if AFFECTIVE else '_v2/'
	# precomputed_style_vector = msinference.compute_style(
	# 'assets/wavs/style_vector' + _dir + VOICE.replace(
	# '/', '_').replace(
	# '#', '_').replace(
	# 'cmu-arctic', 'cmu_arctic').replace(
	# '_low', '') + '.wav')
	# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)


	# ==== STYLE VECTOR

	# Directory (relative to the process working directory) that holds the WAV
	# file written for each request before it is sent back to the client.
	CACHE_DIR = 'flask_cache/'
	# Created at import time; an already existing directory is not an error.
	Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)




	def tts_multi_sentence(scene=None):
	    """Synthesize a short sound effect for a text prompt with AudioGen.

	    Despite its name, this function currently only produces the AudioGen
	    sound; it performs no speech synthesis.

	    Args:
	        scene: Text prompt describing the sound. ``None``, or a prompt with
	            fewer than 4 characters once surrounding whitespace is ignored,
	            is dropped.

	    Returns:
	        A 1-D numpy array. For an accepted prompt this is the generated
	        audio, peak-normalized and resampled from 16 kHz to 24 kHz. For a
	        dropped prompt it is 400 zero samples.
	    """
	    # Whitespace must not count towards the minimum length; otherwise a
	    # blank or padded prompt would be sent to the model.
	    if scene is None or len(scene.strip()) < 4:
	        print(scene, '\nDrop\n')
	        return np.zeros(400)

	    print(f'Processing: {scene} ..')
	    # One generation per prompt copy; all of them are flattened into a
	    # single row so the result is one continuous waveform.
	    x = sound_generator.generate(
	        [scene] * NUM_SOUND_GENERATIONS
	    ).reshape(1, -1).detach().cpu().numpy()

	    # Peak-normalize; the epsilon avoids a division by zero on silent output.
	    x /= np.abs(x).max() + 1e-7

	    # AudioGen output is 16 kHz; resample to 24 kHz and drop the channel axis.
	    x = audresample.resample(x,
	                             original_rate=16000,
	                             target_rate=24000)[0, :]

	    print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
	    return x






	# Flask application object; the route decorators further down register
	# their view functions on it.
	app = Flask(__name__)
	# Wrap the whole app with flask_cors, passing no options (library defaults).
	cors = CORS(app)


	@app.route("/")
	def index():
	    """Serve the project's README.md rendered as HTML.

	    The file is looked up relative to the process working directory.

	    Returns:
	        The HTML string produced by ``markdown.markdown`` from the README.
	    """
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's default locale encoding.
	    with open('README.md', 'r', encoding='utf-8') as f:
	        return markdown.markdown(f.read())


	@app.route("/", methods=['GET', 'POST', 'PUT'])
	def serve_wav():
	    """Generate audio for the submitted ``scene`` prompt and return it as WAV.

	    Form fields read from the request:
	        scene: Prompt forwarded to ``tts_multi_sentence``. When the field is
	            absent, ``None`` is forwarded, which that function answers with
	            a short block of silence.
	        text: Parsed but not used by this handler.

	    Returns:
	        The Flask response serving ``tmp.wav`` from ``CACHE_DIR``, with an
	        extra ``suffix-file-type`` header carrying the file name.

	    NOTE(review): ``index`` is also registered for GET on "/", so GET
	    requests are presumably answered by that view instead - confirm.
	    """
	    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-object-into-a-representation-suitable-for-mongodb
	    # flat=False keeps every value as a list, hence the [0] below.
	    r = request.form.to_dict(flat=False)

	    scene_values = r.get('scene')
	    args = SimpleNamespace(
	        text=r.get('text'),  # string not file?
	        # Indexing a missing 'scene' used to raise TypeError (HTTP 500).
	        scene=scene_values[0] if scene_values else None,
	    )

	    x = tts_multi_sentence(args.scene)

	    # NOTE(review): every request writes the same file name, so overlapping
	    # requests can overwrite each other's output.
	    OUT_FILE = 'tmp.wav'
	    # tts_multi_sentence resamples its audio to 24 kHz, so the WAV header
	    # has to declare 24000 Hz (16000 made players run it at 2/3 speed).
	    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

	    # send server's output as default file -> srv_result.xx
	    print(f'\n=SERVER saved as {OUT_FILE=}\n')
	    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
	    response.headers['suffix-file-type'] = OUT_FILE
	    return response


	if __name__ == "__main__":
	    # Start Flask's built-in server only when the file is run as a script;
	    # host 0.0.0.0 makes it listen on all network interfaces.
	    app.run(host="0.0.0.0")