File size: 3,555 Bytes
6e78f43 c4effd2 6e78f43 2d0e2b6 6e78f43 c4effd2 6e78f43 c4effd2 2d0e2b6 6e78f43 2d0e2b6 6e78f43 2d0e2b6 6e78f43 c4effd2 6e78f43 c4effd2 6e78f43 2d0e2b6 6e78f43 c4effd2 2d0e2b6 6e78f43 2d0e2b6 6e78f43 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen #, audio_write
# Load the pretrained AudioGen checkpoint once, at import time, so that every
# call to tts_multi_sentence() reuses the same model object.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
# Fix the generation length for all requests.
# NOTE(review): `duration=4` is presumably in seconds - confirm against audiocraft.
sound_generator.set_generation_params(duration=4)
# ====STYLE VECTOR====
# AFFECTIVE = True
# VOICE = 'en_UK/apope_low' # en_US/m-ailabs_low#mary_ann
# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
# 'assets/wavs/style_vector' + _dir + VOICE.replace(
# '/', '_').replace(
# '#', '_').replace(
# 'cmu-arctic', 'cmu_arctic').replace(
# '_low', '') + '.wav')
# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
# ==== STYLE VECTOR
# Directory (relative to the current working directory) that serve_wav() writes
# the generated wav into and serves it back from.
CACHE_DIR = 'flask_cache/'
# Create the directory up front; an already existing directory is not an error.
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
def tts_multi_sentence(scene=None):
    """Generate a peak-normalised sound clip for a textual scene description.

    Args:
        scene: Text prompt handed to the module-level ``sound_generator``.
            ``None`` or a prompt shorter than 4 characters is dropped.

    Returns:
        A 1-D numpy array. For a dropped prompt this is 400 zeros; otherwise
        it is row 0 of the generated audio (presumably the first channel -
        TODO confirm), scaled so its absolute peak is approximately 1.
    """
    # Guard clause: nothing usable to synthesise, hand back a short silent buffer.
    if scene is None or len(scene) < 4:
        print(scene, '\nDrop\n')
        return np.zeros(400)

    print(f'Processing: {scene} ..')
    # The generator takes a batch of prompts; send a batch of one and keep
    # only its first (and only) item.
    batch = sound_generator.generate([scene])
    x = batch[0].detach().cpu().numpy()[0, :]

    # In-place peak normalisation; the small epsilon avoids dividing by zero
    # when the generated clip is entirely silent.
    peak = np.abs(x).max()
    x /= peak + 1e-7

    print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
    # NOTE: a StyleTTS2 speech path (msinference.inference per sentence, then
    # overlaying the result on this background sound) used to be sketched here
    # as commented-out code; it is currently disabled and only the generated
    # sound is returned.
    return x
# Flask application object; the @app.route decorators below register views on it.
app = Flask(__name__)
# Wrap the app with flask_cors' CORS using its default settings.
# NOTE(review): presumably this allows cross-origin requests on every route -
# confirm against the flask_cors defaults.
cors = CORS(app)
@app.route("/")
def index():
    """Return the contents of ``README.md`` rendered from Markdown to HTML.

    The file is looked up relative to the current working directory and is
    read in text mode with the platform's default encoding.
    """
    readme_source = Path('README.md').read_text()
    return markdown.markdown(readme_source)
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate audio for the submitted ``scene`` and return it as a wav file.

    Reads the ``scene`` form field, passes it to ``tts_multi_sentence``,
    writes the resulting samples to ``CACHE_DIR + 'tmp.wav'`` at 16000 Hz and
    sends that file back. The response carries a ``suffix-file-type`` header
    holding the output file name.

    A request without a ``scene`` field no longer fails with a ``TypeError``
    (HTTP 500): ``None`` is forwarded to ``tts_multi_sentence``, which
    answers with a short silent clip.

    NOTE(review): ``index`` is also registered on "/" for GET; which view
    actually serves a GET request depends on Flask's rule matching - confirm.
    NOTE(review): every request writes the same ``tmp.wav``; concurrent
    requests may overwrite each other's output - confirm whether this matters
    for the deployment.
    """
    # MultiDict.get returns the first submitted value for the key, or None
    # when the field is absent - unlike indexing the result of
    # to_dict(flat=False), which raised on a missing field.
    scene = request.form.get('scene')

    x = tts_multi_sentence(scene)

    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
if __name__ == "__main__":
    # Script entry point: start Flask's built-in server bound to all network
    # interfaces; the port is left at Flask's default.
    app.run(host="0.0.0.0")
|