Spaces:
Sleeping
Sleeping
import os | |
import sys | |
sys.path.append('../AudioJourney') | |
import logging | |
import yaml | |
import numpy as np | |
import torch | |
import torchaudio | |
from torchaudio.transforms import SpeedPerturbation | |
import nltk | |
from APIs import WRITE_AUDIO, LOUDNESS_NORM | |
from flask import Flask, request, jsonify | |
from transformers import BarkModel, AutoProcessor | |
# Service settings are read once, at import time, from config.yaml in the
# current working directory.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# One format string shared by the root logger and the file handler below.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'

# Root logger: INFO level with the shared format.
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Also write log records to a file; mode='w' truncates it on every start.
os.makedirs('services_logs', exist_ok=True)
log_filename = 'services_logs/Text-to-Speech.log'
file_handler = logging.FileHandler(log_filename, mode='w')
file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
logging.getLogger('').addHandler(file_handler)

# Speed perturbation transform built for 32 kHz audio with a single factor
# taken from the configuration.
SPEED = float(config['Text-to-Speech']['speed'])
speed_perturb = SpeedPerturbation(32000, [SPEED])

# Load the Bark model once at import time and move it to the GPU if one is
# available, otherwise keep it on the CPU.
logging.info('Loading Bark model ...')
# TODO: fp16?
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = (
    BarkModel.from_pretrained("suno/bark")
    .to(device)
    .to_bettertransformer()  # Flash attention
)
SAMPLE_RATE = model.generation_config.sample_rate

# Sampling temperatures passed to model.generate() for each request.
SEMANTIC_TEMPERATURE = 0.9
COARSE_TEMPERATURE = 0.5
FINE_TEMPERATURE = 0.5

processor = AutoProcessor.from_pretrained("suno/bark")

app = Flask(__name__)
def generate_speech():
    """Synthesize speech with Bark from the JSON body of the current request.

    Fields read from ``request.json``:
        text: Prompt to speak (required). Newlines are replaced with spaces
            and the text is split into sentences by ``nltk.sent_tokenize``;
            each sentence is generated separately.
        speaker_id: Speaker label (required); used only in the log line and
            in the success message.
        speaker_npz: Passed to the processor as ``voice_preset`` (required).
        volume: Forwarded to ``LOUDNESS_NORM``; defaults to -35.
        output_wav: Forwarded to ``WRITE_AUDIO`` as the output name; defaults
            to ``'out.wav'``.

    Returns:
        On success, a JSON response with ``message`` and ``file`` keys.
        If synthesis fails, a ``(JSON response, 500)`` pair whose body is
        ``{'API error': <exception text>}``; the traceback is logged.

    Raises:
        KeyError: If ``text``, ``speaker_id`` or ``speaker_npz`` is missing;
            these lookups happen before the ``try`` block.

    NOTE(review): no ``@app.route`` decorator is visible on this function in
    this file view -- confirm that the route is registered somewhere.
    """
    # Receive the parameters from the POST request
    data = request.json
    text = data['text']
    speaker_id = data['speaker_id']
    speaker_npz = data['speaker_npz']
    volume = float(data.get('volume', -35))
    output_wav = data.get('output_wav', 'out.wav')

    logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')

    try:
        # Generate audio sentence by sentence with the module-level
        # model and processor.
        text = text.replace('\n', ' ').strip()
        sentences = nltk.sent_tokenize(text)
        # 0.1 second of silence, appended after every generated sentence
        silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)

        pieces = []
        for sentence in sentences:
            inputs = processor(sentence, voice_preset=speaker_npz).to(device)
            # NOTE: you must run the line below, otherwise you will see the runtime error
            # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
            inputs['history_prompt']['coarse_prompt'] = (
                inputs['history_prompt']['coarse_prompt']
                .transpose(0, 1)
                .contiguous()
                .transpose(0, 1)
            )

            with torch.inference_mode():
                # TODO: min_eos_p?
                output = model.generate(
                    **inputs,
                    do_sample=True,
                    semantic_temperature=SEMANTIC_TEMPERATURE,
                    coarse_temperature=COARSE_TEMPERATURE,
                    fine_temperature=FINE_TEMPERATURE,
                )

            pieces += [output, silence]

        # Join all sentences (each followed by the short pause) along dim 1.
        result_audio = torch.cat(pieces, dim=1)
        wav_tensor = result_audio.to(dtype=torch.float32).cpu()

        # Resample from the model's sample rate to 32 kHz, the rate the
        # module-level speed_perturb transform was constructed with.
        wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
        # The transform's result is indexed with [0] to take the waveform.
        wav = speed_perturb(wav.float())[0].squeeze(0)
        wav = wav.numpy()

        # 'volumn' is the keyword spelling used at this call site; presumably
        # it matches the LOUDNESS_NORM signature in APIs -- do not "correct" it
        # here without checking that module.
        wav = LOUDNESS_NORM(wav, volumn=volume)
        WRITE_AUDIO(wav, name=output_wav)

        # Return success message and the filename of the generated audio
        return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})

    except Exception as e:
        # Previously this branch re-raised, which made the JSON error response
        # below unreachable. Log the full traceback and return the error to
        # the client as a structured 500 response instead.
        logging.exception('TTS (Bark) generation failed')
        return jsonify({'API error': str(e)}), 500
if __name__ == '__main__':
    # When executed as a script, serve the Flask app on the port configured
    # under Text-to-Speech -> service-port, with debug mode off.
    app.run(port=config['Text-to-Speech']['service-port'], debug=False)