Spaces:

Pendrokar
/

xVASynth

Running on CPU Upgrade

File size: 4,633 Bytes

import html
import json
import os
import sys

import requests
from huggingface_hub import HfApi

# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver

from gr_client import BlocksDemo

# Voice model locations.
# Only the commit hash of each Hub repository is queried here; the paths built
# below point into the local Hugging Face cache directory and nothing is
# downloaded by this code itself.
hf_model_name = "Pendrokar/xvapitch_nvidia"
model_repo = HfApi()

# xVAPitch voices: the first entry returned by list_repo_commits() names the
# snapshot directory (presumably the newest commit - confirm against the
# huggingface_hub documentation).
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
latest_commit_sha = commits[0].commit_id
hf_cache_models_path = (
	'/home/user/.cache/huggingface/hub/'
	'models--Pendrokar--xvapitch_nvidia/snapshots/{}/'
).format(latest_commit_sha)
# Default directory used for every voice that has no dedicated repository.
models_path = hf_cache_models_path

# Lojban voice: same lookup against its own repository. `commits` and
# `latest_commit_sha` are reused and end up describing this repository.
commits = model_repo.list_repo_commits(repo_id='Pendrokar/xvasynth_lojban')
latest_commit_sha = commits[0].commit_id
hf_cache_lojban_models_path = (
	'/home/user/.cache/huggingface/hub/'
	'models--Pendrokar--xvasynth_lojban/snapshots/{}/'
).format(latest_commit_sha)

# Module-level state consulted by load_model() and LocalBlocksDemo.predict():
# the voice compared against the requested one, and the speaker embedding
# load_model() falls back to.
current_voice_model, base_speaker_emb = None, ''

def load_model(voice_model_name):
	"""Load a voice model into xVASynth and return its base speaker embedding.

	The voice 'x_selpahi' (Lojban) is looked up under
	`hf_cache_lojban_models_path` and loaded as 'FastPitch1.1'; every other
	voice is looked up under `models_path` and loaded as 'xVAPitch'.

	Returns the value of games[0].base_speaker_emb read from
	'<model path>.json'. If a requests.exceptions.RequestException is raised
	while loading, the failure is printed and the module-level
	`base_speaker_emb` is returned instead. Any other exception propagates.
	"""
	is_lojban = voice_model_name == 'x_selpahi'
	model_dir = hf_cache_lojban_models_path if is_lojban else models_path
	model_path = model_dir + voice_model_name

	request = {
		'outputs': None,
		'version': '3.0',
		'model': model_path,
		'modelType': 'FastPitch1.1' if is_lojban else 'xVAPitch',
		# seems to have no effect if generated text is from a different language
		'base_lang': 'en',
		'pluginsContext': '{}',
	}

	# Value handed back when loading fails with a RequestException.
	speaker_embedding = base_speaker_emb

	print('Loading voice model...')
	try:
		xvaserver.loadModel(request)
		# NOTE(review): there is no `global` statement in this function, so this
		# binds a function-local name; the module-level `current_voice_model`
		# is left untouched.
		current_voice_model = voice_model_name

		# The voice's metadata file sits next to the model, same name + '.json'.
		with open(model_path + '.json', 'r', encoding='utf-8') as metadata_file:
			voice_metadata = json.load(metadata_file)
		speaker_embedding = voice_metadata['games'][0]['base_speaker_emb']
	except requests.exceptions.RequestException as err:
		print(f'FAILED to load voice model: {err}')

	return speaker_embedding


class LocalBlocksDemo(BlocksDemo):
	"""BlocksDemo variant whose predict() calls the imported xVASynth module directly."""

	def predict(
		self,
		input_text,
		voice,
		lang,
		pacing,
		pitch,
		energy,
		anger,
		happy,
		sad,
		surprise,
		use_deepmoji
	):
		"""Synthesize `input_text` with the requested voice and describe the result.

		Parameters:
			input_text: text to speak; only the first 1000 characters are used.
			voice: voice model name; passed to load_model() when it differs
				from the currently tracked voice.
			lang: sent to the synthesizer as 'base_lang'.
			pacing: sent as 'pace'; a falsy value is replaced by 1.0.
			pitch, energy: accepted but not read by this method.
			anger, happy, sad, surprise: emotion values; anything not above 0
				is sent as 0 in the 'mantella_settings' plugin context.
			use_deepmoji: sent as 'run_model' in the same plugin context.

		Returns:
			A list: [path of the written audio file ('' on failure), HTML
			listing the phoneme symbols, rounded angry/happy/sad/surprise
			values from the response, the raw response dict].
		"""
		# Both names are rebound below. Without this declaration they would be
		# function-locals: the module-level voice would never change (forcing a
		# reload on every call) and skipping the load would leave
		# `base_speaker_emb` unbound.
		global current_voice_model, base_speaker_emb

		# grab only the first 1000 characters
		input_text = input_text[:1000]

		# load voice model if not the current model
		if current_voice_model != voice:
			base_speaker_emb = load_model(voice)
			current_voice_model = voice

		# NOTE(review): load_model() loads the voice 'x_selpahi' as
		# 'FastPitch1.1', yet synthesis is always requested as 'xVAPitch' -
		# confirm this is what xvaserver.synthesize expects.
		model_type = 'xVAPitch'
		pace = pacing if pacing else 1.0
		save_path = '/tmp/xvapitch_audio_sample.wav'
		language = lang

		pluginsContext = {}
		pluginsContext["mantella_settings"] = {
			"emAngry": (anger if anger > 0 else 0),
			"emHappy": (happy if happy > 0 else 0),
			"emSad": (sad if sad > 0 else 0),
			"emSurprise": (surprise if surprise > 0 else 0),
			"run_model": use_deepmoji
		}

		data = {
			'pluginsContext': json.dumps(pluginsContext),
			'modelType': model_type,
			# pad with whitespaces as a workaround to avoid cutoffs
			'sequence': input_text.center(len(input_text) + 2, ' '),
			'pace': pace,
			'outfile': save_path,
			'vocoder': 'n/a',
			'base_lang': language,
			'base_emb': base_speaker_emb,
			'useSR': 0,
			'useCleanup': 0,
		}

		print('Synthesizing...')
		try:
			json_data = xvaserver.synthesize(data)
		except requests.exceptions.RequestException as err:
			print(f'FAILED to synthesize: {err}')
			save_path = ''
			# Placeholder shaped like what the code below reads from a response:
			# 'arpabet' is a '|'-separated string and every 'em_*' entry is
			# indexable, so the rendering and the return statement still work.
			json_data = {
				'arpabet': 'Failed',
				'durations': [0],
				'em_angry': [anger],
				'em_happy': [happy],
				'em_sad': [sad],
				'em_surprise': [surprise],
			}

		arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
		arpabet_symbols = json_data['arpabet'].split('|')
		utter_time = 0
		# zip() stops at the shorter sequence, so a symbol/duration count
		# mismatch can no longer raise IndexError.
		for symbol, duration in zip(arpabet_symbols, json_data['durations']):
			# skip PAD symbol
			if symbol == '<PAD>':
				continue

			length = float(duration)
			# Horizontal padding (in em) is half the phoneme duration.
			arpa_length = str(round(length/2, 1))
			# The symbol is escaped so characters such as '<' or '&' cannot
			# break the generated markup.
			arpabet_html += '<strong\
				class="arpabet"\
				style="padding: 0 '\
				+ arpa_length\
				+'em"'\
				+f" title=\"{utter_time} + {length}\""\
				+'>'\
				+ html.escape(symbol)\
				+ '</strong> '
			utter_time += round(length, 1)

		return [
			save_path,
			arpabet_html,
			round(json_data['em_angry'][0], 2),
			round(json_data['em_happy'][0], 2),
			round(json_data['em_sad'][0], 2),
			round(json_data['em_surprise'][0], 2),
			json_data
		]

# Script entry point: build the demo and start it when this file is run directly.
if __name__ == "__main__":
	print('running custom Gradio interface')
	# models_path is the module-level directory of the xVAPitch voice models.
	demo = LocalBlocksDemo(models_path)
	# NOTE(review): `block` comes from BlocksDemo in gr_client (not visible
	# here); presumably the Gradio Blocks app - confirm.
	demo.block.launch()