Spaces:

kevinwang676
/

Bark-with-Voice-Cloning

Running

App Files Files Community

Bark-with-Voice-Cloning / bark /api.py

kevinwang676

Upload 41 files

79a08d6 about 1 year ago

raw history blame contribute delete

No virus

4.54 kB

	from typing import Dict, Optional, Union

	import numpy as np

	from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic


	def generate_with_settings(text_prompt, semantic_temp=0.6, eos_p=0.2, coarse_temp=0.7, fine_temp=0.5, voice_name=None, output_full=False):
	    """Run text -> semantic -> coarse -> fine -> decode with per-stage settings.

	    Unlike `generate_audio`, every stage's temperature and the semantic
	    stage's `min_eos_p` can be set individually.

	    Args:
	        text_prompt: text passed to `generate_text_semantic`
	        semantic_temp: `temp` used for the semantic stage
	        eos_p: forwarded as `min_eos_p` to the semantic stage
	        coarse_temp: `temp` used for `generate_coarse`
	        fine_temp: `temp` used for `generate_fine`
	        voice_name: forwarded as `history_prompt` to all three stages
	        output_full: also return the intermediate outputs of each stage

	    Returns:
	        The output of `codec_decode` on the fine-stage result. When
	        `output_full` is set, a tuple `(full_generation, decoded)` where
	        `full_generation` is a dict with the keys `semantic_prompt`,
	        `coarse_prompt` and `fine_prompt`.
	    """
	    semantic = generate_text_semantic(
	        text_prompt,
	        history_prompt=voice_name,
	        temp=semantic_temp,
	        min_eos_p=eos_p,
	        use_kv_caching=True,
	    )
	    coarse = generate_coarse(
	        semantic,
	        history_prompt=voice_name,
	        temp=coarse_temp,
	        use_kv_caching=True,
	    )
	    fine = generate_fine(coarse, history_prompt=voice_name, temp=fine_temp)

	    # Decode once; both return shapes carry the same decoded result.
	    decoded = codec_decode(fine)
	    if not output_full:
	        return decoded

	    stages = {
	        'semantic_prompt': semantic,
	        'coarse_prompt': coarse,
	        'fine_prompt': fine,
	    }
	    return stages, decoded


	def text_to_semantic(
	    text: str,
	    history_prompt: Optional[Union[Dict, str]] = None,
	    temp: float = 0.7,
	    silent: bool = False,
	):
	    """Turn input text into a semantic array.

	    Thin wrapper over `generate_text_semantic` that always enables
	    `use_kv_caching`.

	    Args:
	        text: text to be turned into audio
	        history_prompt: history choice for audio cloning
	        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
	        silent: disable progress bar

	    Returns:
	        numpy semantic array to be fed into `semantic_to_waveform`
	    """
	    return generate_text_semantic(
	        text,
	        history_prompt=history_prompt,
	        temp=temp,
	        silent=silent,
	        use_kv_caching=True,
	    )


	def semantic_to_waveform(
	    semantic_tokens: np.ndarray,
	    history_prompt: Optional[Union[Dict, str]] = None,
	    temp: float = 0.7,
	    silent: bool = False,
	    output_full: bool = False,
	):
	    """Produce an audio array from semantic tokens.

	    Runs the coarse stage, then the fine stage, then decodes the result.
	    Only the coarse stage receives `temp` and `silent`; the fine stage is
	    called with a fixed `temp=0.5` and no `silent` argument.

	    Args:
	        semantic_tokens: semantic token output from `text_to_semantic`
	        history_prompt: history choice for audio cloning
	        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
	        silent: disable progress bar
	        output_full: return full generation to be used as a history prompt

	    Returns:
	        numpy audio array at sample frequency 24khz; when `output_full` is
	        set, a tuple `(full_generation, audio)` whose dict holds the keys
	        `semantic_prompt`, `coarse_prompt` and `fine_prompt`.
	    """
	    coarse = generate_coarse(
	        semantic_tokens,
	        history_prompt=history_prompt,
	        temp=temp,
	        silent=silent,
	        use_kv_caching=True,
	    )
	    # Fine-stage temperature is deliberately not tied to `temp`.
	    fine = generate_fine(coarse, history_prompt=history_prompt, temp=0.5)
	    waveform = codec_decode(fine)

	    if not output_full:
	        return waveform

	    prompt_parts = {
	        "semantic_prompt": semantic_tokens,
	        "coarse_prompt": coarse,
	        "fine_prompt": fine,
	    }
	    return prompt_parts, waveform


	def save_as_prompt(filepath, full_generation):
	    """Save a full generation to disk so it can be reused as a history prompt.

	    Validation uses explicit exceptions rather than `assert`, so the checks
	    still run when Python is started with `-O`.

	    Args:
	        filepath: destination path; must end with ".npz"
	        full_generation: dict holding at least the keys `semantic_prompt`,
	            `coarse_prompt` and `fine_prompt`

	    Raises:
	        ValueError: if `filepath` does not end with ".npz" or a required
	            key is missing from `full_generation`
	        TypeError: if `full_generation` is not a dict
	    """
	    if not filepath.endswith(".npz"):
	        # np.savez would otherwise append ".npz" and write to a different
	        # path than the one the caller asked for.
	        raise ValueError(f"filepath must end with '.npz', got {filepath!r}")
	    if not isinstance(full_generation, dict):
	        raise TypeError(
	            f"full_generation must be a dict, got {type(full_generation).__name__}"
	        )
	    required_keys = ("semantic_prompt", "coarse_prompt", "fine_prompt")
	    missing = [key for key in required_keys if key not in full_generation]
	    if missing:
	        raise ValueError(f"full_generation is missing required keys: {missing}")
	    np.savez(filepath, **full_generation)


	def generate_audio(
	    text: str,
	    history_prompt: Optional[Union[Dict, str]] = None,
	    text_temp: float = 0.7,
	    waveform_temp: float = 0.7,
	    silent: bool = False,
	    output_full: bool = False,
	):
	    """Produce an audio array directly from input text.

	    Chains `text_to_semantic` and `semantic_to_waveform`, using the same
	    `history_prompt` and `silent` setting for both steps.

	    Args:
	        text: text to be turned into audio
	        history_prompt: history choice for audio cloning
	        text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
	        waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
	        silent: disable progress bar
	        output_full: return full generation to be used as a history prompt

	    Returns:
	        numpy audio array at sample frequency 24khz; when `output_full` is
	        set, the `(full_generation, audio_arr)` tuple produced by
	        `semantic_to_waveform`.
	    """
	    tokens = text_to_semantic(
	        text,
	        history_prompt=history_prompt,
	        temp=text_temp,
	        silent=silent,
	    )
	    # The waveform step already returns either the bare audio array or the
	    # (full_generation, audio_arr) pair depending on `output_full`.
	    return semantic_to_waveform(
	        tokens,
	        history_prompt=history_prompt,
	        temp=waveform_temp,
	        silent=silent,
	        output_full=output_full,
	    )