Spaces:

GreenCounsel
/

SpeechT5-sv

Sleeping

App Files Files Community

SpeechT5-sv / app.py

CEHB

Update app.py

02e1a35 about 1 year ago

raw

history blame

No virus

4.33 kB

	import gradio as gr
	import librosa
	import numpy as np
	import torch
	import re
	from num2words import num2words

	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

	# Hub id of the fine-tuned Swedish text-to-speech weights loaded into `model`.
	checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"
	# The processor and the vocoder are loaded from the stock Microsoft
	# checkpoints; only the acoustic model comes from the fine-tuned `checkpoint`.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


	# Speaker label (as offered in the UI radio button) -> path of the ``.npy``
	# file holding that speaker's embedding; `predict` loads it with `np.load`.
	speaker_embeddings = {
	    "Female": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
	    "Male": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
	    "Experimental": "spkemb/embeddings.npy",
	}


	# Standalone runs of digits that should be spelled out as Swedish words.
	_NUMBER_RE = re.compile(r'\b\d+\b')

	# Characters rewritten (or dropped) before tokenisation. Applied in a single
	# pass with `str.translate`; none of the outputs contains a mapped character,
	# so the result equals applying the replacements one after another.
	_CHAR_TABLE = str.maketrans({
	    'Ä': 'ae',
	    'Å': 'o',
	    'Ö': 'oe',
	    'ä': 'ae',
	    'å': 'o',
	    'ö': 'oe',
	    'ô': 'oe',
	    '-': '',
	    '‘': '',
	    '’': '',
	    '“': '',
	    '”': '',
	})


	def _normalize_text(text):
	    """Spell out numbers in Swedish, then transliterate/drop special characters."""
	    # Substitute each match where it stands. Replacing by value with
	    # `str.replace` would also rewrite digits inside longer numbers
	    # ("1 och 12" -> "ett och ett2") and mishandle leading zeros.
	    text = _NUMBER_RE.sub(
	        lambda match: num2words(int(match.group()), lang="sv"),
	        text,
	    )
	    # Numbers are expanded first so that å/ä/ö in the generated words are
	    # transliterated as well.
	    return text.translate(_CHAR_TABLE)


	def predict(text, speaker):
	    """Synthesize Swedish speech for ``text`` with the chosen speaker.

	    Args:
	        text: The text to speak. If it is empty (after stripping whitespace)
	            or longer than 200 characters, a fixed Swedish message stating
	            the length limits is spoken instead.
	        speaker: A key of ``speaker_embeddings`` selecting the embedding file.

	    Returns:
	        A ``(16000, samples)`` tuple where ``samples`` is an ``int16`` NumPy
	        array; this is the (sample rate, data) pair handed to the
	        ``gr.Audio(type="numpy")`` output.

	    Raises:
	        KeyError: If ``speaker`` is not a key of ``speaker_embeddings``.
	    """
	    stripped = text.strip()
	    if not stripped or len(stripped) > 200:
	        text = "Du måste ha minst ett och max 200 tecken."

	    inputs = processor(text=_normalize_text(text), return_tensors="pt")

	    # Limit the input length to what the model's text positions allow.
	    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

	    # Add a leading batch dimension to the stored embedding.
	    speaker_embedding = torch.tensor(
	        np.load(speaker_embeddings[speaker])
	    ).unsqueeze(0)

	    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

	    # Clip before scaling so that samples outside [-1, 1] saturate instead of
	    # wrapping around in the int16 cast.
	    samples = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
	    return (16000, samples)


	# Page heading; passed to gr.Interface as `title`.
	title = "SpeechT5 finetuned Swedish, TTS "

	# Introductory text shown with the demo; passed to gr.Interface as
	# `description`. It ends with a Markdown-style link to the model.
	description = """
	SpeechT5 text-to-speech model finetuned on the Swedish language from the
	Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB ram) so
	please have patience if it takes some time. As a company founded by a female
	coder, our resources are extremely limited (female founders in tech only get approx.
	1 % of the venture capital and the women who receive funding seldom are the
	ones actually handling the tech). We are in a very biased sphere where
	female coders' companies seldom get the resources which would normally
	be necessary to do what they do. The app uses the SpeechT5 model
	finetuned for swedish by GreenCounsel, available here: [https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv](https://huggingface.co/GreenCounsel/speecht5_tts_common_voice_5_sv).
	"""

	# HTML footer with references and the BibTeX entry for the SpeechT5 paper;
	# passed to gr.Interface as `article`. The link separators are plain "|"
	# characters: a backslash before them is an invalid escape sequence in a
	# normal string literal and would show up as a stray backslash on the page.
	article = """
	<div style='margin:20px auto;'>
	<p>References: <a href="https://arxiv.org/abs/2110.07205">SpeechT5 paper</a> |
	<a href="https://github.com/microsoft/SpeechT5/">original SpeechT5</a> |
	<a href="https://huggingface.co/mechanicalsea/speecht5-tts">original weights</a></p>
	<pre>
	@article{Ao2021SpeechT5,
	title = {SpeechT5: Unified-Modal Encoder-Decoder Pre-training for Spoken Language Processing},
	author = {Junyi Ao and Rui Wang and Long Zhou and Chengyi Wang and Shuo Ren and Yu Wu and Shujie Liu and Tom Ko and Qing Li and Yu Zhang and Zhihua Wei and Yao Qian and Jinyu Li and Furu Wei},
	eprint={2110.07205},
	archivePrefix={arXiv},
	primaryClass={eess.AS},
	year={2021}
	}
	</pre>
	</div>
	"""


	# Sample rows for the demo, each a [text, speaker] pair in the same order as
	# the inputs of `predict`; passed to gr.Interface as `examples`.
	examples = [
	    [
	        "GreenCounsel grundades i Malmö för sex år sedan.",
	        "Female",
	    ],
	    [
	        "Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.",
	        "Male",
	    ],
	    [
	        "GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.",
	        "Female",
	    ],
	    [
	        "Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.",
	        "Male",
	    ],
	    [
	        "Talsyntesen bygger på en engelsk modell och kan därför upplevas som att jag bryter lite på engelska.",
	        "Female",
	    ],
	]

	# Build the UI components first (text box, speaker selector, audio player),
	# in the same order the inline version constructed them, then wire them to
	# `predict` and start the app.
	_text_input = gr.Text(label="Input Text")
	_speaker_input = gr.Radio(
	    label="Speaker",
	    choices=["Female", "Male", "Experimental"],
	    value="Female",
	)
	_audio_output = gr.Audio(label="Generated Speech", type="numpy")

	gr.Interface(
	    fn=predict,
	    inputs=[_text_input, _speaker_input],
	    outputs=[_audio_output],
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	).launch()