Speech-to-Speech-FM / TTS_models.py
HanaeRateau's picture
Adds ollama to requirements and removes parlerTTS from models.
915cfa4
raw
history blame
4.74 kB
from abc import ABC, abstractmethod
import io
import numpy as np
import torch
from transformers import pipeline
from datasets import load_dataset
class TTSModel(ABC):
    """Common base class for the text-to-speech backends in this module.

    Stores the model identifier and selects the torch device once, so every
    concrete backend can reuse ``self.hf_name`` and ``self.device``.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: a
    subclass that does not override ``synthesize`` can no longer be
    instantiated, instead of silently returning ``None`` at synthesis time.
    """

    def __init__(self, model_name):
        """Remember the model identifier and pick the compute device.

        Args:
            model_name: Identifier of the model the backend should load.
        """
        self.hf_name = model_name
        # Use the first CUDA device when one is available, otherwise the CPU.
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

    @abstractmethod
    def synthesize(self, text):
        """Turn ``text`` into speech; must be implemented by each backend.

        Args:
            text: The text to speak.
        """
#####
####################################################
class SpeechT5(TTSModel):
    """SpeechT5 backend driven through a transformers text-to-speech pipeline.

    The speaker voice comes from one x-vector taken from the
    ``Matthijs/cmu-arctic-xvectors`` dataset (``validation`` split).
    """

    def __init__(self, name="microsoft/speecht5_tts", speaker_index=7306):
        """Load the pipeline and the speaker embedding.

        Args:
            name: Model identifier handed to the transformers pipeline.
            speaker_index: Row of the x-vector dataset used as the speaker
                embedding. The default keeps the previously hard-coded row.
        """
        super().__init__(name)
        self.synthesiser = pipeline("text-to-speech", model=self.hf_name, device=self.device)
        self.embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        xvector = self.embeddings_dataset[speaker_index]["xvector"]
        # unsqueeze(0) adds a leading dimension of size 1 to the embedding.
        self.speaker_embedding = torch.tensor(xvector).unsqueeze(0)

    def synthesize(self, text):
        """Synthesize ``text`` and return the samples as an int16 numpy array.

        Args:
            text: The text to speak.

        Returns:
            ``numpy.ndarray`` of dtype ``int16`` built from the pipeline's
            ``"audio"`` output, ready to be played.
        """
        speech = self.synthesiser(text, forward_params={"speaker_embeddings": self.speaker_embedding})
        print("[SpeechT5 - synthesize]", speech)
        # The * 32767 scaling presumes samples in [-1, 1] (TODO confirm against
        # the pipeline output). Clip first so an out-of-range sample saturates
        # instead of wrapping around when cast to int16.
        samples = np.clip(np.asarray(speech["audio"]), -1.0, 1.0)
        return (samples * 32767).astype(np.int16)
####################################################
# PENDING: NOT WORKING FROM HF
# from MeloTTS.melo.api import TTS as meloTTS
# import nltk
# class MeloTTS(TTSModel):
# def __init__(self, name="myshell-ai/MeloTTS-English"):
# super(MeloTTS, self).__init__(name)
# nltk.download('averaged_perceptron_tagger_eng')
# self.synthesiser = meloTTS(language='EN', device=self.device)
# self.speaker_ids = self.synthesiser.hps.data.spk2id
# def synthesize(self, text):
# speech = self.synthesiser.tts_to_file(text, self.speaker_ids['EN-Default'])
# print("[MeloTTS - synthesize]", speech)
# return speech
####################################################
class Bark(TTSModel):
    """Bark backend driven through a transformers text-to-speech pipeline."""

    def __init__(self, name="suno/bark"):
        """Build the text-to-speech pipeline for the model ``name``.

        Args:
            name: Model identifier handed to the transformers pipeline.
        """
        super().__init__(name)
        self.synthesiser = pipeline(
            "text-to-speech",
            model=self.hf_name,
            device=self.device,
        )

    def synthesize(self, text):
        """Run the pipeline on ``text`` and return its output unchanged.

        Args:
            text: The text to speak.

        Returns:
            Whatever the pipeline call returns; no conversion is applied here.
        """
        result = self.synthesiser(text)
        print("[Bark - synthesize]", result)
        return result
####################################################
# pip install git+https://github.com/huggingface/parler-tts.git
# from parler_tts import ParlerTTSForConditionalGeneration
# from transformers import AutoTokenizer
# class ParlerTTS(TTSModel):
# def __init__(self, name="parler-tts/parler-tts-large-v1"):
# super(ParlerTTS, self).__init__(name)
# self.description = "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
# self.model = ParlerTTSForConditionalGeneration.from_pretrained(self.hf_name).to(self.device)
# self.tokenizer = AutoTokenizer.from_pretrained(self.hf_name)
# # self.synthesiser = pipeline("text-to-speech", model=self.model, tokenizer=self.tokenizer, device=self.device)
# def synthesize(self, text):
# input_ids = self.tokenizer(self.description, return_tensors="pt").input_ids.to(self.device)
# prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)
# generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
# speech = generation.cpu().numpy().squeeze()
# print("[ParlerTTS - synthesize]", speech)
# return speech
####################################################
# PENDING: NOT WORKING FROM HF
# pip install coqui-tts
# https://github.com/idiap/coqui-ai-TTS
from TTS.api import TTS
class XTTS(TTSModel):
    """Backend built on the ``TTS`` class imported from ``TTS.api``.

    Note that, despite the class name, the default model identifier is
    ``tts_models/en/ljspeech/glow-tts``.
    """

    def __init__(self, name="tts_models/en/ljspeech/glow-tts"):
        """Instantiate the ``TTS`` engine and move it to the selected device.

        Args:
            name: Model name passed to ``TTS`` as ``model_name``.
        """
        super().__init__(name)
        engine = TTS(model_name=name, progress_bar=False)
        self.synthesiser = engine.to(self.device)

    def synthesize(self, text):
        """Synthesize ``text`` with the engine's ``tts`` method.

        Args:
            text: The text to speak.

        Returns:
            The value returned by ``tts(text=text)``, unmodified.
        """
        waveform = self.synthesiser.tts(text=text)
        print("[XTTS - synthesize]", len(waveform), text)
        return waveform