"""Gradio demo: Swedish text-to-speech with a fine-tuned SpeechT5 model.

Loads the processor, acoustic model and HiFi-GAN vocoder at import time,
defines ``predict`` (text -> waveform) and launches a Gradio interface.
"""
import re

import gradio as gr
import librosa
import numpy as np
import torch
from num2words import num2words
from transformers import (
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    SpeechT5Processor,
)

checkpoint = "GreenCounsel/speecht5_tts_common_voice_5_sv"
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Maps the three-letter speaker code (the prefix of the UI label, e.g.
# "CLB (female)" -> "CLB") to the .npy file holding its embedding.
speaker_embeddings = {
    "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
}

# Inputs that are empty or longer than this (after stripping) are replaced
# by a spoken error message.
MAX_INPUT_CHARS = 200
TOO_SHORT_OR_LONG_MESSAGE = "Du måste ha minst ett och max 200 tecken."

# Sample rate returned to Gradio together with the waveform.
SAMPLE_RATE = 16000

# Stand-alone runs of digits ("12", but not the "12" inside "a12b").
_NUMBER_RE = re.compile(r"\b\d+\b")

# Swedish letters are transliterated to ASCII and hyphens/curly quotes are
# dropped -- presumably because the processor's vocabulary does not cover
# them (TODO confirm against the tokenizer).
_CHAR_TABLE = str.maketrans(
    {
        "Ä": "ae",
        "Å": "o",
        "Ö": "oe",
        "ä": "ae",
        "å": "o",
        "ö": "oe",
        "ô": "oe",
        "-": "",
        "‘": "",
        "’": "",
        "“": "",
        "”": "",
    }
)


def _spell_out_numbers(text):
    """Replace every stand-alone integer in *text* with Swedish words."""
    # Substituting per match (rather than str.replace on the digit string)
    # keeps "1" from also rewriting the "1" inside "10", and handles
    # leading zeros ("007") as a single token.
    return _NUMBER_RE.sub(
        lambda match: num2words(int(match.group()), lang="sv"),
        text,
    )


def _normalize_text(text):
    """Return *text* prepared for the processor.

    Falls back to a fixed Swedish error message when the stripped input is
    empty or longer than ``MAX_INPUT_CHARS``, spells out integers, then
    applies the character transliteration table.
    """
    text = text or ""
    stripped = text.strip()
    if not stripped or len(stripped) > MAX_INPUT_CHARS:
        text = TOO_SHORT_OR_LONG_MESSAGE
    # Numbers first: the Swedish number words themselves contain letters
    # (e.g. "å") that the transliteration step must then rewrite.
    text = _spell_out_numbers(text)
    return text.translate(_CHAR_TABLE)


def predict(text, speaker):
    """Synthesize speech for *text* using the selected speaker.

    Args:
        text: Text entered by the user. Empty or over-long input is
            replaced by a spoken error message instead of raising.
        speaker: Speaker label from the UI; its first three characters
            are used as the key into ``speaker_embeddings``.

    Returns:
        A ``(sample_rate, samples)`` tuple where ``samples`` is an
        ``int16`` NumPy array, as accepted by ``gr.Audio(type="numpy")``.

    Raises:
        KeyError: If the speaker prefix is not in ``speaker_embeddings``.
    """
    text = _normalize_text(text)

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length to what the model's configuration allows.
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model.config.max_text_positions]

    speaker_embedding = np.load(speaker_embeddings[speaker[:3]])
    # Add a leading dimension before handing the embedding to the model.
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(
        input_ids, speaker_embedding, vocoder=vocoder
    )

    # Scale to 16-bit integers; assumes the vocoder output is a float
    # waveform within [-1, 1] -- TODO confirm.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (SAMPLE_RATE, speech)


title = "SpeechT5 finetuned Swedish, TTS "

# The trailing link originally rendered as an empty, unterminated Markdown
# link ("[]("); it now points at the fine-tuned checkpoint on the Hub.
description = (
    " SpeechT5 text-to-speech model finetuned on the Swedish language from"
    " the Common Voice dataset. Inference runs on a basic CPU (2 vCPU, 16 GB"
    " ram) so please have patience if it takes some time. As a company"
    " founded by a female coder, our resources are extremely limited (female"
    " founders in tech only get approx. 1 % of the venture capital and the"
    " women who receive funding seldom are the ones actually handling the"
    " tech). \n"
    "We are in a very biased sphere where female coders' companies seldom"
    " get the resources which would normally be necessary to do what they"
    " do. The app uses the SpeechT5 model finetuned for swedish by"
    " GreenCounsel, available here: "
    f"[{checkpoint}](https://huggingface.co/{checkpoint}) "
)

article = "\n\n"

examples = [
    ["GreenCounsel grundades i Malmö för sex år sedan.", "CLB (female)"],
    [
        "Med hjälp av maskininlärning kan mycket av juridiken automatiseras samtidigt som juristerna fokuserar på frågor där de ger störst värde.",
        "CLB (female)",
    ],
    [
        "GreenCounsel har byggt en chatbott som kan förstå frågor på många olika språk och ge kvalitetssäkrade svar.",
        "CLB (female)",
    ],
    [
        "Vi har också byggt ett system för att automatisera arbetsflöden för juridiska tjänster via internet.",
        "CLB (female)",
    ],
]

gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        gr.Radio(
            label="Speaker",
            choices=[
                "CLB (female)",
            ],
            value="CLB (female)",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
    article=article,
    examples=examples,
).launch()