import os

# Workaround for TLS certificate errors when the pretrained model archives are
# downloaded: an empty CURL_CA_BUNDLE effectively disables certificate
# verification for requests-based downloads, so it must be set before any of
# the libraries below open a network connection.
os.environ["CURL_CA_BUNDLE"] = ""

import gradio as gr
import torch
import scipy.io.wavfile
from espnet2.bin.tts_inference import Text2Speech


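# One FastSpeech 2 acoustic model per Gronings dialect, each decoded with the
# same pretrained LJSpeech Parallel WaveGAN vocoder.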
gos_text2speech = Text2Speech.from_pretrained(
    model_tag="https://huggingface.co/ahnafsamin/FastSpeech2-gronings/resolve/main/tts_train_fastspeech2_raw_char_tacotron_train.loss.ave.zip",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
)

hoogelandsters_text2speech = Text2Speech.from_pretrained(
    model_tag="https://huggingface.co/ahnafsamin/FastSpeech2-gronings-hoogelandsters/resolve/main/tts_train_fastspeech2_raw_char_tacotron_train.loss.ave.zip",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
)

westerkwartiers_text2speech = Text2Speech.from_pretrained(
    model_tag="https://huggingface.co/ahnafsamin/FastSpeech2-gronings-westerkwartiers/resolve/main/tts_train_fastspeech2_raw_char_tacotron_train.loss.ave.zip",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
)

oldambster_text2speech = Text2Speech.from_pretrained(
    model_tag="https://huggingface.co/ahnafsamin/FastSpeech2-gronings-oldambster/resolve/main/tts_train_fastspeech2_raw_char_tacotron_train.loss.ave.zip",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v3",
)
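
# A possible tidy-up (a sketch, not part of the original app): keeping the four
# models in a dict keyed by the Radio labels would turn the if/elif chain in
# inference() below into a single lookup, with only the westerkwartiers gain
# needing a special case:
#
#   models = {
#       "gronings": gos_text2speech,
#       "gronings hoogelandsters": hoogelandsters_text2speech,
#       "gronings westerkwartiers": westerkwartiers_text2speech,
#       "gronings oldambster": oldambster_text2speech,
#   }
#
#   def inference(text, lang):
#       with torch.no_grad():
#           tts = models[lang]
#           wav = tts(text)["wav"]
#           if lang == "gronings westerkwartiers":
#               wav = wav * 15  # fixed gain, as in the original branch
#           scipy.io.wavfile.write("out.wav", tts.fs, wav.view(-1).cpu().numpy())
#       return "out.wav", "out.wav"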

def inference(text, lang):
    """Synthesize `text` with the model for the chosen dialect and write the
    result to out.wav, which both Gradio outputs serve."""
    with torch.no_grad():
        if lang == "gronings":
            wav = gos_text2speech(text)["wav"]
            scipy.io.wavfile.write("out.wav", gos_text2speech.fs, wav.view(-1).cpu().numpy())
        elif lang == "gronings hoogelandsters":
            wav = hoogelandsters_text2speech(text)["wav"]
            scipy.io.wavfile.write("out.wav", hoogelandsters_text2speech.fs, wav.view(-1).cpu().numpy())
        elif lang == "gronings westerkwartiers":
            wav = westerkwartiers_text2speech(text)["wav"]
            # Boost this model's output with a fixed gain (see the
            # peak-normalization sketch after this function for a less
            # brittle alternative).
            wav = wav * 15
            scipy.io.wavfile.write("out.wav", westerkwartiers_text2speech.fs, wav.view(-1).cpu().numpy())
        elif lang == "gronings oldambster":
            wav = oldambster_text2speech(text)["wav"]
            scipy.io.wavfile.write("out.wav", oldambster_text2speech.fs, wav.view(-1).cpu().numpy())

    return "out.wav", "out.wav"
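
# The fixed * 15 gain above is brittle: too small a factor leaves the audio
# quiet, too large clips it. A sketch of peak normalization (assumes `wav` is
# a torch tensor, as Text2Speech returns) that scales any model's output to a
# consistent level instead:
#
#   def normalize_peak(wav, peak=0.95):
#       return wav * (peak / wav.abs().max())
#
#   wav = normalize_peak(wav)  # in place of `wav = wav * 15`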

title = "GroTTS"
examples = [
    ["Ze gingen mit klas noar waddendiek, over en deur bragel lopen.", "gronings"]
]


# queue() processes synthesis requests one after another instead of in parallel.
gr.Interface(
    fn=inference,
    inputs=[
        gr.Textbox(label="input text", lines=3),
        gr.Radio(choices=["gronings", "gronings hoogelandsters",
                          "gronings westerkwartiers", "gronings oldambster"],
                 value="gronings", label="language"),
    ],
    outputs=[gr.Audio(type="filepath", label="Output"), gr.File()],
    title=title,
    examples=examples,
).queue().launch()