Solo448 committed on
Commit
d8c0142
1 Parent(s): ec45def

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -6
app.py CHANGED
@@ -8,7 +8,7 @@ from speechbrain.inference import EncoderClassifier
8
 
9
  # Load models and processor
10
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
- model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/SpeechT5-fine-tune-en")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
  # Load speaker encoder
@@ -21,7 +21,7 @@ speaker_model = EncoderClassifier.from_hparams(
21
 
22
  # Load a sample from the dataset for speaker embedding
23
  try:
24
- dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
25
  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
26
  sample = dataset[0]
27
  speaker_embedding = create_speaker_embedding(sample['audio']['array'])
@@ -40,8 +40,72 @@ def create_speaker_embedding(waveform):
40
  def text_to_speech(text):
41
  # Clean up text
42
  replacements = [
43
- ('0', 'zero'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
44
- ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  ]
46
  for src, dst in replacements:
47
  text = text.replace(src, dst)
@@ -54,8 +118,8 @@ iface = gr.Interface(
54
  fn=text_to_speech,
55
  inputs="text",
56
  outputs="audio",
57
- title="Technical Text-to-Speech",
58
- description="Enter technical text to convert to speech. The model has been fine-tuned on technical data."
59
  )
60
 
61
  iface.launch()
 
8
 
9
  # Load models and processor
10
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
+ model = SpeechT5ForTextToSpeech.from_pretrained("Solo448/Speect5-common-voice-Hindi")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
  # Load speaker encoder
 
21
 
22
  # Load a sample from the dataset for speaker embedding
23
  try:
24
+ dataset = load_dataset("mozilla-foundation/common_voice_17_0", "hi", split="validated", trust_remote_code=True)
25
  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
26
  sample = dataset[0]
27
  speaker_embedding = create_speaker_embedding(sample['audio']['array'])
 
40
  def text_to_speech(text):
41
  # Clean up text
42
  replacements = [
43
+ ("अ", "a"),
44
+ ("आ", "aa"),
45
+ ("इ", "i"),
46
+ ("ई", "ee"),
47
+ ("उ", "u"),
48
+ ("ऋ", "ri"),
49
+ ("ए", "ae"),
50
+ ("ऐ", "ai"),
51
+ ("ऑ", "au"),
52
+ ("ओ", "o"),
53
+ ("औ", "au"),
54
+ ("क", "k"),
55
+ ("ख", "kh"),
56
+ ("ग", "g"),
57
+ ("घ", "gh"),
58
+ ("च", "ch"),
59
+ ("छ", "chh"),
60
+ ("ज", "j"),
61
+ ("झ", "jh"),
62
+ ("ञ", "gna"),
63
+ ("ट", "t"),
64
+ ("ठ", "th"),
65
+ ("ड", "d"),
66
+ ("ढ", "dh"),
67
+ ("ण", "nr"),
68
+ ("त", "t"),
69
+ ("थ", "th"),
70
+ ("द", "d"),
71
+ ("ध", "dh"),
72
+ ("न", "n"),
73
+ ("प", "p"),
74
+ ("फ", "ph"),
75
+ ("ब", "b"),
76
+ ("भ", "bh"),
77
+ ("म", "m"),
78
+ ("य", "ya"),
79
+ ("र", "r"),
80
+ ("ल", "l"),
81
+ ("व", "w"),
82
+ ("श", "sha"),
83
+ ("ष", "sh"),
84
+ ("स", "s"),
85
+ ("ह", "ha"),
86
+ ("़", "ng"),
87
+ ("्", ""),
88
+ ("ऽ", ""),
89
+ ("ा", "a"),
90
+ ("ि", "i"),
91
+ ("ी", "ee"),
92
+ ("ु", "u"),
93
+ ("ॅ", "n"),
94
+ ("े", "e"),
95
+ ("ै", "oi"),
96
+ ("ो", "o"),
97
+ ("ौ", "ou"),
98
+ ("ॅ", "n"),
99
+ ("ॉ", "r"),
100
+ ("ू", "uh"),
101
+ ("ृ", "ri"),
102
+ ("ं", "n"),
103
+ ("क़", "q"),
104
+ ("ज़", "z"),
105
+ ("ड़", "r"),
106
+ ("ढ़", "rh"),
107
+ ("फ़", "f"),
108
+ ("|", ".")
109
  ]
110
  for src, dst in replacements:
111
  text = text.replace(src, dst)
 
118
  fn=text_to_speech,
119
  inputs="text",
120
  outputs="audio",
121
+ title="Hindi Text-to-Speech",
122
+ description="Enter hindi text to convert to speech"
123
  )
124
 
125
  iface.launch()