ayush2607 committed on
Commit
0026fd4
·
verified ·
1 Parent(s): e9d11d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -50
app.py CHANGED
@@ -1,64 +1,37 @@
1
  import gradio as gr
 
 
 
2
  import torch
 
3
  import os
4
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
- from datasets import load_dataset, Audio
6
- import numpy as np
7
- from speechbrain.inference import EncoderClassifier
8
 
9
- # Load models and processor
10
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
11
- model = SpeechT5ForTextToSpeech.from_pretrained("ayush2607/speecht5_tts_technical_data")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
- # Load speaker encoder
15
- device = "cuda" if torch.cuda.is_available() else "cpu"
16
- speaker_model = EncoderClassifier.from_hparams(
17
- source="speechbrain/spkrec-xvect-voxceleb",
18
- run_opts={"device": device},
19
- savedir=os.path.join("/tmp", "speechbrain/spkrec-xvect-voxceleb")
20
- )
21
-
22
- # Load a sample from the dataset for speaker embedding
23
- try:
24
- dataset = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
25
- dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
26
- sample = dataset[0]
27
- speaker_embedding = create_speaker_embedding(sample['audio']['array'])
28
- except Exception as e:
29
- print(f"Error loading dataset: {e}")
30
- # Use a random speaker embedding as fallback
31
- speaker_embedding = torch.randn(1, 512)
32
-
33
- def create_speaker_embedding(waveform):
34
- with torch.no_grad():
35
- speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
36
- speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
37
- speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
38
- return speaker_embeddings
39
 
40
  def text_to_speech(text):
41
- # Clean up text
42
- replacements = [
43
- ('$', 'dollar'), ('%', 'percent'), ('&', 'and'), ('*', 'asterick'),
44
- ('+', 'plus'), ('1', 'one'), ('2', 'two'), ('3', 'three'), ('4', 'four'),
45
- ('5', 'five'), ('6', 'six'), ('7', 'seven'), ('8', 'eight'), ('9', 'nine'),
46
- ('0', 'zero'), ('@', 'at'), ('\n', ' '), ('\xa0', ' '), (',', ' '),
47
- ('"', '"'), ('"', '"'),
48
- ]
49
- for src, dst in replacements:
50
- text = text.replace(src, dst)
51
-
52
  inputs = processor(text=text, return_tensors="pt")
53
- speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)
54
- return (16000, speech.numpy())
55
-
 
 
 
 
 
56
  iface = gr.Interface(
57
  fn=text_to_speech,
58
- inputs="text",
59
- outputs="audio",
60
- title="Technical Text-to-Speech",
61
- description="Enter technical text to convert to speech. The model has been fine-tuned on technical data."
62
  )
63
 
 
64
  iface.launch()
 
1
  import gradio as gr
2
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
3
+ from transformers import AutoProcessor, AutoModelForTextToSpectrogram
4
+ from datasets import load_dataset
5
  import torch
6
+ import soundfile as sf
7
  import os
 
 
 
 
8
 
9
+ # Load models and processors
10
+ processor = AutoProcessor.from_pretrained("ayush2607/speecht5_tts_technical_data")
11
+ model = AutoModelForTextToSpectrogram.from_pretrained("ayush2607/speecht5_tts_technical_data")
12
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
13
 
14
+ # Load xvector containing speaker's voice characteristics from a dataset
15
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
16
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def text_to_speech(text):
 
 
 
 
 
 
 
 
 
 
 
19
  inputs = processor(text=text, return_tensors="pt")
20
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
21
+
22
+ output_path = "output.wav"
23
+ sf.write(output_path, speech.numpy(), samplerate=16000)
24
+
25
+ return output_path
26
+
27
+ # Create Gradio interface
28
  iface = gr.Interface(
29
  fn=text_to_speech,
30
+ inputs=gr.Textbox(label="Enter text to convert to speech"),
31
+ outputs=gr.Audio(label="Generated Speech"),
32
+ title="Text-to-Speech Converter",
33
+ description="Convert text to speech using the SpeechT5 model."
34
  )
35
 
36
+ # Launch the app
37
  iface.launch()