Spaces:

aswathyraj
/

TTS

Sleeping

App Files Files Community

aswathyraj committed on Jan 31

Commit

edfdd67

•

1 Parent(s): 89693d5

Upload 2 files

Browse files

Files changed (2) hide show

app.py +62 -0
requirements.txt +64 -0

app.py ADDED Viewed

	@@ -0,0 +1,62 @@

+# app.py
+import sys
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from datasets import load_dataset
+import torch
+import soundfile as sf
+import gradio as gr
+import os
+def generate_speech(text, person):
+    # Initialize SpeechT5 components
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    # Process text using the processor
+    inputs = processor(text=text, return_tensors="pt")
+    # Load xvector containing speaker's voice characteristics from a dataset
+    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+    # Set the speaker based on the provided person parameter
+    if person == "male":
+        speaker_index = 5004
+    elif person == "female":
+        speaker_index = 7306
+    else:
+        raise ValueError("Invalid value for 'person'. Use 'male' or 'female'.")
+    # Generate speech using the selected speaker
+    speaker_embeddings = torch.tensor(embeddings_dataset[speaker_index]["xvector"]).unsqueeze(0)
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    # Save the generated speech as a WAV file
+    # sf.write("speech.wav", speech.numpy(), samplerate=16000)
+    # print(f"The speech was generated for {result_person}.")
+    # Create an in-memory buffer to hold the speech data
+    output_file = "output_file.wav"
+    # Write the speech data to the buffer
+    sf.write(output_file, speech.numpy(), samplerate=16000, format='wav', subtype='PCM_16')
+    # Return the in-memory buffer
+    return output_file
+default_text = ""
+demo = gr.Interface(
+    fn=generate_speech,
+    inputs = [
+        gr.Textbox(value=default_text, label="Input text", placeholder="Type something here.."),
+        gr.Radio(choices=['male', 'female'], label="Targert Speaker",value="female"),
+    ],
+    outputs=gr.Audio(label=""),
+    title= "Text to speech"
+)
+if __name__ == "__main__":
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,64 @@

+aiohttp==3.9.2
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+blinker==1.7.0
+certifi==2023.11.17
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+datasets==2.16.1
+dill==0.3.7
+filelock==3.13.1
+frozenlist==1.4.1
+fsspec==2023.10.0
+huggingface-hub==0.20.3
+idna==3.6
+importlib-metadata==7.0.1
+itsdangerous==2.1.2
+Jinja2==3.1.3
+MarkupSafe==2.1.4
+mpmath==1.3.0
+multidict==6.0.4
+multiprocess==0.70.15
+networkx==3.2.1
+numpy==1.26.3
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.18.1
+nvidia-nvjitlink-cu12==12.3.101
+nvidia-nvtx-cu12==12.1.105
+packaging==23.2
+pandas==2.2.0
+pyarrow==15.0.0
+pyarrow-hotfix==0.6
+pycparser==2.21
+python-dateutil==2.8.2
+pytz==2023.4
+PyYAML==6.0.1
+regex==2023.12.25
+requests==2.31.0
+safetensors==0.4.2
+sentencepiece==0.1.99
+six==1.16.0
+soundfile==0.12.1
+sympy==1.12
+tokenizers==0.15.1
+torch==2.1.2
+tqdm==4.66.1
+transformers==4.37.2
+triton==2.1.0
+typing_extensions==4.9.0
+tzdata==2023.4
+urllib3==2.1.0
+Werkzeug==3.0.1
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.17.0