openpecha
/

speecht5-tts-01

Inference Endpoints

Model card | Files and versions | Community

TenzinGayche committed on Sep 28, 2023

Commit

75700af

•

1 Parent(s): 19ca4e3

Update handler.py

Files changed (1) hide show

handler.py +10 -4

handler.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from typing import  Dict, Any,Union
 import librosa
 import numpy as np
 import torch
 import pyewts
@@ -10,6 +11,7 @@ from num2tib.core import convert2text
 import base64
 import re
 import requests
 converter = pyewts.pyewts()
 def download_file(url, destination):
     response = requests.get(url)
@@ -80,15 +82,19 @@ class EndpointHandler():
         text=cleanup_text(text)
         text=replace_numbers_with_convert(text)
         inputs = self.processor(text=text, return_tensors="pt")
-        # limit input length
         input_ids = inputs["input_ids"]
         input_ids = input_ids[..., :self.model.config.max_text_positions]
         speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
         speaker_embedding = torch.tensor(speaker_embedding)
         speech = self.model.generate_speech(input_ids.to('cuda'), speaker_embedding.to('cuda'), vocoder=self.vocoder.to('cuda'))
         speech = nr.reduce_noise(y=speech.to('cpu'), sr=16000)
         return {
             "sample_rate": 16000,
-            "audio": base64.b64encode(speech.tostring()).decode("utf-8"),
-        }

 from typing import  Dict, Any,Union
 import librosa
+import tempfile
 import numpy as np
 import torch
 import pyewts
 import base64
 import re
 import requests
+import os
 converter = pyewts.pyewts()
 def download_file(url, destination):
     response = requests.get(url)
         text=cleanup_text(text)
         text=replace_numbers_with_convert(text)
         inputs = self.processor(text=text, return_tensors="pt")
         input_ids = inputs["input_ids"]
         input_ids = input_ids[..., :self.model.config.max_text_positions]
         speaker_embedding = np.load(speaker_embeddings['Lhasa(female)'])
         speaker_embedding = torch.tensor(speaker_embedding)
         speech = self.model.generate_speech(input_ids.to('cuda'), speaker_embedding.to('cuda'), vocoder=self.vocoder.to('cuda'))
         speech = nr.reduce_noise(y=speech.to('cpu'), sr=16000)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav_file:
+            temp_wav_path = temp_wav_file.name
+            librosa.output.write_wav(temp_wav_path, speech.numpy(), sr=16000)
+        with open(temp_wav_path, "rb") as wav_file:
+            audio_base64 = base64.b64encode(wav_file.read()).decode("utf-8")
+        os.remove(temp_wav_path)
         return {
             "sample_rate": 16000,
+            "audio_base64": audio_base64,
+        }