Spaces:
Running
Running
"""
Copyright 2023 Balacoon

contains implementation
for voice conversion request
"""
import os | |
import asyncio | |
import base64 | |
import hashlib | |
import json | |
import ssl | |
import time | |
from typing import Tuple | |
import numpy as np | |
import resampy | |
import websockets | |
def prepare_audio(audio: Tuple[int, np.ndarray]) -> np.ndarray:
    """
    ensures that audio is in int16 format, 16khz mono

    :param audio: pair of (sampling rate, samples). Samples may be 1-D, or
        2-D with two channels (or a single channel) on either axis.
        int32 and floating point samples are rescaled to int16;
        other dtypes are not rescaled.
    :return: 1-D array resampled to 16 kHz, or None if audio is missing
        or cannot be reduced to a single channel
    """
    if audio is None:
        return None
    sr, wav = audio
    wav = np.asarray(wav)
    lo, hi = np.iinfo(np.int16).min, np.iinfo(np.int16).max

    # ensure proper type
    if wav.dtype == np.int32:
        # widen before abs(): abs(int32 min) overflows within int32
        peak = np.max(np.abs(wav.astype(np.int64))) if wav.size else 0
        # only rescale when samples actually use the int32 range
        mult = (32767.0 / 2**31) if peak > 32768 else 1.0
        wav = np.clip(wav.astype(np.float64) * mult, lo, hi).astype(np.int16)
    elif np.issubdtype(wav.dtype, np.floating):
        peak = np.max(np.abs(wav)) if wav.size else 0.0
        # floats within [-1, 1] are treated as normalized audio
        mult = 32767.0 if peak <= 1.0 else 1.0
        # clip so that out-of-range samples saturate instead of wrapping
        wav = np.clip(wav.astype(np.float64) * mult, lo, hi).astype(np.int16)

    if wav.ndim == 2:
        if wav.shape[0] == 2:
            # channels first: average channels
            wav = np.mean(wav, axis=0)
        elif wav.shape[1] == 2:
            # channels last: average channels
            wav = np.mean(wav, axis=1)
        elif 1 in wav.shape:
            # single channel stored as 2-D
            wav = wav.reshape(-1)
        if wav.ndim == 1 and np.issubdtype(wav.dtype, np.floating):
            # np.mean returns floats, go back to int16 samples
            wav = np.clip(np.rint(wav), lo, hi).astype(np.int16)
    if wav.ndim != 1:
        return None

    # ensure proper sampling rate (nothing to resample in empty audio)
    if sr != 16000 and wav.size:
        wav = (wav / 32768.0).astype(np.float64)
        wav = resampy.resample(wav, sr, 16000)
        # resampling can overshoot [-1, 1), saturate instead of wrapping
        wav = np.clip(wav * 32768.0, lo, hi).astype(np.int16)
    return wav
def create_signature() -> str:
    """
    builds the signature required to authenticate the request:
    hex SHA-256 digest of the api secret (read from the "api_secret"
    environment variable) immediately followed by the current time bucket
    """
    # time.time() is in seconds, so the bucket value changes every 1000 seconds
    time_bucket = int(time.time() / 1000)
    secret = os.environ["api_secret"]
    hasher = hashlib.sha256()
    hasher.update(f"{secret}{time_bucket}".encode())
    return hasher.hexdigest()
async def async_service_request(source: np.ndarray, target: np.ndarray) -> np.ndarray:
    """
    sends a single voice conversion request over a websocket
    and collects the reply

    The endpoint is read from the "endpoint" environment variable and
    the key from "api_key". The request is a JSON object carrying the raw
    bytes of both arrays (base64 encoded), the api key and a signature.

    :param source: samples sent under the "source" key
    :param target: samples sent under the "target" key
    :return: all received binary frames interpreted as int16 and
        concatenated, or None if the connection closed without any data
    :raises TypeError: if the service replies with a text frame
        instead of binary audio
    """
    ssl_context = ssl.create_default_context()
    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=ssl_context
    ) as websocket:
        request_dict = {
            "source": base64.b64encode(source.tobytes()).decode("utf-8"),
            "target": base64.b64encode(target.tobytes()).decode("utf-8"),
            "api_key": os.environ["api_key"],
            "signature": create_signature(),
        }
        await websocket.send(json.dumps(request_dict))

        # read reply: the service streams frames until it closes the connection
        chunks = []
        while True:
            try:
                data = await websocket.recv()
            except websockets.exceptions.ConnectionClosed:
                break
            if isinstance(data, str):
                # text frames can't be decoded as samples; np.frombuffer would
                # fail with an opaque TypeError, so report what was received
                raise TypeError(
                    f"expected binary audio frame, got text message: {data[:200]!r}"
                )
            chunks.append(np.frombuffer(data, dtype="int16"))
    return np.concatenate(chunks) if chunks else None
def vc_service_request(
    source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray]
) -> Tuple[int, np.ndarray]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    :param source_audio: pair of (sampling rate, samples) for the source
    :param target_audio: pair of (sampling rate, samples) for the target
    :return: pair of (16000, converted samples), or None if an input is
        missing, empty, can't be prepared, is too long (source of 60 seconds
        or more, target of 30 seconds or more), or the service sent no audio
    """
    if source_audio is None or target_audio is None:
        return None
    src = prepare_audio(source_audio)
    tgt = prepare_audio(target_audio)
    # compare with None explicitly: truth value of a multi-element
    # numpy array is ambiguous and raises ValueError
    if src is None or tgt is None:
        return None
    if len(src) == 0 or len(tgt) == 0:
        return None
    if len(src) >= 60 * 16000 or len(tgt) >= 30 * 16000:
        # input is way too long, dont return anything
        return None
    res = asyncio.run(async_service_request(src, tgt))
    if res is None:
        # service closed the connection without sending any audio
        return None
    return 16000, res