NSect clementruhm commited on
Commit
d673948
0 Parent(s):

Duplicate from balacoon/voice_conversion_service

Browse files

Co-authored-by: Clement Ruhm <clementruhm@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Voice Conversion Service
3
+ emoji: 💬
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: balacoon/voice_conversion_service
11
+ ---
12
+
13
+ Interactive demo for Voice Conversion service by Balacoon.
__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2023 Balacoon
3
+
4
+ Voice Conversion service interactive demo
5
+ """
6
+
7
+ import glob
8
+ import logging
9
+ import os
10
+
11
+ import gradio as gr
12
+
13
+ from vc_service_request import vc_service_request
14
+
15
+ script_dir = os.path.dirname(os.path.abspath(__file__))
16
+
17
+
18
def main():
    """
    builds the gradio interface of the voice conversion demo and launches it.

    The page consists of an introduction, a store badge, two inputs for the
    source audio (microphone or upload), one input for the target voice with
    bundled examples, a "Convert" button and an audio output for the result.
    """
    logging.basicConfig(level=logging.INFO)

    badges = """
    <div style="display: flex">
    <span style="margin-right: 5px">

    [<img src="https://play.google.com/intl/en_us/badges/static/images/badges/en_badge_web_generic.png" width="200" height="77">](https://play.google.com/store/apps/details?id=com.app.vc)

    </span>
    </div>
    """

    intro = """
    <h1 align="center">Balacoon🦝 Voice Conversion</h1>


    Welcome to the live demo of Balacoon's Voice Conversion service.
    Check out our [website](https://balacoon.com/demo/#voice-conversion)
    to learn more.
    Voice Conversion allows you to transform your own voice
    into the voice of another person using just a single sample.
    For optimal results, we recommend using clean audio files in English.

    Here's how it works:

    1. Begin by recording your voice.
    2. Select an audio sample that represents the target voice you want to convert to.
    3. Click the "Convert" button and listen to the result!

    If providing your own audio files, please use WAVE PCM.
    Service works with 16kHz, 16 bit, mono audio.

    If you are interested to plug in Voice Conversion
    service into your own application, don't hesitate to get in touch with us at
    [contact@balacoon.com](mailto:contact@balacoon.com)
    """

    # reference voices shipped next to this script, offered as clickable examples
    reference_wavs = glob.glob(os.path.join(script_dir, "references", "*.wav"))

    with gr.Blocks() as demo:
        gr.Markdown(intro)
        gr.Markdown(badges)

        with gr.Row():
            # left panel: where the source speech comes from
            with gr.Column(variant="panel"):
                mic_input = gr.Audio(source="microphone", label="Record your voice")
                upload_input = gr.Audio(
                    source="upload", label="Or upload audio to convert"
                )

            # right panel: the voice to convert to
            with gr.Column(variant="panel"):
                target_input = gr.Audio(
                    source="upload", label="Select audio with target voice"
                )
                gr.Examples(reference_wavs, inputs=[target_input])

        with gr.Row():
            convert_btn = gr.Button("Convert")
        with gr.Row():
            converted_output = gr.Audio()

        def voice_conversion(src_from_mic_, src_from_file_, tgt_from_file_):
            """
            picks the source audio (microphone recording wins over an uploaded
            file) and forwards it with the target audio to the service.
            Returns nothing when either the source or the target is missing.
            """
            chosen_src = src_from_mic_ or src_from_file_ or None
            if chosen_src and tgt_from_file_:
                return vc_service_request(chosen_src, tgt_from_file_)
            logging.warning("source or target are not provided")
            return None

        convert_btn.click(
            voice_conversion,
            inputs=[mic_input, upload_input, target_input],
            outputs=converted_output,
        )

    # a single worker: conversion requests are processed one at a time
    demo.queue(concurrency_count=1).launch()
106
+
107
+
108
+ if __name__ == "__main__":
109
+ main()
references/cate_blanchett.wav ADDED
Binary file (401 kB). View file
 
references/george_clooney.wav ADDED
Binary file (343 kB). View file
 
references/james_earl_jones.wav ADDED
Binary file (402 kB). View file
 
references/kratos.wav ADDED
Binary file (411 kB). View file
 
references/meryl_streep.wav ADDED
Binary file (398 kB). View file
 
references/mike_rowe.wav ADDED
Binary file (453 kB). View file
 
references/nikole_kidman.wav ADDED
Binary file (439 kB). View file
 
references/sam_elliott.wav ADDED
Binary file (461 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ numpy==1.23.2
2
+ resampy==0.4.2
3
+ websockets==10.3
4
+ gradio_client==0.2.7
setup.cfg ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [flake8]
2
+ max_complexity=10
3
+ per-file-ignores=__init__.py:F401,F403
4
+ ignore = E203,W503
5
+ max-line-length=119
6
+
7
+ [isort]
8
+ profile=black
9
+ line_length=119
10
+
11
+ [mypy]
12
+ ignore_missing_imports = True
vc_service_request.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2023 Balacoon
3
+
4
+ contains implementation
5
+ for voice conversion request
6
+ """
7
+
8
+ import os
9
+ import asyncio
10
+ import base64
11
+ import hashlib
12
+ import json
13
+ import ssl
14
+ import time
15
+ from typing import Tuple
16
+
17
+ import numpy as np
18
+ import resampy
19
+ import websockets
20
+
21
+
22
def _to_int16(samples: np.ndarray) -> np.ndarray:
    """
    casts samples to int16, saturating instead of wrapping around on overflow
    """
    return np.clip(samples, -32768, 32767).astype(np.int16)


def prepare_audio(audio: Tuple[int, np.ndarray]) -> np.ndarray:
    """
    ensures that audio is in int16 format, 16khz mono

    Parameters
    ----------
    audio: Tuple[int, np.ndarray]
        sampling rate and samples. Samples are either 1-D or 2-D with
        exactly two channels along one of the axes.

    Returns
    -------
    wav: np.ndarray
        1-D int16 samples at 16khz, or None if the input
        can't be reduced to a single channel.
    """
    sr, wav = audio

    # ensure proper type
    if wav.dtype == np.int32:
        # abs() in int64, because abs(-2**31) overflows in int32;
        # an empty array has no maximum, treat its peak as 0
        peak = np.max(np.abs(wav.astype(np.int64))) if wav.size else 0
        # full-range int32 is scaled down, int16-range values are kept as is
        mult = (32767.0 / 2**31) if peak > 32768 else 1.0
        wav = _to_int16(wav.astype(np.float32) * mult)
    elif wav.dtype == np.float32 or wav.dtype == np.float64:
        peak = np.max(np.abs(wav)) if wav.size else 0.0
        # normalized floats are scaled up, anything louder is assumed
        # to be in int16 range already
        mult = 32767.0 if peak <= 1.0 else 1.0
        wav = _to_int16(wav * mult)

    if wav.ndim == 2:
        # average channels; `elif` because after the first reduction
        # the array is 1-D and has no second axis to inspect
        if wav.shape[0] == 2:
            wav = np.mean(wav, axis=0, keepdims=False)
        elif wav.shape[1] == 2:
            wav = np.mean(wav, axis=1, keepdims=False)
        if wav.ndim == 1:
            # mean() produces float64, go back to the promised int16
            wav = _to_int16(np.rint(wav))

    if wav.ndim != 1:
        return None

    # ensure proper sampling rate
    if sr != 16000:
        # np.float64 is what the removed `np.float` alias used to mean
        wav = (wav / 32768.0).astype(np.float64)
        wav = resampy.resample(wav, sr, 16000)
        # resampling may overshoot [-1, 1], so saturate on the way back
        wav = _to_int16(wav * 32768.0)
    return wav
52
+
53
+
54
def create_signature() -> str:
    """
    builds the signature required to authenticate the request:
    hex-encoded sha256 of the `api_secret` environment variable
    concatenated with a coarse timestamp.
    """
    # NOTE(review): time.time() is in seconds, so this yields 1000-second
    # buckets - presumably what the service expects, confirm with the server side
    time_bucket = int(time.time() / 1000)
    digest = hashlib.sha256()
    digest.update((os.environ["api_secret"] + str(time_bucket)).encode())
    return digest.hexdigest()
63
+
64
+
65
async def async_service_request(source: np.ndarray, target: np.ndarray) -> np.ndarray:
    """
    sends source and target audio to the voice conversion service
    over a websocket and gathers the audio it streams back.

    The address is read from the `endpoint` environment variable and the
    credentials from `api_key` / `api_secret`. Every received message is
    interpreted as raw int16 samples; reading stops once the connection
    is closed. Returns the concatenated samples, or None if nothing
    was received.
    """
    tls_context = ssl.create_default_context()
    received = []

    async with websockets.connect(
        os.environ["endpoint"], close_timeout=1024, ssl=tls_context
    ) as conn:
        # audio goes as base64 of the raw sample bytes inside a json message
        payload = json.dumps(
            {
                "source": base64.b64encode(source.tobytes()).decode("utf-8"),
                "target": base64.b64encode(target.tobytes()).decode("utf-8"),
                "api_key": os.environ["api_key"],
                "signature": create_signature(),
            }
        )
        await conn.send(payload)

        # read reply until the connection is closed
        while True:
            try:
                message = await conn.recv()
                received.append(np.frombuffer(message, dtype="int16"))
            except websockets.exceptions.ConnectionClosed:
                break
            if message is None:
                break

    if not received:
        return None
    return np.concatenate(received)
92
+
93
+
94
def vc_service_request(
    source_audio: Tuple[int, np.ndarray], target_audio: Tuple[int, np.ndarray]
) -> Tuple[int, np.ndarray]:
    """
    prepares audio (has to be 16khz mono)
    and runs request to a voice conversion service

    Parameters
    ----------
    source_audio: Tuple[int, np.ndarray]
        sampling rate and samples of the speech to convert
    target_audio: Tuple[int, np.ndarray]
        sampling rate and samples of the voice to convert to

    Returns
    -------
    result: Tuple[int, np.ndarray]
        sampling rate (always 16000) and converted int16 samples.
        None is returned instead when an input can't be prepared,
        an input is too long, or the service sent back no audio.
    """
    src = prepare_audio(source_audio)
    tgt = prepare_audio(target_audio)
    if src is None or tgt is None:
        return None
    # limits are in samples at 16khz: 60 seconds of source, 30 seconds of target
    if len(src) >= 60 * 16000 or len(tgt) >= 30 * 16000:
        # input is way too long, dont return anything
        return None

    res = asyncio.run(async_service_request(src, tgt))
    if res is None:
        # connection was closed before any audio arrived; a (rate, None)
        # pair is not valid audio for the caller, so report "no result"
        return None
    return 16000, res