glide-the committed on
Commit
8953210
•
1 Parent(s): ffe5bd7

Add large files to Git LFS

Browse files
Files changed (6) hide show
  1. Dockerfile +21 -0
  2. README.md +11 -0
  3. requirements.txt +52 -0
  4. setup.py +36 -0
  5. start.py +4 -0
  6. util.py +107 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+
3
+ RUN apt update && apt install -y cmake gcc portaudio19-dev
4
+
5
+ WORKDIR /code
6
+ ENV NUMBA_CACHE_DIR=/tmp/
7
+ COPY ./requirements.txt /code/requirements.txt
8
+
9
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
10
+ COPY . /code/
11
+
12
+ RUN pip install -e .
13
+
14
+ RUN cd /code/vits/monotonic_align && \
15
+ mkdir -p /code/vits/monotonic_align/vits/monotonic_align/ && \
16
+ python setup.py build_ext --inplace && \
17
+ mv /code/vits/monotonic_align/vits/monotonic_align/* /code/vits/monotonic_align/
18
+
19
+ CMD ["python", "-m", "speakers", "--verbose", "--mode", "web"]
20
+
21
+ EXPOSE 7860
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RVC Speakers
3
+ emoji: 📚
4
+ colorFrom: green
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ license: bsd-3-clause
9
+ ---
10
+
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
requirements.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Cython==0.29.21
2
+ # vits_text
3
+ unidecode
4
+ # pre uninstall cmake
5
+ pyopenjtalk
6
+ jamo
7
+ pypinyin
8
+ jieba
9
+ cn2an
10
+ # vits_text
11
+
12
+ nest_asyncio
13
+ gradio==3.33.1
14
+ fairseq
15
+ torch
16
+ torchaudio
17
+ soundfile
18
+ scipy==1.9.3
19
+ librosa==0.9.1
20
+ musicdl
21
+ pyaudio
22
+ torchcrepe==0.0.20
23
+ praat-parselmouth>=0.4.2
24
+ pyworld==0.3.2
25
+ faiss-cpu==1.7.3
26
+ numpy==1.23.5
27
+ nltk
28
+
29
+ edge-tts
30
+ IPython
31
+ tqdm
32
+ pandas
33
+
34
+ ## bark
35
+ transformers
36
+ encodec
37
+ huggingface-hub>=0.14.1
38
+ funcy
39
+
40
+ # config manage
41
+ omegaconf
42
+ pydantic
43
+
44
+ # log
45
+ colorama
46
+
47
+ # server
48
+ fastapi~=0.99.1
49
+ starlette~=0.27.0
50
+ uvicorn~=0.23.1
51
+ requests
52
+ oscrypto
setup.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) 2022, salesforce.com, inc.
3
+ All rights reserved.
4
+ SPDX-License-Identifier: BSD-3-Clause
5
+ For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6
+ """
7
+
8
+ from setuptools import setup, find_namespace_packages
9
+ import platform
10
+
11
# Extra download locations handed to setup(dependency_links=...).
# The PyTorch wheel index is only added when running on Windows.
DEPENDENCY_LINKS = (
    ["https://download.pytorch.org/whl/torch_stable.html"]
    if platform.system() == "Windows"
    else []
)
14
+
15
+
16
def fetch_requirements(filename):
    """Return the requirement specifiers listed in *filename*.

    Each line is stripped of surrounding whitespace. Blank lines and
    full-line ``#`` comments are skipped so that only real requirement
    strings are returned.

    Args:
        filename: Path to a pip-style requirements file.

    Returns:
        A list of requirement strings in file order.
    """
    with open(filename, encoding="utf-8") as f:
        stripped = (ln.strip() for ln in f)
        # Only full-line comments are dropped; inline '#' is left alone
        # because it can legitimately appear inside URL requirements.
        return [ln for ln in stripped if ln and not ln.startswith("#")]
19
+
20
+
21
# Read the README inside a context manager so the file handle is closed
# before setup() runs instead of being left to the garbage collector.
with open("README.md", "r", encoding="utf-8") as readme_file:
    LONG_DESCRIPTION = readme_file.read()

setup(
    name="speakers",
    version="0.0.1",
    author="glide-the",
    description="Ready Voice Controller , generate for End-to-End Text-to-Speech,with Multi-engine integration",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    keywords="Speakers, Multimodal, Ready Voice Controller",
    license="3-Clause BSD",
    # NOTE(review): `include` is documented as an iterable of patterns; a bare
    # string is iterated character by character, so the lone "*" would match
    # every package. If only the speakers packages are intended, this should
    # probably be ["speakers", "speakers.*"] - confirm before changing, since
    # it alters which packages get installed.
    packages=find_namespace_packages(include="speakers.*"),
    install_requires=fetch_requirements("requirements.txt"),
    python_requires=">=3.9.0",
    include_package_data=True,
    dependency_links=DEPENDENCY_LINKS,
    zip_safe=False,
)
start.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from speakers.__main__ import main
2
+
3
+ if __name__ == '__main__':
4
+ main()
util.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import asyncio
3
+ from io import BytesIO
4
+
5
+ from fairseq import checkpoint_utils
6
+
7
+ import torch
8
+
9
+ import edge_tts
10
+ import librosa
11
+
12
+
13
# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py#L43-L55 # noqa
def has_mps() -> bool:
    """Report whether a working ``mps`` torch device can be used.

    Returns:
        ``False`` on any platform other than macOS (``darwin``), when torch
        reports no MPS support, or when allocating a tensor on the ``mps``
        device fails; ``True`` otherwise.
    """
    if sys.platform != "darwin":
        return False

    # Prefer the supported backend API; `torch.has_mps` is deprecated and is
    # only consulted on torch builds that predate `torch.backends.mps`.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None:
        if not mps_backend.is_available():
            return False
    elif not getattr(torch, "has_mps", False):
        return False

    # Support being reported is not enough: actually place a tensor on the
    # device, and treat any failure as "not usable".
    try:
        torch.zeros(1).to(torch.device("mps"))
    except Exception:
        return False
    return True
26
+
27
+
28
def is_half(device: str) -> bool:
    """Decide whether half precision should be used on *device*.

    Args:
        device: A torch device string such as ``"cpu"``, ``"cuda"`` or
            ``"cuda:0"``.

    Returns:
        ``False`` for any non-CUDA device, and for CUDA devices whose name
        contains ``16`` (unless it is a V100), ``P40``, ``1060``, ``1070``
        or ``1080``; ``True`` for every other CUDA device.

    Raises:
        ValueError: If the text after ``:`` is not an integer.
    """
    if not device.startswith('cuda'):
        return False

    # A bare "cuda" has no index part; passing None lets torch report the
    # current device instead of failing on int("cuda").
    _, separator, index_text = device.partition(':')
    device_index = int(index_text) if separator else None
    gpu_name = torch.cuda.get_device_name(device_index).upper()

    # Plain substring checks against the upper-cased device name.
    if '16' in gpu_name and 'V100' not in gpu_name:
        return False
    excluded_markers = ('P40', '1060', '1070', '1080')
    if any(marker in gpu_name for marker in excluded_markers):
        return False

    return True
47
+
48
+
49
def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
    """Load the first model of a fairseq checkpoint and move it to *device*.

    Args:
        device: Torch device string the model is moved to.
        model_path: Checkpoint file passed to fairseq
            (defaults to ``'hubert_base.pt'``).

    Returns:
        The model converted to half precision when ``is_half(device)`` is
        true, otherwise converted to single precision.
    """
    loaded = checkpoint_utils.load_model_ensemble_and_task([model_path])
    # Element 0 of the result holds the models; take the first one.
    hubert = loaded[0][0].to(device)
    return hubert.half() if is_half(device) else hubert.float()
58
+
59
+
60
async def _synthesize_to_wav(tts_com):
    """Stream audio from an edge-tts communicator and decode it with ffmpeg.

    Args:
        tts_com: An ``edge_tts.Communicate`` instance ready to be streamed.

    Returns:
        The result of ``librosa.load`` on the WAV data produced by ffmpeg.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    # Collect chunks in a list and join once instead of growing a bytes
    # object with += on every chunk.
    audio_chunks = []
    async for chunk in tts_com.stream():
        if chunk['type'] == 'audio':
            audio_chunks.append(chunk['data'])
    tts_raw = b''.join(audio_chunks)

    # Convert mp3 stream to wav
    ffmpeg_proc = await asyncio.create_subprocess_exec(
        'ffmpeg',
        '-f', 'mp3',
        '-i', '-',
        '-f', 'wav',
        '-loglevel', 'error',
        '-',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    tts_wav, ffmpeg_err = await ffmpeg_proc.communicate(tts_raw)

    # Surface a failed conversion explicitly rather than handing empty or
    # truncated data to librosa.
    if ffmpeg_proc.returncode != 0:
        detail = ffmpeg_err.decode('utf-8', errors='replace').strip()
        raise RuntimeError(
            f"ffmpeg exited with code {ffmpeg_proc.returncode}: {detail}"
        )

    return librosa.load(BytesIO(tts_wav))


async def call_edge_tts(speaker_name: str, text: str):
    """Synthesize *text* with the edge-tts voice *speaker_name*.

    Args:
        speaker_name: Voice identifier passed to ``edge_tts.Communicate``.
        text: Text to speak.

    Returns:
        The result of ``librosa.load`` on the synthesized audio after it has
        been converted from mp3 to wav by ffmpeg.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    tts_com = edge_tts.Communicate(text, speaker_name)
    return await _synthesize_to_wav(tts_com)


async def call_edge_tts_config(speaker_name: str, text: str, rate: str, volume: str):
    """Synthesize *text* with an explicit speaking rate and volume.

    Args:
        speaker_name: Voice identifier passed to ``edge_tts.Communicate``.
        text: Text to speak.
        rate: Rate setting forwarded unchanged to edge-tts.
        volume: Volume setting forwarded unchanged to edge-tts.

    Returns:
        The result of ``librosa.load`` on the synthesized audio after it has
        been converted from mp3 to wav by ffmpeg.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    tts_com = edge_tts.Communicate(text=text, voice=speaker_name, rate=rate, volume=volume)
    return await _synthesize_to_wav(tts_com)