salmaniq committed · Commit a72b927 · 1 parent: 014436f

Upload 152 files

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. .gitattributes +5 -0
  2. Dockerfile +13 -0
  3. LICENSE +21 -0
  4. README.md +97 -3
  5. TODO.md +25 -0
  6. api.py +33 -0
  7. api2.py +19 -0
  8. app/__init__.py +0 -0
  9. app/__pycache__/__init__.cpython-39.pyc +0 -0
  10. app/__pycache__/config.cpython-39.pyc +0 -0
  11. app/__pycache__/main.cpython-39.pyc +0 -0
  12. app/config.py +126 -0
  13. app/main.py +104 -0
  14. app/routers/__init__.py +0 -0
  15. app/routers/__pycache__/__init__.cpython-39.pyc +0 -0
  16. app/routers/__pycache__/full.cpython-39.pyc +0 -0
  17. app/routers/__pycache__/list_speakers.cpython-39.pyc +0 -0
  18. app/routers/__pycache__/tts.cpython-39.pyc +0 -0
  19. app/routers/__pycache__/voice_api.cpython-39.pyc +0 -0
  20. app/routers/clone.py +7 -0
  21. app/routers/full.py +38 -0
  22. app/routers/list_speakers.py +83 -0
  23. app/routers/tts.py +126 -0
  24. app/routers/voice_api.py +37 -0
  25. app/rvc/README.md +2 -0
  26. app/rvc/__pycache__/config.cpython-39.pyc +0 -0
  27. app/rvc/__pycache__/misc.cpython-39.pyc +0 -0
  28. app/rvc/__pycache__/rvc_infer.cpython-39.pyc +0 -0
  29. app/rvc/__pycache__/vc_infer_pipeline.cpython-39.pyc +0 -0
  30. app/rvc/config.py +142 -0
  31. app/rvc/configs/32k.json +46 -0
  32. app/rvc/configs/40k.json +46 -0
  33. app/rvc/configs/48k.json +46 -0
  34. app/rvc/hubert_base.pt +3 -0
  35. app/rvc/infer_pack/__pycache__/attentions.cpython-39.pyc +0 -0
  36. app/rvc/infer_pack/__pycache__/audio.cpython-39.pyc +0 -0
  37. app/rvc/infer_pack/__pycache__/commons.cpython-39.pyc +0 -0
  38. app/rvc/infer_pack/__pycache__/models.cpython-39.pyc +0 -0
  39. app/rvc/infer_pack/__pycache__/modules.cpython-39.pyc +0 -0
  40. app/rvc/infer_pack/__pycache__/transforms.cpython-39.pyc +0 -0
  41. app/rvc/infer_pack/attentions.py +424 -0
  42. app/rvc/infer_pack/audio.py +21 -0
  43. app/rvc/infer_pack/commons.py +173 -0
  44. app/rvc/infer_pack/models.py +1149 -0
  45. app/rvc/infer_pack/modules.py +529 -0
  46. app/rvc/infer_pack/modules/F0Predictor/F0Predictor.py +23 -0
  47. app/rvc/infer_pack/modules/F0Predictor/__init__.py +0 -0
  48. app/rvc/infer_pack/transforms.py +216 -0
  49. app/rvc/lib/__init__.py +0 -0
  50. app/rvc/lib/__pycache__/__init__.cpython-39.pyc +0 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ female.wav filter=lfs diff=lfs merge=lfs -text
+ kanye.wav filter=lfs diff=lfs merge=lfs -text
+ models/speaker1/added_IVF1136_Flat_nprobe_1_shay_v2.index filter=lfs diff=lfs merge=lfs -text
+ models/speaker1/shay.wav filter=lfs diff=lfs merge=lfs -text
+ morgan.mp3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.10
+ WORKDIR /opt
+ COPY . .
+ RUN apt-get update \
+     && apt-get -y install ffmpeg git \
+     && pip3 install --upgrade pip setuptools wheel \
+     && pip3 install --no-cache-dir -r requirements.txt \
+     && pip3 install TTS
+ RUN curl -o /root/nltk_data/tokenizers/punkt.zip \
+     --create-dirs -L https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip \
+     && unzip /root/nltk_data/tokenizers/punkt.zip -d /root/nltk_data/tokenizers/
+ EXPOSE 8000
+ CMD [ "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
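The Dockerfile above installs ffmpeg, the Python requirements and the NLTK `punkt` tokenizer, exposes port 8000 and starts uvicorn. A typical build-and-run pair of commands (the image tag and port mapping are arbitrary choices, not part of this commit):

```bash
# Build the image from the repository root
docker build -t tts-rvc-api .

# Run it and publish the uvicorn port declared by EXPOSE/CMD
docker run --rm -p 8000:8000 tts-rvc-api
```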
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Shadan
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,97 @@
- ---
- license: mit
- ---
+ # TTS-RVC-API
+
+ Yes, we can use Coqui with RVC!
+
+ ## Why combine the two frameworks?
+
+ Coqui is a text-to-speech framework (vocoder and encoder), but cloning your own voice with it takes a very long time and offers no guarantee of better results. That's why we add RVC (Retrieval-based Voice Conversion), which works purely speech-to-speech. An RVC model can be trained on just 2-3 minutes of data because it builds on HuBERT, a pre-trained model that fine-tunes quickly and gives better results.
+
+ ## Installation
+
+ How to use the Coqui + RVC API:
+
+ ```bash
+ git clone https://github.com/skshadan/TTS-RVC-API.git
+ ```
+ ```bash
+ python -m venv .venv
+ . .venv/bin/activate
+ pip install -r requirements.txt
+ pip install TTS
+ python -m uvicorn app.main:app
+ ```
+ Now update `config.toml` with relative paths: configure the `model_dir` path, or set a `speaker_name` in the request body.
+
+ The RVC v2 model is mounted in the container at:
+ ```
+ /
+ └── models
+     └── speaker1
+         ├── speaker1.pth
+         └── speaker1.index
+ ```
+
+ Now run this:
+ ```bash
+ python -m uvicorn app.main:app
+ ```
+
+ ## POST REQUEST
+
+ ```
+ http://localhost:8000/generate
+ ```
+ ```
+ emotions: happy, sad, angry, dull
+ speed: 1.0 - 2.0
+ ```
+ ```json
+ {
+     "speaker_name": "speaker3",
+     "input_text": "Hey there! Welcome to the world",
+     "emotion": "Surprise",
+     "speed": 1.0
+ }
+ ```
+
+ # CODE SNIPPET
+
+ ```python
+ import requests
+ import json
+ import time
+
+ url = "http://127.0.0.1:8000/generate"
+
+ payload = json.dumps({
+     "speaker_name": "speaker3",
+     "input_text": "Are you mad? The way you've betrayed me is beyond comprehension, a slap in the face that's left me boiling with an anger so intense it's as if you've thrown gasoline on a fire, utterly destroying any trust that was left.",
+     "emotion": "Dull",
+     "speed": 1.0
+ })
+ headers = {
+     'Content-Type': 'application/json'
+ }
+
+ start_time = time.time()  # Start the timer
+
+ response = requests.request("POST", url, headers=headers, data=payload)
+
+ end_time = time.time()  # Stop the timer
+
+ if response.status_code == 200:
+     audio_content = response.content
+
+     # Save the audio to a file
+     with open("generated_audio.wav", "wb") as audio_file:
+         audio_file.write(audio_content)
+
+     print("Audio saved successfully.")
+     print("Time taken:", end_time - start_time, "seconds")
+ else:
+     print("Error:", response.text)
+ ```
+
+ ## Feedback
+
+ If you have any feedback or issues, please reach out to shadankhantech@gmail.com
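Note that the `/generate` router added later in this commit (`app/routers/full.py`) returns a JSON body of the form `{"base64_wav": "..."}` rather than raw WAV bytes, so the snippet above would write JSON text into `generated_audio.wav`. A minimal client sketch that matches that router (host, speaker name and output file name are illustrative):

```python
import base64
import requests

url = "http://127.0.0.1:8000/generate"
payload = {
    "speaker_name": "speaker1",
    "input_text": "Hey there! Welcome to the world",
    "emotion": "Happy",
    "speed": 1.0,
}

resp = requests.post(url, json=payload)
resp.raise_for_status()

# full.py encodes the converted wav as base64 in the "base64_wav" field
wav_bytes = base64.b64decode(resp.json()["base64_wav"])
with open("generated_audio.wav", "wb") as f:
    f.write(wav_bytes)
```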
TODO.md ADDED
@@ -0,0 +1,25 @@
+ # TODO
+ - FastAPI
+   - Generate:
+     - other params? (temperature, rvc f0_method, min_eos_p)
+
+ - Bark TTS
+   - Set speech sentiment in the request body, then prefix each sentence, e.g. `[Happy]<input_text>`
+   - Should I aim to combine sentences which will fit in the largest clip length? (14s?)
+     - More consistent tone etc. in bark output?
+   - Should I map the rvc model to a chosen bark voice (incl. default)?
+     - Or set via request body?
+     - Currently hardcoded to bark voice `v2/en_speaker_9` (works well with all tested RVC models, regardless of gender etc.)
+
+ - RVC
+   - Should I re-use the Config details? (GPU info etc.)
+   - Set `CUDA_VISIBLE_DEVICES` for bark
+   - Should I load the hubert model for each request? Precious VRAM
+
+ - Smaller container image
+   - Currently used to confirm app dependencies only, don't care that it's 6 GiB
+
+ # Issues
+ - Splits on sentences, so a single sentence which takes longer than ~14 seconds will be a mess
+ - Significantly slower bark generation when run via the API vs directly in a python script
+   - Generation time roughly equals audio length (tested on a 3090)
api.py ADDED
@@ -0,0 +1,33 @@
+ import requests
+ import json
+ import time
+
+ url = "http://127.0.0.1:8000/generate"
+
+ payload = json.dumps({
+     "speaker_name": "speaker1",
+     "input_text": "Step into the future of accessibility with our revolutionary text-to-speech platform. Breaking down barriers for the visually impaired, our technology enables effortless access to written content, fostering inclusivity in the digital sphere. With customizable speech parameters and adaptable voices, users can tailor their listening experience to suit their preferences. Embrace the harmony of technology and humanity as we continue to pioneer advancements in the realm of audio communication. Empowering individuals with the gift of seamless interaction, our text-to-speech solution is paving the way for a more connected and accessible world",
+     "emotion": "Neutral",
+     "speed": 0
+ })
+ headers = {
+     'Content-Type': 'application/json'
+ }
+
+ start_time = time.time()  # Start the timer
+
+ response = requests.request("POST", url, headers=headers, data=payload)
+ print("myres", response)
+ end_time = time.time()  # Stop the timer
+
+ if response.status_code == 200:
+     audio_content = response.content
+     print(audio_content)
+     # Save the audio to a file
+     with open("generated_audio.wav", "wb") as audio_file:
+         audio_file.write(audio_content)
+
+     print("Audio saved successfully.")
+     print("Time taken:", end_time - start_time, "seconds")
+ else:
+     print("Error:", response.text)
api2.py ADDED
@@ -0,0 +1,19 @@
+ import requests
+ import json
+ import time
+ import base64
+ import httpx
+ url = "http://127.0.0.1:8000/clone"
+
+
+ # Provide the audio file and speaker name
+ audio_file = ("rizwan.wav", open("/Users/saboor/Documents/TTS-RVC-API-1/rizwan.wav", "rb"))
+ data = {"audio_file": audio_file, "speaker_name": "speaker1"}
+
+ # Send the POST request
+ response = httpx.post(url, data=data)
+
+ # Check the response
+ print(response.status_code)
+ print(response.headers)
+ print(response.text)
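The `/clone` router added later in this commit (`app/routers/voice_api.py`) declares `audio_file` as a FastAPI `UploadFile`, which is read from a multipart form, while the script above sends it through `data=`. A corrected client sketch (file path, speaker name and content type are placeholders; it assumes `speaker_name` is taken as a query parameter since it is a plain `str` argument on the server):

```python
import httpx

url = "http://127.0.0.1:8000/clone"

# The file must go in `files=` so httpx builds a multipart/form-data request
with open("rizwan.wav", "rb") as f:
    response = httpx.post(
        url,
        params={"speaker_name": "speaker1"},
        files={"audio_file": ("rizwan.wav", f, "audio/wav")},
    )

print(response.status_code)
print(response.json().keys())  # expected to contain "base64_wav"
```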
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (144 Bytes).
 
app/__pycache__/config.cpython-39.pyc ADDED
Binary file (4.37 kB).
 
app/__pycache__/main.cpython-39.pyc ADDED
Binary file (1.87 kB).
 
app/config.py ADDED
@@ -0,0 +1,126 @@
1
+ import tomli
2
+ import os
3
+ import sys
4
+ import tempfile
5
+ from fastapi import HTTPException
6
+ from structlog import get_logger
7
+ import bark
8
+ from glob import glob
9
+
10
+ log = get_logger(__name__)
11
+
12
+ def parse_config():
13
+ """Parse and validate config on startup
14
+ Raise errors for invalid config
15
+
16
+ Set defaults for undefined config options
17
+ """
18
+
19
+ config = {
20
+ "rvc": {},
21
+ "tts": {},
22
+ }
23
+ config_file = None
24
+ try:
25
+ with open("config.toml", mode="rb") as fp:
26
+ config_file = tomli.load(fp)
27
+ except FileNotFoundError:
28
+ log.error("FAILURE TO START. Configuration file \"config.toml\" was not found")
29
+ sys.exit(1)
30
+
31
+ try:
32
+ if config_file["rvc"]["model_dir"]:
33
+ rvc_model_dir = config_file["rvc"]["model_dir"]
34
+ if not os.path.isdir(rvc_model_dir):
35
+ log.error(f"FAILURE TO START. Config item \"rvc.model_dir\" was defined but path \"{rvc_model_dir}\" was not found")
36
+ log.info(f"Remove config item \"rvc.model_dir\" from \"config.toml\" if you don't want to use RVC models")
37
+ sys.exit(1)
38
+ else:
39
+ config["rvc"]["model_dir"] = config_file["rvc"]["model_dir"]
40
+ except KeyError:
41
+ config["rvc"]["model_dir"] = False
42
+ log.warn(f"Config item \"rvc.model_dir\" is missing from config file. RVC features are disabled")
43
+
44
+ try:
45
+ if config_file["rvc"]["bark_voice_map"]:
46
+ if not config["rvc"]["model_dir"]:
47
+ log.error(f"FAILURE TO START. Config item \"rvc.bark_voice_map\" was defined but \"rvc.model_dir\" was not")
48
+ log.info(f"Config item \"rvc.model_dir\" is required to use RVC models. Either set this value or remove \"rvc.bark_voice_map\"")
49
+ sys.exit(1)
50
+ config["rvc"]["bark_voice_map"] = config_file["rvc"]["bark_voice_map"]
51
+ except KeyError:
52
+ # Suno Favourite from voice Bark Speaker Library (v2)
53
+ config["rvc"]["bark_voice_map"] = {"default": "v2/en_speaker_6"}
54
+ log.warn("Config item \"rvc.bark_voice_map\" is undefined. Setting \"v2/en_speaker_6\" for all RVC models")
55
+
56
+ try:
57
+ temp = config_file["tts"]["output_dir"]
58
+ if temp:
59
+ if not os.path.isdir(temp):
60
+ log.error(f"FAILURE TO START. Config item \"tts.output_dir\" was defined but path \"{temp}\" was not found")
61
+ log.info(f"Either remove config item \"tts.output_dir\" from \"config.toml\" to use system default temp dir, or set the value as an existing directory path")
62
+ sys.exit(1)
63
+ else:
64
+ config["tts"]["output_dir"] = config_file["tts"]["output_dir"]
65
+ except KeyError:
66
+ temp_dir = tempfile.gettempdir()
67
+ config["tts"]["output_dir"] = temp_dir
68
+ log.warn(f"Config item \"tts.output_dir\" is undefined. Using system default: {temp_dir}")
69
+
70
+ log.info(f"STARTUP CONFIG: {config}")
71
+ return config
72
+
73
+ # Return list of relative paths for input of list of paths and start path
74
+ def relative_bark_paths(paths, start_path):
75
+ p = []
76
+ for i in paths:
77
+ p += [os.path.relpath(os.path.splitext(i)[0], start_path)]
78
+ return p
79
+
80
+ def load_speakers(config):
81
+ """Load all available speakers on system. Including Bark voices and RVC models
82
+ """
83
+ # Get bark voices from bark package files
84
+ bark_voice_dir = os.path.join(bark.__path__[0], "assets/prompts")
85
+ if not os.path.isdir(bark_voice_dir):
86
+ log.error(f"FAILURE TO START. Bark voice directory was not found at {bark_voice_dir}")
87
+ sys.exit(1)
88
+
89
+ voices_full_path = glob(os.path.join(bark_voice_dir, "**", f"*.npz"), recursive=True)
90
+ bark_voices = relative_bark_paths(voices_full_path, bark_voice_dir)
91
+ if not bark_voices:
92
+ log.error(f"FAILURE TO START. No Bark speaker npz files were found in a recursive search of existing directory {bark_voice_dir}. Bark speakers are required")
93
+ sys.exit(1)
94
+
95
+ # Get RVC models
96
+ rvc_model_dir = config["rvc"]["model_dir"]
97
+ rvc_speakers = {}
98
+ if rvc_model_dir:
99
+ rvc_full_path = glob(os.path.join(rvc_model_dir, f"**", "*.pth"), recursive=True)
100
+ for s in rvc_full_path:
101
+ id = os.path.relpath(s, rvc_model_dir)
102
+ name = os.path.split(id)[0]
103
+ dir = os.path.dirname(s)
104
+ index = [os.path.join(dir, f) for f in os.listdir(dir) if f.endswith(".index")]
105
+ if len(index) != 1:
106
+ log.error(f"FAILURE TO START. RVC model {name} should have 1 index file. It has {len(index)}")
107
+ sys.exit(1)
108
+ index_relative = os.path.relpath(
109
+ index[0],
110
+ rvc_model_dir
111
+ )
112
+ try:
113
+ bv = config["rvc"]["bark_voice_map"][name]
114
+ except KeyError:
115
+ bv = config["rvc"]["bark_voice_map"]["default"]
116
+ rvc_speakers[name] = {"id": id, "bark_voice": bv, "index": index_relative}
117
+
118
+ if not rvc_speakers:
119
+ log.error(f"FAILURE TO START. No RVC model files were found in a recursive search of the defined and existing {rvc_model_dir}")
120
+ log.info(f"You must supply any RVC models you wish to use. Either remove or fix the config item \"rvc.model_dir\"")
121
+ sys.exit(1)
122
+
123
+ return bark_voices, rvc_speakers
124
+
125
+ config = parse_config()
126
+ bark_voices, rvc_speakers = load_speakers(config)
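`parse_config()` above reads three optional keys from `config.toml`: `rvc.model_dir`, `rvc.bark_voice_map` and `tts.output_dir`. The file itself is not part of this commit, so the following is only a sketch consistent with those keys; the paths and speaker names are placeholders:

```toml
[rvc]
# Directory containing one sub-directory per speaker with a .pth and a .index file
model_dir = "models"

[rvc.bark_voice_map]
# Bark voice used when an RVC model has no explicit mapping
default = "v2/en_speaker_6"
# Per-model override; "speaker1" is an example directory name under model_dir
speaker1 = "v2/en_speaker_9"

[tts]
# Where intermediate TTS wav files are written; omit to fall back to the system temp dir
output_dir = "/tmp"
```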
app/main.py ADDED
@@ -0,0 +1,104 @@
1
+ from fastapi import FastAPI, Request
2
+ import sys
3
+ import time
4
+ import uvicorn
5
+ import structlog
6
+ import logging
7
+ from .routers import full, list_speakers,voice_api
8
+ import nltk
9
+ from fastapi.middleware.cors import CORSMiddleware
10
+ nltk.download('punkt')
11
+
12
+ app=FastAPI()
13
+
14
+ origins = [
15
+ "http://localhost.tiangolo.com",
16
+ "https://localhost.tiangolo.com",
17
+ "http://localhost",
18
+ "http://localhost:3000",
19
+ "https://2c6b-182-186-105-95.ngrok-free.app",
20
+ ]
21
+ app.add_middleware(
22
+ CORSMiddleware,
23
+ allow_origins=origins,
24
+ allow_credentials=True,
25
+ allow_methods=["*"],
26
+ allow_headers=["*"],
27
+ )
28
+
29
+ # Setup global logger config
30
+ structlog.configure(
31
+ processors=[
32
+ # If log level is too low, abort pipeline and throw away log entry.
33
+ structlog.stdlib.filter_by_level,
34
+ # Add the name of the logger to event dict.
35
+ # structlog.stdlib.add_logger_name,
36
+ # Add log level to event dict.
37
+ structlog.stdlib.add_log_level,
38
+ # Perform %-style formatting.
39
+ structlog.stdlib.PositionalArgumentsFormatter(),
40
+ # Add a timestamp in ISO 8601 format.
41
+ structlog.processors.TimeStamper(fmt="iso"),
42
+ # If the "stack_info" key in the event dict is true, remove it and
43
+ # render the current stack trace in the "stack" key.
44
+ structlog.processors.StackInfoRenderer(),
45
+ # If the "exc_info" key in the event dict is either true or a
46
+ # sys.exc_info() tuple, remove "exc_info" and render the exception
47
+ # with traceback into the "exception" key.
48
+ structlog.processors.format_exc_info,
49
+ # If some value is in bytes, decode it to a unicode str.
50
+ structlog.processors.UnicodeDecoder(),
51
+ # Add callsite parameters.
52
+ # structlog.processors.CallsiteParameterAdder(
53
+ # {
54
+ # structlog.processors.CallsiteParameter.FILENAME,
55
+ # structlog.processors.CallsiteParameter.FUNC_NAME,
56
+ # structlog.processors.CallsiteParameter.LINENO,
57
+ # }
58
+ # ),
59
+ # Render the final event dict as JSON.
60
+ structlog.processors.JSONRenderer()
61
+ ],
62
+ # `wrapper_class` is the bound logger that you get back from
63
+ # get_logger(). This one imitates the API of `logging.Logger`.
64
+ wrapper_class=structlog.stdlib.BoundLogger,
65
+ # `logger_factory` is used to create wrapped loggers that are used for
66
+ # OUTPUT. This one returns a `logging.Logger`. The final value (a JSON
67
+ # string) from the final processor (`JSONRenderer`) will be passed to
68
+ # the method of the same name as that you've called on the bound logger.
69
+ logger_factory=structlog.stdlib.LoggerFactory(),
70
+ # Effectively freeze configuration after creating the first bound
71
+ # logger.
72
+ cache_logger_on_first_use=True,
73
+ )
74
+
75
+ logging.basicConfig(
76
+ format="%(message)s",
77
+ stream=sys.stdout,
78
+ level=logging.DEBUG,
79
+ )
80
+ log = structlog.get_logger(__name__)
81
+
82
+ # app = FastAPI()
83
+
84
+ @app.middleware("http")
85
+ async def log_requests(request: Request, call_next):
86
+ start_time = time.time()
87
+
88
+ response = await call_next(request)
89
+
90
+ process_time = (time.time() - start_time) * 1000
91
+ formatted_process_time = '{0:.2f}'.format(process_time)
92
+ # These fields should be outside the event
93
+ log.info(f"path={request.url.path} completed_in={formatted_process_time}ms status_code={response.status_code}")
94
+
95
+ return response
96
+
97
+ app.include_router(full.router)
98
+ app.include_router(list_speakers.router)
99
+ app.include_router(voice_api.router)
100
+
101
+
102
+
103
+ if __name__ == "__main__":
104
+ uvicorn.run(app, host="0.0.0.0", port=8000)
app/routers/__init__.py ADDED
File without changes
app/routers/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (152 Bytes).
 
app/routers/__pycache__/full.cpython-39.pyc ADDED
Binary file (1.46 kB).
 
app/routers/__pycache__/list_speakers.cpython-39.pyc ADDED
Binary file (2.52 kB).
 
app/routers/__pycache__/tts.cpython-39.pyc ADDED
Binary file (3.05 kB).
 
app/routers/__pycache__/voice_api.cpython-39.pyc ADDED
Binary file (1.5 kB).
 
app/routers/clone.py ADDED
@@ -0,0 +1,7 @@
+ from app.rvc.rvc_infer import rvc_convert
+
+ wav = rvc_convert(
+     model_path="/Users/saboor/Documents/TTS-RVC-API-1/Tate_e1500_s211500.pth",
+     f0_up_key=0,
+     input_path="/Users/saboor/Documents/TTS-RVC-API-1/morgan.mp3"
+ )
app/routers/full.py ADDED
@@ -0,0 +1,38 @@
+ from fastapi import APIRouter, FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse, FileResponse
+ from app.routers.tts import server
+ from pydantic import BaseModel
+ from app.config import config
+ from typing import Optional
+ import base64
+
+ router = APIRouter(
+     prefix="/generate",
+     tags=["generate"],
+     responses={404: {"description": "Not found"}},
+ )
+
+ class Generation(BaseModel):
+     speaker_name: Optional[str] = None  # Change this line
+     input_text: str
+     emotion: Optional[str] = None  # Added this line
+     speed: Optional[float] = 1.0
+
+ @router.post("/")
+ async def generate(gen: Generation):
+
+     rvc_speaker_id, wav_opt = server(
+         text=gen.input_text,
+         tts_output_dir=config["tts"]["output_dir"],
+         speaker_name=gen.speaker_name,
+         emotion=gen.emotion,
+         speed=gen.speed
+     )
+     with open(wav_opt, "rb") as wav_file:
+         wav_bytes = wav_file.read()
+     base64_bytes = base64.b64encode(wav_bytes)
+     base64_string = base64_bytes.decode("utf-8")
+
+     return {"base64_wav": base64_string}
app/routers/list_speakers.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import bark
3
+ import re
4
+ from glob import glob
5
+ from fastapi import APIRouter, HTTPException
6
+ from structlog import get_logger
7
+ from ..config import config, bark_voices, rvc_speakers
8
+
9
+ log = get_logger(__name__)
10
+
11
+ router = APIRouter(
12
+ prefix="/speakers",
13
+ tags=["speakers"],
14
+ responses={404: {"description": "Not found"}},
15
+ )
16
+
17
+ @router.get("/")
18
+ async def list_speakers(b: str = None, r: str = None):
19
+ """ List all speakers
20
+
21
+ Search with query parameters which accepts regex. `b` bark voices, `r` rvc models E.g.
22
+
23
+ `/speakers?b=en_speaker`: All english bark voices (name containing "en_speaker")
24
+
25
+ `/speakers?r=custom_model`: All RVC models in `rvc_model_dir` with basename matching string "custom_model"
26
+
27
+ `/speakers?b=en_speaker&r=custom_model`: Combination of both
28
+ """
29
+
30
+ speakers = []
31
+
32
+ # If no query parameters, return all speakers
33
+ if not (b or r):
34
+ b, r = ".*", ".*"
35
+
36
+ # Filter from available bark voices in bark package path
37
+ if b:
38
+ filtered_bark = [ x for x in bark_voices if re.match(f".*{b}.*", x) ]
39
+ if not filtered_bark:
40
+ raise HTTPException(
41
+ status_code=400,
42
+ detail=f"No Bark speaker npz files which matched regex \".*{b}.*\" were found."
43
+ )
44
+ speakers += filtered_bark
45
+
46
+ # Get RVC models
47
+ rvc_model_dir = config["rvc"]["model_dir"]
48
+ if r and rvc_model_dir:
49
+ filtered_rvc = [ x for x in rvc_speakers if re.match(f".*{r}.*", x) ]
50
+ if not filtered_rvc:
51
+ raise HTTPException(
52
+ status_code=400,
53
+ detail=f"No RVC model files which matched regex \".*{r}.*\" were found in {rvc_model_dir}"
54
+ )
55
+ speakers += filtered_rvc
56
+
57
+ return speakers
58
+
59
+ @router.get("/{speaker_id}")
60
+ def get_speaker(speaker_id):
61
+ """Get details on speaker using speaker name
62
+
63
+ Where the speaker name is the name of the directory that contains the RVC model pth and index files.
64
+ The speaker name is returned by the `/speakers/` endpoint
65
+ """
66
+
67
+ speaker = None
68
+ try:
69
+ speaker = rvc_speakers[speaker_id]
70
+ except KeyError:
71
+ raise HTTPException(
72
+ status_code=400,
73
+ detail=f"No RVC model file with speaker_id {speaker_id}"
74
+ )
75
+
76
+ return speaker
77
+
78
+ # Return list of relative paths for input of list of paths and start path
79
+ def relative_paths(paths, start_path):
80
+ p = []
81
+ for i in paths:
82
+ p += [os.path.relpath(i, start_path)]
83
+ return p
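For completeness, a small client sketch against the `/speakers` routes above; the host, port and the `speaker1` id are assumptions, not values defined by this commit:

```python
import requests

base = "http://127.0.0.1:8000"

# All speakers: bark voices plus any RVC models found under rvc.model_dir
print(requests.get(f"{base}/speakers/").json())

# Regex-filtered: English bark voices and RVC models whose name contains "speaker1"
print(requests.get(f"{base}/speakers/", params={"b": "en_speaker", "r": "speaker1"}).json())

# Details for a single RVC speaker: its model id, index file and mapped bark voice
print(requests.get(f"{base}/speakers/speaker1").json())
```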
app/routers/tts.py ADDED
@@ -0,0 +1,126 @@
1
+ from TTS.api import TTS
2
+ import huggingface_hub
3
+ from nltk.tokenize import sent_tokenize
4
+ import numpy as np
5
+ import os
6
+ from scipy.io.wavfile import write
7
+ import time
8
+ import io
9
+ from typing import Optional
10
+ from fastapi import HTTPException
11
+ from ..rvc.rvc_infer import rvc_convert
12
+ from ..config import config, bark_voices, rvc_speakers
13
+ from ..rvc.misc import (
14
+ load_hubert,
15
+ get_vc,
16
+ vc_single
17
+ )
18
+ from structlog import get_logger
19
+
20
+
21
+
22
+ # Initialize TTS
23
+ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=True, gpu=False)
24
+
25
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
26
+ log = get_logger(__name__)
27
+
28
+ def server(
29
+ text: str,
30
+ tts_output_dir: str,
31
+ speaker_name: str,
32
+ emotion: Optional[str] = None,
33
+ speed: Optional[float] = 1.0
34
+ ):
35
+ rvc_model_dir = config["rvc"]["model_dir"]
36
+
37
+ # TTS output file path
38
+ tts_file = os.path.join(tts_output_dir, "bark_out.wav")
39
+
40
+ # Is the speaker an RVC model?
41
+ rvc_speaker_id = None
42
+ if speaker_name in rvc_speakers:
43
+ rvc_speaker_id = rvc_speakers[speaker_name]["id"]
44
+ else:
45
+ raise HTTPException(status_code=400, detail=f"speaker_name \"{speaker_name}\" was not found")
46
+
47
+ # Prepare the text
48
+ script = text.replace("\n", " ").strip()
49
+ sentences = sent_tokenize(script)
50
+ full_script = " ".join(sentences)
51
+ base_path = "/Users/saboor/Documents/TTS-RVC-API-1/models/"
52
+ speaker_dir = os.path.join(base_path, speaker_name)
53
+ # Check if the speaker directory exists
54
+ if os.path.exists(speaker_dir):
55
+ # Get a list of all files in the directory
56
+ files_in_dir = os.listdir(speaker_dir)
57
+ wav_files = [file for file in files_in_dir if file.endswith(".wav")]
58
+ if len(wav_files) == 1:
59
+ # Get the complete file path of the .wav file
60
+ wav_file_path = os.path.join(speaker_dir, wav_files[0])
61
+
62
+ # Pass the complete file path to the function
63
+ tts.tts_to_file(text=full_script, speaker_wav=wav_file_path, file_path=tts_file,language="en" ,emotion=emotion, speed=speed)
64
+ else:
65
+ print(f"Expected one .wav file in directory '{speaker_name}' but found {len(wav_files)} .wav files.")
66
+ else:
67
+ print(f"Speaker directory '{speaker_name}' not found at path: {speaker_dir}")
68
+
69
+ t0 = time.time()
70
+ generation_duration_s = time.time() - t0
71
+ log.info(f"took {generation_duration_s:.0f}s to generate audio")
72
+
73
+ if rvc_speaker_id and rvc_model_dir:
74
+ hubert_model = None
75
+ hubert_path = huggingface_hub.hf_hub_download(
76
+ repo_id="lj1995/VoiceConversionWebUI",
77
+ filename="hubert_base.pt",
78
+ revision="1c75048c96f23f99da4b12909b532b5983290d7d",
79
+ local_dir="models/hubert/",
80
+ local_dir_use_symlinks=True,
81
+ )
82
+ hubert_model = load_hubert(hubert_path)
83
+
84
+ get_vc(rvc_speaker_id, rvc_model_dir, 0.33, 0.5)
85
+
86
+ rvc_index = os.path.join(rvc_model_dir, rvc_speakers[speaker_name]["index"])
87
+ # wav_opt = vc_single(
88
+ # 0,
89
+ # tts_file,
90
+ # 0,
91
+ # None,
92
+ # "pm",
93
+ # rvc_index,
94
+ # '',
95
+ # 0.88,
96
+ # 3,
97
+ # 0,
98
+ # 1,
99
+ # 0.33,
100
+ # )
101
+ if os.path.exists(speaker_dir):
102
+ # Get a list of all files in the directory
103
+ files_in_dir = os.listdir(speaker_dir)
104
+ pth_files = [file for file in files_in_dir if file.endswith(".pth")]
105
+ if len(pth_files) == 1:
106
+ pth_file_path = os.path.join(speaker_dir, pth_files[0])
107
+
108
+
109
+ wav_opt=rvc_convert(
110
+ model_path=pth_file_path,
111
+ # f0_up_key=0,
112
+ input_path=tts_file
113
+ )
114
+ if wav_opt is None:
115
+ # Handle the case where vc_single returned None
116
+ # You can raise an exception or handle it as appropriate for your application
117
+ raise HTTPException(status_code=500, detail="Voice conversion failed")
118
+ # wav = io.BytesIO()
119
+
120
+ output_wav_file = "/Users/saboor/Documents/TTS-RVC-API-1/output/out.wav"
121
+ # sample_rate = 44100
122
+ # # write(wav,wav_opt[0],wav_opt)
123
+ # write(wav_opt,sample_rate)
124
+ return rvc_speaker_id, output_wav_file
125
+ else:
126
+ return rvc_speaker_id, tts_file
app/routers/voice_api.py ADDED
@@ -0,0 +1,37 @@
+ from fastapi import APIRouter, HTTPException, UploadFile
+ from fastapi.responses import StreamingResponse, FileResponse
+ from app.rvc.rvc_infer import rvc_convert
+ import os
+ import base64
+ router = APIRouter(
+     prefix="/clone",
+     tags=["cloning"],
+     responses={404: {"description": "Not found"}},
+ )
+
+ @router.post("/")
+ async def generate_voice(audio_file: UploadFile, speaker_name: str):
+     base_path = "/Users/saboor/Documents/TTS-RVC-API-1/models/"
+     speaker_dir = os.path.join(base_path, speaker_name)
+
+     if os.path.exists(speaker_dir):
+         # Get a list of all files in the directory
+         files_in_dir = os.listdir(speaker_dir)
+         pth_files = [file for file in files_in_dir if file.endswith(".pth")]
+         if len(pth_files) == 1:
+             pth_file_path = os.path.join(speaker_dir, pth_files[0])
+             # Save the uploaded file to a temporary location
+             with open(f"temp_{audio_file.filename}", "wb") as f:
+                 f.write(audio_file.file.read())
+
+             wav_opt = rvc_convert(
+                 model_path=pth_file_path,
+                 f0_up_key=-4,
+                 input_path=f"temp_{audio_file.filename}",
+             )
+             with open(wav_opt, "rb") as wav_file:
+                 wav_bytes = wav_file.read()
+             base64_bytes = base64.b64encode(wav_bytes)
+             base64_string = base64_bytes.decode("utf-8")
+
+             return {"base64_wav": base64_string}
app/rvc/README.md ADDED
@@ -0,0 +1,2 @@
+ Resources from https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
+ - Used because no library currently exists with equivalent functionality
app/rvc/__pycache__/config.cpython-39.pyc ADDED
Binary file (2.81 kB).
 
app/rvc/__pycache__/misc.cpython-39.pyc ADDED
Binary file (4.19 kB).
 
app/rvc/__pycache__/rvc_infer.cpython-39.pyc ADDED
Binary file (8.7 kB).
 
app/rvc/__pycache__/vc_infer_pipeline.cpython-39.pyc ADDED
Binary file (8.79 kB).
 
app/rvc/config.py ADDED
@@ -0,0 +1,142 @@
1
+ """
2
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/86ed98aacaa8b2037aad795abd11cdca122cf39f/config.py
3
+
4
+ copyright: RVC-Project
5
+ license: MIT
6
+ """
7
+
8
+ # import argparse
9
+ # import sys
10
+ import torch
11
+ from multiprocessing import cpu_count
12
+
13
+
14
+ def use_fp32_config():
15
+ for config_file in ["32k.json", "40k.json", "48k.json"]:
16
+ with open(f"/Users/saboor/Documents/TTS-RVC-API-1/app/rvc/configs/{config_file}", "r") as f:
17
+ strr = f.read().replace("true", "false")
18
+ with open(f"/Users/saboor/Documents/TTS-RVC-API-1/app/rvc/configs/{config_file}", "w") as f:
19
+ f.write(strr)
20
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
21
+ strr = f.read().replace("3.7", "3.0")
22
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
23
+ f.write(strr)
24
+
25
+
26
+ class Config:
27
+ def __init__(self):
28
+ self.device = "cuda:0"
29
+ self.is_half = True
30
+ self.n_cpu = 0
31
+ self.gpu_name = None
32
+ self.gpu_mem = None
33
+ # (
34
+ # self.python_cmd,
35
+ # self.listen_port,
36
+ # self.iscolab,
37
+ # self.noparallel,
38
+ # self.noautoopen,
39
+ # ) = self.arg_parse()
40
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
41
+
42
+ @staticmethod
43
+ # def arg_parse() -> tuple:
44
+ # exe = sys.executable or "python"
45
+ # parser = argparse.ArgumentParser()
46
+ # parser.add_argument("--port", type=int, default=7865, help="Listen port")
47
+ # parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
48
+ # parser.add_argument("--colab", action="store_true", help="Launch in colab")
49
+ # parser.add_argument(
50
+ # "--noparallel", action="store_true", help="Disable parallel processing"
51
+ # )
52
+ # parser.add_argument(
53
+ # "--noautoopen",
54
+ # action="store_true",
55
+ # help="Do not open in browser automatically",
56
+ # )
57
+ # cmd_opts = parser.parse_args()
58
+
59
+ # cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
60
+
61
+ # return (
62
+ # cmd_opts.pycmd,
63
+ # cmd_opts.port,
64
+ # cmd_opts.colab,
65
+ # cmd_opts.noparallel,
66
+ # cmd_opts.noautoopen,
67
+ # )
68
+
69
+ # has_mps is only available in nightly pytorch (for now) and MasOS 12.3+.
70
+ # check `getattr` and try it for compatibility
71
+ @staticmethod
72
+ def has_mps() -> bool:
73
+ if not torch.backends.mps.is_available():
74
+ return False
75
+ try:
76
+ torch.zeros(1).to(torch.device("mps"))
77
+ return True
78
+ except Exception:
79
+ return False
80
+
81
+ def device_config(self) -> tuple:
82
+ if torch.cuda.is_available():
83
+ i_device = int(self.device.split(":")[-1])
84
+ self.gpu_name = torch.cuda.get_device_name(i_device)
85
+ if (
86
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
87
+ or "P40" in self.gpu_name.upper()
88
+ or "1060" in self.gpu_name
89
+ or "1070" in self.gpu_name
90
+ or "1080" in self.gpu_name
91
+ ):
92
+ print("Found GPU", self.gpu_name, ", force to fp32")
93
+ self.is_half = False
94
+ use_fp32_config()
95
+ else:
96
+ print("Found GPU", self.gpu_name)
97
+ self.gpu_mem = int(
98
+ torch.cuda.get_device_properties(i_device).total_memory
99
+ / 1024
100
+ / 1024
101
+ / 1024
102
+ + 0.4
103
+ )
104
+ if self.gpu_mem <= 4:
105
+ with open("trainset_preprocess_pipeline_print.py", "r") as f:
106
+ strr = f.read().replace("3.7", "3.0")
107
+ with open("trainset_preprocess_pipeline_print.py", "w") as f:
108
+ f.write(strr)
109
+ # elif Config.has_mps():
110
+ # print("No supported Nvidia GPU found, use MPS instead")
111
+ # self.device = "mps"
112
+ # self.is_half = False
113
+ # use_fp32_config()
114
+ else:
115
+ print("No supported Nvidia GPU found, use CPU instead")
116
+ self.device = "cpu"
117
+ self.is_half = False
118
+ use_fp32_config()
119
+
120
+ if self.n_cpu == 0:
121
+ self.n_cpu = cpu_count()
122
+
123
+ if self.is_half:
124
+ # 6G显存配置
125
+ x_pad = 3
126
+ x_query = 10
127
+ x_center = 60
128
+ x_max = 65
129
+ else:
130
+ # 5G显存配置
131
+ x_pad = 1
132
+ x_query = 6
133
+ x_center = 38
134
+ x_max = 41
135
+
136
+ if self.gpu_mem != None and self.gpu_mem <= 4:
137
+ x_pad = 1
138
+ x_query = 5
139
+ x_center = 30
140
+ x_max = 32
141
+
142
+ return x_pad, x_query, x_center, x_max
app/rvc/configs/32k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
app/rvc/configs/40k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
app/rvc/configs/48k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
app/rvc/hubert_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f54b40fd2802423a5643779c4861af1e9ee9c1564dc9d32f54f20b5ffba7db96
+ size 189507909
app/rvc/infer_pack/__pycache__/attentions.cpython-39.pyc ADDED
Binary file (10.1 kB).
 
app/rvc/infer_pack/__pycache__/audio.cpython-39.pyc ADDED
Binary file (739 Bytes).
 
app/rvc/infer_pack/__pycache__/commons.cpython-39.pyc ADDED
Binary file (6.06 kB).
 
app/rvc/infer_pack/__pycache__/models.cpython-39.pyc ADDED
Binary file (24.8 kB).
 
app/rvc/infer_pack/__pycache__/modules.cpython-39.pyc ADDED
Binary file (12.1 kB).
 
app/rvc/infer_pack/__pycache__/transforms.cpython-39.pyc ADDED
Binary file (4.11 kB).
 
app/rvc/infer_pack/attentions.py ADDED
@@ -0,0 +1,424 @@
1
+ """
2
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
3
+
4
+ copyright: RVC-Project
5
+ license: MIT
6
+ """
7
+
8
+ import copy
9
+ import math
10
+ import numpy as np
11
+ import torch
12
+ from torch import nn
13
+ from torch.nn import functional as F
14
+
15
+ from app.rvc.infer_pack import commons
16
+ from app.rvc.infer_pack import modules
17
+ from app.rvc.infer_pack.modules import LayerNorm
18
+
19
+
20
+ class Encoder(nn.Module):
21
+ def __init__(
22
+ self,
23
+ hidden_channels,
24
+ filter_channels,
25
+ n_heads,
26
+ n_layers,
27
+ kernel_size=1,
28
+ p_dropout=0.0,
29
+ window_size=10,
30
+ **kwargs
31
+ ):
32
+ super().__init__()
33
+ self.hidden_channels = hidden_channels
34
+ self.filter_channels = filter_channels
35
+ self.n_heads = n_heads
36
+ self.n_layers = n_layers
37
+ self.kernel_size = kernel_size
38
+ self.p_dropout = p_dropout
39
+ self.window_size = window_size
40
+
41
+ self.drop = nn.Dropout(p_dropout)
42
+ self.attn_layers = nn.ModuleList()
43
+ self.norm_layers_1 = nn.ModuleList()
44
+ self.ffn_layers = nn.ModuleList()
45
+ self.norm_layers_2 = nn.ModuleList()
46
+ for i in range(self.n_layers):
47
+ self.attn_layers.append(
48
+ MultiHeadAttention(
49
+ hidden_channels,
50
+ hidden_channels,
51
+ n_heads,
52
+ p_dropout=p_dropout,
53
+ window_size=window_size,
54
+ )
55
+ )
56
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
57
+ self.ffn_layers.append(
58
+ FFN(
59
+ hidden_channels,
60
+ hidden_channels,
61
+ filter_channels,
62
+ kernel_size,
63
+ p_dropout=p_dropout,
64
+ )
65
+ )
66
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
67
+
68
+ def forward(self, x, x_mask):
69
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
70
+ x = x * x_mask
71
+ for i in range(self.n_layers):
72
+ y = self.attn_layers[i](x, x, attn_mask)
73
+ y = self.drop(y)
74
+ x = self.norm_layers_1[i](x + y)
75
+
76
+ y = self.ffn_layers[i](x, x_mask)
77
+ y = self.drop(y)
78
+ x = self.norm_layers_2[i](x + y)
79
+ x = x * x_mask
80
+ return x
81
+
82
+
83
+ class Decoder(nn.Module):
84
+ def __init__(
85
+ self,
86
+ hidden_channels,
87
+ filter_channels,
88
+ n_heads,
89
+ n_layers,
90
+ kernel_size=1,
91
+ p_dropout=0.0,
92
+ proximal_bias=False,
93
+ proximal_init=True,
94
+ **kwargs
95
+ ):
96
+ super().__init__()
97
+ self.hidden_channels = hidden_channels
98
+ self.filter_channels = filter_channels
99
+ self.n_heads = n_heads
100
+ self.n_layers = n_layers
101
+ self.kernel_size = kernel_size
102
+ self.p_dropout = p_dropout
103
+ self.proximal_bias = proximal_bias
104
+ self.proximal_init = proximal_init
105
+
106
+ self.drop = nn.Dropout(p_dropout)
107
+ self.self_attn_layers = nn.ModuleList()
108
+ self.norm_layers_0 = nn.ModuleList()
109
+ self.encdec_attn_layers = nn.ModuleList()
110
+ self.norm_layers_1 = nn.ModuleList()
111
+ self.ffn_layers = nn.ModuleList()
112
+ self.norm_layers_2 = nn.ModuleList()
113
+ for i in range(self.n_layers):
114
+ self.self_attn_layers.append(
115
+ MultiHeadAttention(
116
+ hidden_channels,
117
+ hidden_channels,
118
+ n_heads,
119
+ p_dropout=p_dropout,
120
+ proximal_bias=proximal_bias,
121
+ proximal_init=proximal_init,
122
+ )
123
+ )
124
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
125
+ self.encdec_attn_layers.append(
126
+ MultiHeadAttention(
127
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
128
+ )
129
+ )
130
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
131
+ self.ffn_layers.append(
132
+ FFN(
133
+ hidden_channels,
134
+ hidden_channels,
135
+ filter_channels,
136
+ kernel_size,
137
+ p_dropout=p_dropout,
138
+ causal=True,
139
+ )
140
+ )
141
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
142
+
143
+ def forward(self, x, x_mask, h, h_mask):
144
+ """
145
+ x: decoder input
146
+ h: encoder output
147
+ """
148
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
149
+ device=x.device, dtype=x.dtype
150
+ )
151
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
152
+ x = x * x_mask
153
+ for i in range(self.n_layers):
154
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
155
+ y = self.drop(y)
156
+ x = self.norm_layers_0[i](x + y)
157
+
158
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
159
+ y = self.drop(y)
160
+ x = self.norm_layers_1[i](x + y)
161
+
162
+ y = self.ffn_layers[i](x, x_mask)
163
+ y = self.drop(y)
164
+ x = self.norm_layers_2[i](x + y)
165
+ x = x * x_mask
166
+ return x
167
+
168
+
169
+ class MultiHeadAttention(nn.Module):
170
+ def __init__(
171
+ self,
172
+ channels,
173
+ out_channels,
174
+ n_heads,
175
+ p_dropout=0.0,
176
+ window_size=None,
177
+ heads_share=True,
178
+ block_length=None,
179
+ proximal_bias=False,
180
+ proximal_init=False,
181
+ ):
182
+ super().__init__()
183
+ assert channels % n_heads == 0
184
+
185
+ self.channels = channels
186
+ self.out_channels = out_channels
187
+ self.n_heads = n_heads
188
+ self.p_dropout = p_dropout
189
+ self.window_size = window_size
190
+ self.heads_share = heads_share
191
+ self.block_length = block_length
192
+ self.proximal_bias = proximal_bias
193
+ self.proximal_init = proximal_init
194
+ self.attn = None
195
+
196
+ self.k_channels = channels // n_heads
197
+ self.conv_q = nn.Conv1d(channels, channels, 1)
198
+ self.conv_k = nn.Conv1d(channels, channels, 1)
199
+ self.conv_v = nn.Conv1d(channels, channels, 1)
200
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
201
+ self.drop = nn.Dropout(p_dropout)
202
+
203
+ if window_size is not None:
204
+ n_heads_rel = 1 if heads_share else n_heads
205
+ rel_stddev = self.k_channels**-0.5
206
+ self.emb_rel_k = nn.Parameter(
207
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
208
+ * rel_stddev
209
+ )
210
+ self.emb_rel_v = nn.Parameter(
211
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
212
+ * rel_stddev
213
+ )
214
+
215
+ nn.init.xavier_uniform_(self.conv_q.weight)
216
+ nn.init.xavier_uniform_(self.conv_k.weight)
217
+ nn.init.xavier_uniform_(self.conv_v.weight)
218
+ if proximal_init:
219
+ with torch.no_grad():
220
+ self.conv_k.weight.copy_(self.conv_q.weight)
221
+ self.conv_k.bias.copy_(self.conv_q.bias)
222
+
223
+ def forward(self, x, c, attn_mask=None):
224
+ q = self.conv_q(x)
225
+ k = self.conv_k(c)
226
+ v = self.conv_v(c)
227
+
228
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
229
+
230
+ x = self.conv_o(x)
231
+ return x
232
+
233
+ def attention(self, query, key, value, mask=None):
234
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
235
+ b, d, t_s, t_t = (*key.size(), query.size(2))
236
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
237
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
238
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
239
+
240
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
241
+ if self.window_size is not None:
242
+ assert (
243
+ t_s == t_t
244
+ ), "Relative attention is only available for self-attention."
245
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
246
+ rel_logits = self._matmul_with_relative_keys(
247
+ query / math.sqrt(self.k_channels), key_relative_embeddings
248
+ )
249
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
250
+ scores = scores + scores_local
251
+ if self.proximal_bias:
252
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
253
+ scores = scores + self._attention_bias_proximal(t_s).to(
254
+ device=scores.device, dtype=scores.dtype
255
+ )
256
+ if mask is not None:
257
+ scores = scores.masked_fill(mask == 0, -1e4)
258
+ if self.block_length is not None:
259
+ assert (
260
+ t_s == t_t
261
+ ), "Local attention is only available for self-attention."
262
+ block_mask = (
263
+ torch.ones_like(scores)
264
+ .triu(-self.block_length)
265
+ .tril(self.block_length)
266
+ )
267
+ scores = scores.masked_fill(block_mask == 0, -1e4)
268
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
269
+ p_attn = self.drop(p_attn)
270
+ output = torch.matmul(p_attn, value)
271
+ if self.window_size is not None:
272
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
273
+ value_relative_embeddings = self._get_relative_embeddings(
274
+ self.emb_rel_v, t_s
275
+ )
276
+ output = output + self._matmul_with_relative_values(
277
+ relative_weights, value_relative_embeddings
278
+ )
279
+ output = (
280
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
281
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
282
+ return output, p_attn
283
+
284
+ def _matmul_with_relative_values(self, x, y):
285
+ """
286
+ x: [b, h, l, m]
287
+ y: [h or 1, m, d]
288
+ ret: [b, h, l, d]
289
+ """
290
+ ret = torch.matmul(x, y.unsqueeze(0))
291
+ return ret
292
+
293
+ def _matmul_with_relative_keys(self, x, y):
294
+ """
295
+ x: [b, h, l, d]
296
+ y: [h or 1, m, d]
297
+ ret: [b, h, l, m]
298
+ """
299
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
300
+ return ret
301
+
302
+ def _get_relative_embeddings(self, relative_embeddings, length):
303
+ max_relative_position = 2 * self.window_size + 1
304
+ # Pad first before slice to avoid using cond ops.
305
+ pad_length = max(length - (self.window_size + 1), 0)
306
+ slice_start_position = max((self.window_size + 1) - length, 0)
307
+ slice_end_position = slice_start_position + 2 * length - 1
308
+ if pad_length > 0:
309
+ padded_relative_embeddings = F.pad(
310
+ relative_embeddings,
311
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
312
+ )
313
+ else:
314
+ padded_relative_embeddings = relative_embeddings
315
+ used_relative_embeddings = padded_relative_embeddings[
316
+ :, slice_start_position:slice_end_position
317
+ ]
318
+ return used_relative_embeddings
319
+
320
+ def _relative_position_to_absolute_position(self, x):
321
+ """
322
+ x: [b, h, l, 2*l-1]
323
+ ret: [b, h, l, l]
324
+ """
325
+ batch, heads, length, _ = x.size()
326
+ # Concat columns of pad to shift from relative to absolute indexing.
327
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
328
+
329
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
330
+ x_flat = x.view([batch, heads, length * 2 * length])
331
+ x_flat = F.pad(
332
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
333
+ )
334
+
335
+ # Reshape and slice out the padded elements.
336
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
337
+ :, :, :length, length - 1 :
338
+ ]
339
+ return x_final
340
+
341
+ def _absolute_position_to_relative_position(self, x):
342
+ """
343
+ x: [b, h, l, l]
344
+ ret: [b, h, l, 2*l-1]
345
+ """
346
+ batch, heads, length, _ = x.size()
347
+ # padd along column
348
+ x = F.pad(
349
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
350
+ )
351
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
352
+ # add 0's in the beginning that will skew the elements after reshape
353
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
354
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
355
+ return x_final
356
+
357
+ def _attention_bias_proximal(self, length):
358
+ """Bias for self-attention to encourage attention to close positions.
359
+ Args:
360
+ length: an integer scalar.
361
+ Returns:
362
+ a Tensor with shape [1, 1, length, length]
363
+ """
364
+ r = torch.arange(length, dtype=torch.float32)
365
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
366
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
367
+
368
+
369
+ class FFN(nn.Module):
370
+ def __init__(
371
+ self,
372
+ in_channels,
373
+ out_channels,
374
+ filter_channels,
375
+ kernel_size,
376
+ p_dropout=0.0,
377
+ activation=None,
378
+ causal=False,
379
+ ):
380
+ super().__init__()
381
+ self.in_channels = in_channels
382
+ self.out_channels = out_channels
383
+ self.filter_channels = filter_channels
384
+ self.kernel_size = kernel_size
385
+ self.p_dropout = p_dropout
386
+ self.activation = activation
387
+ self.causal = causal
388
+
389
+ if causal:
390
+ self.padding = self._causal_padding
391
+ else:
392
+ self.padding = self._same_padding
393
+
394
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
395
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
396
+ self.drop = nn.Dropout(p_dropout)
397
+
398
+ def forward(self, x, x_mask):
399
+ x = self.conv_1(self.padding(x * x_mask))
400
+ if self.activation == "gelu":
401
+ x = x * torch.sigmoid(1.702 * x)
402
+ else:
403
+ x = torch.relu(x)
404
+ x = self.drop(x)
405
+ x = self.conv_2(self.padding(x * x_mask))
406
+ return x * x_mask
407
+
408
+ def _causal_padding(self, x):
409
+ if self.kernel_size == 1:
410
+ return x
411
+ pad_l = self.kernel_size - 1
412
+ pad_r = 0
413
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
414
+ x = F.pad(x, commons.convert_pad_shape(padding))
415
+ return x
416
+
417
+ def _same_padding(self, x):
418
+ if self.kernel_size == 1:
419
+ return x
420
+ pad_l = (self.kernel_size - 1) // 2
421
+ pad_r = self.kernel_size // 2
422
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
423
+ x = F.pad(x, commons.convert_pad_shape(padding))
424
+ return x
app/rvc/infer_pack/audio.py ADDED
@@ -0,0 +1,21 @@
+ import ffmpeg
+ import numpy as np
+
+
+ def load_audio(file, sr):
+     try:
+         # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26
+         # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+         # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+         file = (
+             file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
+         )  # guard against paths pasted with stray spaces, quotes or newlines
+         out, _ = (
+             ffmpeg.input(file, threads=0)
+             .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr)
+             .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+         )
+     except Exception as e:
+         raise RuntimeError(f"Failed to load audio: {e}")
+
+     return np.frombuffer(out, np.float32).flatten()
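As a usage sketch (the file name is illustrative), `load_audio` returns a mono, float32 NumPy array already resampled to the requested rate:

```python
# Any local audio file ffmpeg can decode will do here
samples = load_audio("speech.wav", 16000)
print(samples.dtype, samples.shape)  # float32, one-dimensional array of 16 kHz samples
```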
app/rvc/infer_pack/commons.py ADDED
@@ -0,0 +1,173 @@
1
+ """
2
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
3
+
4
+ copyright: RVC-Project
5
+ license: MIT
6
+ """
7
+
8
+ import math
9
+ import numpy as np
10
+ import torch
11
+ from torch import nn
12
+ from torch.nn import functional as F
13
+
14
+
15
+ def init_weights(m, mean=0.0, std=0.01):
16
+ classname = m.__class__.__name__
17
+ if classname.find("Conv") != -1:
18
+ m.weight.data.normal_(mean, std)
19
+
20
+
21
+ def get_padding(kernel_size, dilation=1):
22
+ return int((kernel_size * dilation - dilation) / 2)
23
+
24
+
25
+ def convert_pad_shape(pad_shape):
26
+ l = pad_shape[::-1]
27
+ pad_shape = [item for sublist in l for item in sublist]
28
+ return pad_shape
29
+
30
+
31
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
32
+ """KL(P||Q)"""
33
+ kl = (logs_q - logs_p) - 0.5
34
+ kl += (
35
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
36
+ )
37
+ return kl
38
+
39
+
40
+ def rand_gumbel(shape):
41
+ """Sample from the Gumbel distribution, protect from overflows."""
42
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
43
+ return -torch.log(-torch.log(uniform_samples))
44
+
45
+
46
+ def rand_gumbel_like(x):
47
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
48
+ return g
49
+
50
+
51
+ def slice_segments(x, ids_str, segment_size=4):
52
+ ret = torch.zeros_like(x[:, :, :segment_size])
53
+ for i in range(x.size(0)):
54
+ idx_str = ids_str[i]
55
+ idx_end = idx_str + segment_size
56
+ ret[i] = x[i, :, idx_str:idx_end]
57
+ return ret
58
+
59
+
60
+ def slice_segments2(x, ids_str, segment_size=4):
61
+ ret = torch.zeros_like(x[:, :segment_size])
62
+ for i in range(x.size(0)):
63
+ idx_str = ids_str[i]
64
+ idx_end = idx_str + segment_size
65
+ ret[i] = x[i, idx_str:idx_end]
66
+ return ret
67
+
68
+
69
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
70
+ b, d, t = x.size()
71
+ if x_lengths is None:
72
+ x_lengths = t
73
+ ids_str_max = x_lengths - segment_size + 1
74
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
75
+ ret = slice_segments(x, ids_str, segment_size)
76
+ return ret, ids_str
77
+
78
+
79
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
80
+ position = torch.arange(length, dtype=torch.float)
81
+ num_timescales = channels // 2
82
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
83
+ num_timescales - 1
84
+ )
85
+ inv_timescales = min_timescale * torch.exp(
86
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
87
+ )
88
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
89
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
90
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
91
+ signal = signal.view(1, channels, length)
92
+ return signal
93
+
94
+
95
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
96
+ b, channels, length = x.size()
97
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
98
+ return x + signal.to(dtype=x.dtype, device=x.device)
99
+
100
+
101
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
102
+ b, channels, length = x.size()
103
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
104
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
105
+
106
+
107
+ def subsequent_mask(length):
108
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
109
+ return mask
110
+
111
+
112
+ @torch.jit.script
113
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
114
+ n_channels_int = n_channels[0]
115
+ in_act = input_a + input_b
116
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
117
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
118
+ acts = t_act * s_act
119
+ return acts
120
+
121
+
122
+ def convert_pad_shape(pad_shape):
123
+ l = pad_shape[::-1]
124
+ pad_shape = [item for sublist in l for item in sublist]
125
+ return pad_shape
126
+
127
+
128
+ def shift_1d(x):
129
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
130
+ return x
131
+
132
+
133
+ def sequence_mask(length, max_length=None):
134
+ if max_length is None:
135
+ max_length = length.max()
136
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
137
+ return x.unsqueeze(0) < length.unsqueeze(1)
138
+
139
+
140
+ def generate_path(duration, mask):
141
+ """
142
+ duration: [b, 1, t_x]
143
+ mask: [b, 1, t_y, t_x]
144
+ """
145
+ device = duration.device
146
+
147
+ b, _, t_y, t_x = mask.shape
148
+ cum_duration = torch.cumsum(duration, -1)
149
+
150
+ cum_duration_flat = cum_duration.view(b * t_x)
151
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
152
+ path = path.view(b, t_x, t_y)
153
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
154
+ path = path.unsqueeze(1).transpose(2, 3) * mask
155
+ return path
156
+
157
+
158
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
159
+ if isinstance(parameters, torch.Tensor):
160
+ parameters = [parameters]
161
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
162
+ norm_type = float(norm_type)
163
+ if clip_value is not None:
164
+ clip_value = float(clip_value)
165
+
166
+ total_norm = 0
167
+ for p in parameters:
168
+ param_norm = p.grad.data.norm(norm_type)
169
+ total_norm += param_norm.item() ** norm_type
170
+ if clip_value is not None:
171
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
172
+ total_norm = total_norm ** (1.0 / norm_type)
173
+ return total_norm
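A small sketch of two helpers above that the rest of infer_pack leans on, assuming only PyTorch: sequence_mask builds a boolean [batch, max_len] mask from per-item lengths, and convert_pad_shape flattens a per-dimension pad spec into the last-dimension-first list that torch.nn.functional.pad expects.

import torch
from app.rvc.infer_pack import commons

lengths = torch.tensor([3, 5])
mask = commons.sequence_mask(lengths)                        # [2, 5]; True where index < length
pad = commons.convert_pad_shape([[0, 0], [0, 0], [1, 0]])    # -> [1, 0, 0, 0, 0, 0]
print(mask)
print(pad)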
app/rvc/infer_pack/models.py ADDED
@@ -0,0 +1,1149 @@
1
+ """
2
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
3
+
4
+ copyright: RVC-Project
5
+ license: MIT
6
+ """
7
+
8
+ import math, pdb, os
9
+ from time import time as ttime
10
+ import torch
11
+ from torch import nn
12
+ from torch.nn import functional as F
13
+ from app.rvc.infer_pack import modules
14
+ from app.rvc.infer_pack import attentions
15
+ from app.rvc.infer_pack import commons
16
+ from app.rvc.infer_pack.commons import init_weights, get_padding
17
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
18
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
19
+ from app.rvc.infer_pack.commons import init_weights
20
+ import numpy as np
21
+ from app.rvc.infer_pack import commons
22
+
23
+
24
+ class TextEncoder256(nn.Module):
25
+ def __init__(
26
+ self,
27
+ out_channels,
28
+ hidden_channels,
29
+ filter_channels,
30
+ n_heads,
31
+ n_layers,
32
+ kernel_size,
33
+ p_dropout,
34
+ f0=True,
35
+ ):
36
+ super().__init__()
37
+ self.out_channels = out_channels
38
+ self.hidden_channels = hidden_channels
39
+ self.filter_channels = filter_channels
40
+ self.n_heads = n_heads
41
+ self.n_layers = n_layers
42
+ self.kernel_size = kernel_size
43
+ self.p_dropout = p_dropout
44
+ self.emb_phone = nn.Linear(256, hidden_channels)
45
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
46
+ if f0 == True:
47
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
48
+ self.encoder = attentions.Encoder(
49
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
50
+ )
51
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
52
+
53
+ def forward(self, phone, pitch, lengths):
54
+ if pitch == None:
55
+ x = self.emb_phone(phone)
56
+ else:
57
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
58
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
59
+ x = self.lrelu(x)
60
+ x = torch.transpose(x, 1, -1) # [b, h, t]
61
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
62
+ x.dtype
63
+ )
64
+ x = self.encoder(x * x_mask, x_mask)
65
+ stats = self.proj(x) * x_mask
66
+
67
+ m, logs = torch.split(stats, self.out_channels, dim=1)
68
+ return m, logs, x_mask
69
+
70
+
71
+ class TextEncoder768(nn.Module):
72
+ def __init__(
73
+ self,
74
+ out_channels,
75
+ hidden_channels,
76
+ filter_channels,
77
+ n_heads,
78
+ n_layers,
79
+ kernel_size,
80
+ p_dropout,
81
+ f0=True,
82
+ ):
83
+ super().__init__()
84
+ self.out_channels = out_channels
85
+ self.hidden_channels = hidden_channels
86
+ self.filter_channels = filter_channels
87
+ self.n_heads = n_heads
88
+ self.n_layers = n_layers
89
+ self.kernel_size = kernel_size
90
+ self.p_dropout = p_dropout
91
+ self.emb_phone = nn.Linear(768, hidden_channels)
92
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
93
+ if f0 == True:
94
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
95
+ self.encoder = attentions.Encoder(
96
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
97
+ )
98
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
99
+
100
+ def forward(self, phone, pitch, lengths):
101
+ if pitch == None:
102
+ x = self.emb_phone(phone)
103
+ else:
104
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
105
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
106
+ x = self.lrelu(x)
107
+ x = torch.transpose(x, 1, -1) # [b, h, t]
108
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
109
+ x.dtype
110
+ )
111
+ x = self.encoder(x * x_mask, x_mask)
112
+ stats = self.proj(x) * x_mask
113
+
114
+ m, logs = torch.split(stats, self.out_channels, dim=1)
115
+ return m, logs, x_mask
116
+
117
+
118
+ class ResidualCouplingBlock(nn.Module):
119
+ def __init__(
120
+ self,
121
+ channels,
122
+ hidden_channels,
123
+ kernel_size,
124
+ dilation_rate,
125
+ n_layers,
126
+ n_flows=4,
127
+ gin_channels=0,
128
+ ):
129
+ super().__init__()
130
+ self.channels = channels
131
+ self.hidden_channels = hidden_channels
132
+ self.kernel_size = kernel_size
133
+ self.dilation_rate = dilation_rate
134
+ self.n_layers = n_layers
135
+ self.n_flows = n_flows
136
+ self.gin_channels = gin_channels
137
+
138
+ self.flows = nn.ModuleList()
139
+ for i in range(n_flows):
140
+ self.flows.append(
141
+ modules.ResidualCouplingLayer(
142
+ channels,
143
+ hidden_channels,
144
+ kernel_size,
145
+ dilation_rate,
146
+ n_layers,
147
+ gin_channels=gin_channels,
148
+ mean_only=True,
149
+ )
150
+ )
151
+ self.flows.append(modules.Flip())
152
+
153
+ def forward(self, x, x_mask, g=None, reverse=False):
154
+ if not reverse:
155
+ for flow in self.flows:
156
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
157
+ else:
158
+ for flow in reversed(self.flows):
159
+ x = flow(x, x_mask, g=g, reverse=reverse)
160
+ return x
161
+
162
+ def remove_weight_norm(self):
163
+ for i in range(self.n_flows):
164
+ self.flows[i * 2].remove_weight_norm()
165
+
166
+
167
+ class PosteriorEncoder(nn.Module):
168
+ def __init__(
169
+ self,
170
+ in_channels,
171
+ out_channels,
172
+ hidden_channels,
173
+ kernel_size,
174
+ dilation_rate,
175
+ n_layers,
176
+ gin_channels=0,
177
+ ):
178
+ super().__init__()
179
+ self.in_channels = in_channels
180
+ self.out_channels = out_channels
181
+ self.hidden_channels = hidden_channels
182
+ self.kernel_size = kernel_size
183
+ self.dilation_rate = dilation_rate
184
+ self.n_layers = n_layers
185
+ self.gin_channels = gin_channels
186
+
187
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
188
+ self.enc = modules.WN(
189
+ hidden_channels,
190
+ kernel_size,
191
+ dilation_rate,
192
+ n_layers,
193
+ gin_channels=gin_channels,
194
+ )
195
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
196
+
197
+ def forward(self, x, x_lengths, g=None):
198
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
199
+ x.dtype
200
+ )
201
+ x = self.pre(x) * x_mask
202
+ x = self.enc(x, x_mask, g=g)
203
+ stats = self.proj(x) * x_mask
204
+ m, logs = torch.split(stats, self.out_channels, dim=1)
205
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
206
+ return z, m, logs, x_mask
207
+
208
+ def remove_weight_norm(self):
209
+ self.enc.remove_weight_norm()
210
+
211
+
212
+ class Generator(torch.nn.Module):
213
+ def __init__(
214
+ self,
215
+ initial_channel,
216
+ resblock,
217
+ resblock_kernel_sizes,
218
+ resblock_dilation_sizes,
219
+ upsample_rates,
220
+ upsample_initial_channel,
221
+ upsample_kernel_sizes,
222
+ gin_channels=0,
223
+ ):
224
+ super(Generator, self).__init__()
225
+ self.num_kernels = len(resblock_kernel_sizes)
226
+ self.num_upsamples = len(upsample_rates)
227
+ self.conv_pre = Conv1d(
228
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
229
+ )
230
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
231
+
232
+ self.ups = nn.ModuleList()
233
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
234
+ self.ups.append(
235
+ weight_norm(
236
+ ConvTranspose1d(
237
+ upsample_initial_channel // (2**i),
238
+ upsample_initial_channel // (2 ** (i + 1)),
239
+ k,
240
+ u,
241
+ padding=(k - u) // 2,
242
+ )
243
+ )
244
+ )
245
+
246
+ self.resblocks = nn.ModuleList()
247
+ for i in range(len(self.ups)):
248
+ ch = upsample_initial_channel // (2 ** (i + 1))
249
+ for j, (k, d) in enumerate(
250
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
251
+ ):
252
+ self.resblocks.append(resblock(ch, k, d))
253
+
254
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
255
+ self.ups.apply(init_weights)
256
+
257
+ if gin_channels != 0:
258
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
259
+
260
+ def forward(self, x, g=None):
261
+ x = self.conv_pre(x)
262
+ if g is not None:
263
+ x = x + self.cond(g)
264
+
265
+ for i in range(self.num_upsamples):
266
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
267
+ x = self.ups[i](x)
268
+ xs = None
269
+ for j in range(self.num_kernels):
270
+ if xs is None:
271
+ xs = self.resblocks[i * self.num_kernels + j](x)
272
+ else:
273
+ xs += self.resblocks[i * self.num_kernels + j](x)
274
+ x = xs / self.num_kernels
275
+ x = F.leaky_relu(x)
276
+ x = self.conv_post(x)
277
+ x = torch.tanh(x)
278
+
279
+ return x
280
+
281
+ def remove_weight_norm(self):
282
+ for l in self.ups:
283
+ remove_weight_norm(l)
284
+ for l in self.resblocks:
285
+ l.remove_weight_norm()
286
+
287
+
288
+ class SineGen(torch.nn.Module):
289
+ """Definition of sine generator
290
+ SineGen(samp_rate, harmonic_num = 0,
291
+ sine_amp = 0.1, noise_std = 0.003,
292
+ voiced_threshold = 0,
293
+ flag_for_pulse=False)
294
+ samp_rate: sampling rate in Hz
295
+ harmonic_num: number of harmonic overtones (default 0)
296
+ sine_amp: amplitude of the sine waveform (default 0.1)
297
+ noise_std: std of Gaussian noise (default 0.003)
298
+ voiced_threshold: F0 threshold for U/V classification (default 0)
299
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
300
+ Note: when flag_for_pulse is True, the first time step of a voiced
301
+ segment is always sin(np.pi) or cos(0)
302
+ """
303
+
304
+ def __init__(
305
+ self,
306
+ samp_rate,
307
+ harmonic_num=0,
308
+ sine_amp=0.1,
309
+ noise_std=0.003,
310
+ voiced_threshold=0,
311
+ flag_for_pulse=False,
312
+ ):
313
+ super(SineGen, self).__init__()
314
+ self.sine_amp = sine_amp
315
+ self.noise_std = noise_std
316
+ self.harmonic_num = harmonic_num
317
+ self.dim = self.harmonic_num + 1
318
+ self.sampling_rate = samp_rate
319
+ self.voiced_threshold = voiced_threshold
320
+
321
+ def _f02uv(self, f0):
322
+ # generate uv signal
323
+ uv = torch.ones_like(f0)
324
+ uv = uv * (f0 > self.voiced_threshold)
325
+ return uv
326
+
327
+ def forward(self, f0, upp):
328
+ """sine_tensor, uv = forward(f0)
329
+ input F0: tensor(batchsize=1, length, dim=1)
330
+ f0 for unvoiced steps should be 0
331
+ output sine_tensor: tensor(batchsize=1, length, dim)
332
+ output uv: tensor(batchsize=1, length, 1)
333
+ """
334
+ with torch.no_grad():
335
+ f0 = f0[:, None].transpose(1, 2)
336
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
337
+ # fundamental component
338
+ f0_buf[:, :, 0] = f0[:, :, 0]
339
+ for idx in np.arange(self.harmonic_num):
340
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
341
+ idx + 2
342
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
343
+ rad_values = (f0_buf / self.sampling_rate) % 1  # the %1 means the n_har products cannot be optimized away later
344
+ rand_ini = torch.rand(
345
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
346
+ )
347
+ rand_ini[:, 0] = 0
348
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
349
+ tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  (applying %1 here would prevent optimizing the cumsum below)
350
+ tmp_over_one *= upp
351
+ tmp_over_one = F.interpolate(
352
+ tmp_over_one.transpose(2, 1),
353
+ scale_factor=upp,
354
+ mode="linear",
355
+ align_corners=True,
356
+ ).transpose(2, 1)
357
+ rad_values = F.interpolate(
358
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
359
+ ).transpose(
360
+ 2, 1
361
+ ) #######
362
+ tmp_over_one %= 1
363
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
364
+ cumsum_shift = torch.zeros_like(rad_values)
365
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
366
+ sine_waves = torch.sin(
367
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
368
+ )
369
+ sine_waves = sine_waves * self.sine_amp
370
+ uv = self._f02uv(f0)
371
+ uv = F.interpolate(
372
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
373
+ ).transpose(2, 1)
374
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
375
+ noise = noise_amp * torch.randn_like(sine_waves)
376
+ sine_waves = sine_waves * uv + noise
377
+ return sine_waves, uv, noise
378
+
379
+
380
+ class SourceModuleHnNSF(torch.nn.Module):
381
+ """SourceModule for hn-nsf
382
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
383
+ add_noise_std=0.003, voiced_threshod=0)
384
+ sampling_rate: sampling_rate in Hz
385
+ harmonic_num: number of harmonic above F0 (default: 0)
386
+ sine_amp: amplitude of sine source signal (default: 0.1)
387
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
388
+ note that amplitude of noise in unvoiced is decided
389
+ by sine_amp
390
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
391
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
392
+ F0_sampled (batchsize, length, 1)
393
+ Sine_source (batchsize, length, 1)
394
+ noise_source (batchsize, length 1)
395
+ uv (batchsize, length, 1)
396
+ """
397
+
398
+ def __init__(
399
+ self,
400
+ sampling_rate,
401
+ harmonic_num=0,
402
+ sine_amp=0.1,
403
+ add_noise_std=0.003,
404
+ voiced_threshod=0,
405
+ is_half=True,
406
+ ):
407
+ super(SourceModuleHnNSF, self).__init__()
408
+
409
+ self.sine_amp = sine_amp
410
+ self.noise_std = add_noise_std
411
+ self.is_half = is_half
412
+ # to produce sine waveforms
413
+ self.l_sin_gen = SineGen(
414
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
415
+ )
416
+
417
+ # to merge source harmonics into a single excitation
418
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
419
+ self.l_tanh = torch.nn.Tanh()
420
+
421
+ def forward(self, x, upp=None):
422
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
423
+ if self.is_half:
424
+ sine_wavs = sine_wavs.half()
425
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
426
+ return sine_merge, None, None # noise, uv
427
+
428
+
429
+ class GeneratorNSF(torch.nn.Module):
430
+ def __init__(
431
+ self,
432
+ initial_channel,
433
+ resblock,
434
+ resblock_kernel_sizes,
435
+ resblock_dilation_sizes,
436
+ upsample_rates,
437
+ upsample_initial_channel,
438
+ upsample_kernel_sizes,
439
+ gin_channels,
440
+ sr,
441
+ is_half=False,
442
+ ):
443
+ super(GeneratorNSF, self).__init__()
444
+ self.num_kernels = len(resblock_kernel_sizes)
445
+ self.num_upsamples = len(upsample_rates)
446
+
447
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
448
+ self.m_source = SourceModuleHnNSF(
449
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
450
+ )
451
+ self.noise_convs = nn.ModuleList()
452
+ self.conv_pre = Conv1d(
453
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
454
+ )
455
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
456
+
457
+ self.ups = nn.ModuleList()
458
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
459
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
460
+ self.ups.append(
461
+ weight_norm(
462
+ ConvTranspose1d(
463
+ upsample_initial_channel // (2**i),
464
+ upsample_initial_channel // (2 ** (i + 1)),
465
+ k,
466
+ u,
467
+ padding=(k - u) // 2,
468
+ )
469
+ )
470
+ )
471
+ if i + 1 < len(upsample_rates):
472
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
473
+ self.noise_convs.append(
474
+ Conv1d(
475
+ 1,
476
+ c_cur,
477
+ kernel_size=stride_f0 * 2,
478
+ stride=stride_f0,
479
+ padding=stride_f0 // 2,
480
+ )
481
+ )
482
+ else:
483
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
484
+
485
+ self.resblocks = nn.ModuleList()
486
+ for i in range(len(self.ups)):
487
+ ch = upsample_initial_channel // (2 ** (i + 1))
488
+ for j, (k, d) in enumerate(
489
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
490
+ ):
491
+ self.resblocks.append(resblock(ch, k, d))
492
+
493
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
494
+ self.ups.apply(init_weights)
495
+
496
+ if gin_channels != 0:
497
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
498
+
499
+ self.upp = np.prod(upsample_rates)
500
+
501
+ def forward(self, x, f0, g=None):
502
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
503
+ har_source = har_source.transpose(1, 2)
504
+ x = self.conv_pre(x)
505
+ if g is not None:
506
+ x = x + self.cond(g)
507
+
508
+ for i in range(self.num_upsamples):
509
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
510
+ x = self.ups[i](x)
511
+ x_source = self.noise_convs[i](har_source)
512
+ x = x + x_source
513
+ xs = None
514
+ for j in range(self.num_kernels):
515
+ if xs is None:
516
+ xs = self.resblocks[i * self.num_kernels + j](x)
517
+ else:
518
+ xs += self.resblocks[i * self.num_kernels + j](x)
519
+ x = xs / self.num_kernels
520
+ x = F.leaky_relu(x)
521
+ x = self.conv_post(x)
522
+ x = torch.tanh(x)
523
+ return x
524
+
525
+ def remove_weight_norm(self):
526
+ for l in self.ups:
527
+ remove_weight_norm(l)
528
+ for l in self.resblocks:
529
+ l.remove_weight_norm()
530
+
531
+
532
+ sr2sr = {
533
+ "32k": 32000,
534
+ "40k": 40000,
535
+ "48k": 48000,
536
+ }
537
+
538
+
539
+ class SynthesizerTrnMs256NSFsid(nn.Module):
540
+ def __init__(
541
+ self,
542
+ spec_channels,
543
+ segment_size,
544
+ inter_channels,
545
+ hidden_channels,
546
+ filter_channels,
547
+ n_heads,
548
+ n_layers,
549
+ kernel_size,
550
+ p_dropout,
551
+ resblock,
552
+ resblock_kernel_sizes,
553
+ resblock_dilation_sizes,
554
+ upsample_rates,
555
+ upsample_initial_channel,
556
+ upsample_kernel_sizes,
557
+ spk_embed_dim,
558
+ gin_channels,
559
+ sr,
560
+ **kwargs
561
+ ):
562
+ super().__init__()
563
+ if isinstance(sr, str):
564
+ sr = sr2sr[sr]
565
+ self.spec_channels = spec_channels
566
+ self.inter_channels = inter_channels
567
+ self.hidden_channels = hidden_channels
568
+ self.filter_channels = filter_channels
569
+ self.n_heads = n_heads
570
+ self.n_layers = n_layers
571
+ self.kernel_size = kernel_size
572
+ self.p_dropout = p_dropout
573
+ self.resblock = resblock
574
+ self.resblock_kernel_sizes = resblock_kernel_sizes
575
+ self.resblock_dilation_sizes = resblock_dilation_sizes
576
+ self.upsample_rates = upsample_rates
577
+ self.upsample_initial_channel = upsample_initial_channel
578
+ self.upsample_kernel_sizes = upsample_kernel_sizes
579
+ self.segment_size = segment_size
580
+ self.gin_channels = gin_channels
581
+ # self.hop_length = hop_length#
582
+ self.spk_embed_dim = spk_embed_dim
583
+ self.enc_p = TextEncoder256(
584
+ inter_channels,
585
+ hidden_channels,
586
+ filter_channels,
587
+ n_heads,
588
+ n_layers,
589
+ kernel_size,
590
+ p_dropout,
591
+ )
592
+ self.dec = GeneratorNSF(
593
+ inter_channels,
594
+ resblock,
595
+ resblock_kernel_sizes,
596
+ resblock_dilation_sizes,
597
+ upsample_rates,
598
+ upsample_initial_channel,
599
+ upsample_kernel_sizes,
600
+ gin_channels=gin_channels,
601
+ sr=sr,
602
+ is_half=kwargs["is_half"],
603
+ )
604
+ self.enc_q = PosteriorEncoder(
605
+ spec_channels,
606
+ inter_channels,
607
+ hidden_channels,
608
+ 5,
609
+ 1,
610
+ 16,
611
+ gin_channels=gin_channels,
612
+ )
613
+ self.flow = ResidualCouplingBlock(
614
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
615
+ )
616
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
617
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
618
+
619
+ def remove_weight_norm(self):
620
+ self.dec.remove_weight_norm()
621
+ self.flow.remove_weight_norm()
622
+ self.enc_q.remove_weight_norm()
623
+
624
+ def forward(
625
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
626
+ ): # ds is the speaker id, shape [bs, 1]
627
+ # print(1,pitch.shape)#[bs,t]
628
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the size-1 axis is t and broadcasts
629
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
630
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
631
+ z_p = self.flow(z, y_mask, g=g)
632
+ z_slice, ids_slice = commons.rand_slice_segments(
633
+ z, y_lengths, self.segment_size
634
+ )
635
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
636
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
637
+ # print(-2,pitchf.shape,z_slice.shape)
638
+ o = self.dec(z_slice, pitchf, g=g)
639
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
640
+
641
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
642
+ g = self.emb_g(sid).unsqueeze(-1)
643
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
644
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
645
+ if rate:
646
+ head = int(z_p.shape[2] * rate)
647
+ z_p = z_p[:, :, -head:]
648
+ x_mask = x_mask[:, :, -head:]
649
+ nsff0 = nsff0[:, -head:]
650
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
651
+ o = self.dec(z * x_mask, nsff0, g=g)
652
+ return o, x_mask, (z, z_p, m_p, logs_p)
653
+
654
+
655
+ class SynthesizerTrnMs768NSFsid(nn.Module):
656
+ def __init__(
657
+ self,
658
+ spec_channels,
659
+ segment_size,
660
+ inter_channels,
661
+ hidden_channels,
662
+ filter_channels,
663
+ n_heads,
664
+ n_layers,
665
+ kernel_size,
666
+ p_dropout,
667
+ resblock,
668
+ resblock_kernel_sizes,
669
+ resblock_dilation_sizes,
670
+ upsample_rates,
671
+ upsample_initial_channel,
672
+ upsample_kernel_sizes,
673
+ spk_embed_dim,
674
+ gin_channels,
675
+ sr,
676
+ **kwargs
677
+ ):
678
+ super().__init__()
679
+ if isinstance(sr, str):
680
+ sr = sr2sr[sr]
681
+ self.spec_channels = spec_channels
682
+ self.inter_channels = inter_channels
683
+ self.hidden_channels = hidden_channels
684
+ self.filter_channels = filter_channels
685
+ self.n_heads = n_heads
686
+ self.n_layers = n_layers
687
+ self.kernel_size = kernel_size
688
+ self.p_dropout = p_dropout
689
+ self.resblock = resblock
690
+ self.resblock_kernel_sizes = resblock_kernel_sizes
691
+ self.resblock_dilation_sizes = resblock_dilation_sizes
692
+ self.upsample_rates = upsample_rates
693
+ self.upsample_initial_channel = upsample_initial_channel
694
+ self.upsample_kernel_sizes = upsample_kernel_sizes
695
+ self.segment_size = segment_size
696
+ self.gin_channels = gin_channels
697
+ # self.hop_length = hop_length#
698
+ self.spk_embed_dim = spk_embed_dim
699
+ self.enc_p = TextEncoder768(
700
+ inter_channels,
701
+ hidden_channels,
702
+ filter_channels,
703
+ n_heads,
704
+ n_layers,
705
+ kernel_size,
706
+ p_dropout,
707
+ )
708
+ self.dec = GeneratorNSF(
709
+ inter_channels,
710
+ resblock,
711
+ resblock_kernel_sizes,
712
+ resblock_dilation_sizes,
713
+ upsample_rates,
714
+ upsample_initial_channel,
715
+ upsample_kernel_sizes,
716
+ gin_channels=gin_channels,
717
+ sr=sr,
718
+ is_half=kwargs["is_half"],
719
+ )
720
+ self.enc_q = PosteriorEncoder(
721
+ spec_channels,
722
+ inter_channels,
723
+ hidden_channels,
724
+ 5,
725
+ 1,
726
+ 16,
727
+ gin_channels=gin_channels,
728
+ )
729
+ self.flow = ResidualCouplingBlock(
730
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
731
+ )
732
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
733
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
734
+
735
+ def remove_weight_norm(self):
736
+ self.dec.remove_weight_norm()
737
+ self.flow.remove_weight_norm()
738
+ self.enc_q.remove_weight_norm()
739
+
740
+ def forward(
741
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
742
+ ): # ds is the speaker id, shape [bs, 1]
743
+ # print(1,pitch.shape)#[bs,t]
744
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the size-1 axis is t and broadcasts
745
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
746
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
747
+ z_p = self.flow(z, y_mask, g=g)
748
+ z_slice, ids_slice = commons.rand_slice_segments(
749
+ z, y_lengths, self.segment_size
750
+ )
751
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
752
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
753
+ # print(-2,pitchf.shape,z_slice.shape)
754
+ o = self.dec(z_slice, pitchf, g=g)
755
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
756
+
757
+ def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
758
+ g = self.emb_g(sid).unsqueeze(-1)
759
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
760
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
761
+ if rate:
762
+ head = int(z_p.shape[2] * rate)
763
+ z_p = z_p[:, :, -head:]
764
+ x_mask = x_mask[:, :, -head:]
765
+ nsff0 = nsff0[:, -head:]
766
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
767
+ o = self.dec(z * x_mask, nsff0, g=g)
768
+ return o, x_mask, (z, z_p, m_p, logs_p)
769
+
770
+
771
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
772
+ def __init__(
773
+ self,
774
+ spec_channels,
775
+ segment_size,
776
+ inter_channels,
777
+ hidden_channels,
778
+ filter_channels,
779
+ n_heads,
780
+ n_layers,
781
+ kernel_size,
782
+ p_dropout,
783
+ resblock,
784
+ resblock_kernel_sizes,
785
+ resblock_dilation_sizes,
786
+ upsample_rates,
787
+ upsample_initial_channel,
788
+ upsample_kernel_sizes,
789
+ spk_embed_dim,
790
+ gin_channels,
791
+ sr=None,
792
+ **kwargs
793
+ ):
794
+ super().__init__()
795
+ self.spec_channels = spec_channels
796
+ self.inter_channels = inter_channels
797
+ self.hidden_channels = hidden_channels
798
+ self.filter_channels = filter_channels
799
+ self.n_heads = n_heads
800
+ self.n_layers = n_layers
801
+ self.kernel_size = kernel_size
802
+ self.p_dropout = p_dropout
803
+ self.resblock = resblock
804
+ self.resblock_kernel_sizes = resblock_kernel_sizes
805
+ self.resblock_dilation_sizes = resblock_dilation_sizes
806
+ self.upsample_rates = upsample_rates
807
+ self.upsample_initial_channel = upsample_initial_channel
808
+ self.upsample_kernel_sizes = upsample_kernel_sizes
809
+ self.segment_size = segment_size
810
+ self.gin_channels = gin_channels
811
+ # self.hop_length = hop_length#
812
+ self.spk_embed_dim = spk_embed_dim
813
+ self.enc_p = TextEncoder256(
814
+ inter_channels,
815
+ hidden_channels,
816
+ filter_channels,
817
+ n_heads,
818
+ n_layers,
819
+ kernel_size,
820
+ p_dropout,
821
+ f0=False,
822
+ )
823
+ self.dec = Generator(
824
+ inter_channels,
825
+ resblock,
826
+ resblock_kernel_sizes,
827
+ resblock_dilation_sizes,
828
+ upsample_rates,
829
+ upsample_initial_channel,
830
+ upsample_kernel_sizes,
831
+ gin_channels=gin_channels,
832
+ )
833
+ self.enc_q = PosteriorEncoder(
834
+ spec_channels,
835
+ inter_channels,
836
+ hidden_channels,
837
+ 5,
838
+ 1,
839
+ 16,
840
+ gin_channels=gin_channels,
841
+ )
842
+ self.flow = ResidualCouplingBlock(
843
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
844
+ )
845
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
846
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
847
+
848
+ def remove_weight_norm(self):
849
+ self.dec.remove_weight_norm()
850
+ self.flow.remove_weight_norm()
851
+ self.enc_q.remove_weight_norm()
852
+
853
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
854
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the size-1 axis is t and broadcasts
855
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
856
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
857
+ z_p = self.flow(z, y_mask, g=g)
858
+ z_slice, ids_slice = commons.rand_slice_segments(
859
+ z, y_lengths, self.segment_size
860
+ )
861
+ o = self.dec(z_slice, g=g)
862
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
863
+
864
+ def infer(self, phone, phone_lengths, sid, rate=None):
865
+ g = self.emb_g(sid).unsqueeze(-1)
866
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
867
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
868
+ if rate:
869
+ head = int(z_p.shape[2] * rate)
870
+ z_p = z_p[:, :, -head:]
871
+ x_mask = x_mask[:, :, -head:]
872
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
873
+ o = self.dec(z * x_mask, g=g)
874
+ return o, x_mask, (z, z_p, m_p, logs_p)
875
+
876
+
877
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
878
+ def __init__(
879
+ self,
880
+ spec_channels,
881
+ segment_size,
882
+ inter_channels,
883
+ hidden_channels,
884
+ filter_channels,
885
+ n_heads,
886
+ n_layers,
887
+ kernel_size,
888
+ p_dropout,
889
+ resblock,
890
+ resblock_kernel_sizes,
891
+ resblock_dilation_sizes,
892
+ upsample_rates,
893
+ upsample_initial_channel,
894
+ upsample_kernel_sizes,
895
+ spk_embed_dim,
896
+ gin_channels,
897
+ sr=None,
898
+ **kwargs
899
+ ):
900
+ super().__init__()
901
+ self.spec_channels = spec_channels
902
+ self.inter_channels = inter_channels
903
+ self.hidden_channels = hidden_channels
904
+ self.filter_channels = filter_channels
905
+ self.n_heads = n_heads
906
+ self.n_layers = n_layers
907
+ self.kernel_size = kernel_size
908
+ self.p_dropout = p_dropout
909
+ self.resblock = resblock
910
+ self.resblock_kernel_sizes = resblock_kernel_sizes
911
+ self.resblock_dilation_sizes = resblock_dilation_sizes
912
+ self.upsample_rates = upsample_rates
913
+ self.upsample_initial_channel = upsample_initial_channel
914
+ self.upsample_kernel_sizes = upsample_kernel_sizes
915
+ self.segment_size = segment_size
916
+ self.gin_channels = gin_channels
917
+ # self.hop_length = hop_length#
918
+ self.spk_embed_dim = spk_embed_dim
919
+ self.enc_p = TextEncoder768(
920
+ inter_channels,
921
+ hidden_channels,
922
+ filter_channels,
923
+ n_heads,
924
+ n_layers,
925
+ kernel_size,
926
+ p_dropout,
927
+ f0=False,
928
+ )
929
+ self.dec = Generator(
930
+ inter_channels,
931
+ resblock,
932
+ resblock_kernel_sizes,
933
+ resblock_dilation_sizes,
934
+ upsample_rates,
935
+ upsample_initial_channel,
936
+ upsample_kernel_sizes,
937
+ gin_channels=gin_channels,
938
+ )
939
+ self.enc_q = PosteriorEncoder(
940
+ spec_channels,
941
+ inter_channels,
942
+ hidden_channels,
943
+ 5,
944
+ 1,
945
+ 16,
946
+ gin_channels=gin_channels,
947
+ )
948
+ self.flow = ResidualCouplingBlock(
949
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
950
+ )
951
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
952
+ print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
953
+
954
+ def remove_weight_norm(self):
955
+ self.dec.remove_weight_norm()
956
+ self.flow.remove_weight_norm()
957
+ self.enc_q.remove_weight_norm()
958
+
959
+ def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
960
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the size-1 axis is t and broadcasts
961
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
962
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
963
+ z_p = self.flow(z, y_mask, g=g)
964
+ z_slice, ids_slice = commons.rand_slice_segments(
965
+ z, y_lengths, self.segment_size
966
+ )
967
+ o = self.dec(z_slice, g=g)
968
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
969
+
970
+ def infer(self, phone, phone_lengths, sid, rate=None):
971
+ g = self.emb_g(sid).unsqueeze(-1)
972
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
973
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
974
+ if rate:
975
+ head = int(z_p.shape[2] * rate)
976
+ z_p = z_p[:, :, -head:]
977
+ x_mask = x_mask[:, :, -head:]
978
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
979
+ o = self.dec(z * x_mask, g=g)
980
+ return o, x_mask, (z, z_p, m_p, logs_p)
981
+
982
+
983
+ class MultiPeriodDiscriminator(torch.nn.Module):
984
+ def __init__(self, use_spectral_norm=False):
985
+ super(MultiPeriodDiscriminator, self).__init__()
986
+ periods = [2, 3, 5, 7, 11, 17]
987
+ # periods = [3, 5, 7, 11, 17, 23, 37]
988
+
989
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
990
+ discs = discs + [
991
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
992
+ ]
993
+ self.discriminators = nn.ModuleList(discs)
994
+
995
+ def forward(self, y, y_hat):
996
+ y_d_rs = [] #
997
+ y_d_gs = []
998
+ fmap_rs = []
999
+ fmap_gs = []
1000
+ for i, d in enumerate(self.discriminators):
1001
+ y_d_r, fmap_r = d(y)
1002
+ y_d_g, fmap_g = d(y_hat)
1003
+ # for j in range(len(fmap_r)):
1004
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1005
+ y_d_rs.append(y_d_r)
1006
+ y_d_gs.append(y_d_g)
1007
+ fmap_rs.append(fmap_r)
1008
+ fmap_gs.append(fmap_g)
1009
+
1010
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1011
+
1012
+
1013
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
1014
+ def __init__(self, use_spectral_norm=False):
1015
+ super(MultiPeriodDiscriminatorV2, self).__init__()
1016
+ # periods = [2, 3, 5, 7, 11, 17]
1017
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
1018
+
1019
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1020
+ discs = discs + [
1021
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1022
+ ]
1023
+ self.discriminators = nn.ModuleList(discs)
1024
+
1025
+ def forward(self, y, y_hat):
1026
+ y_d_rs = [] #
1027
+ y_d_gs = []
1028
+ fmap_rs = []
1029
+ fmap_gs = []
1030
+ for i, d in enumerate(self.discriminators):
1031
+ y_d_r, fmap_r = d(y)
1032
+ y_d_g, fmap_g = d(y_hat)
1033
+ # for j in range(len(fmap_r)):
1034
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1035
+ y_d_rs.append(y_d_r)
1036
+ y_d_gs.append(y_d_g)
1037
+ fmap_rs.append(fmap_r)
1038
+ fmap_gs.append(fmap_g)
1039
+
1040
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1041
+
1042
+
1043
+ class DiscriminatorS(torch.nn.Module):
1044
+ def __init__(self, use_spectral_norm=False):
1045
+ super(DiscriminatorS, self).__init__()
1046
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1047
+ self.convs = nn.ModuleList(
1048
+ [
1049
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1050
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1051
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1052
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1053
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1054
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1055
+ ]
1056
+ )
1057
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1058
+
1059
+ def forward(self, x):
1060
+ fmap = []
1061
+
1062
+ for l in self.convs:
1063
+ x = l(x)
1064
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1065
+ fmap.append(x)
1066
+ x = self.conv_post(x)
1067
+ fmap.append(x)
1068
+ x = torch.flatten(x, 1, -1)
1069
+
1070
+ return x, fmap
1071
+
1072
+
1073
+ class DiscriminatorP(torch.nn.Module):
1074
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1075
+ super(DiscriminatorP, self).__init__()
1076
+ self.period = period
1077
+ self.use_spectral_norm = use_spectral_norm
1078
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1079
+ self.convs = nn.ModuleList(
1080
+ [
1081
+ norm_f(
1082
+ Conv2d(
1083
+ 1,
1084
+ 32,
1085
+ (kernel_size, 1),
1086
+ (stride, 1),
1087
+ padding=(get_padding(kernel_size, 1), 0),
1088
+ )
1089
+ ),
1090
+ norm_f(
1091
+ Conv2d(
1092
+ 32,
1093
+ 128,
1094
+ (kernel_size, 1),
1095
+ (stride, 1),
1096
+ padding=(get_padding(kernel_size, 1), 0),
1097
+ )
1098
+ ),
1099
+ norm_f(
1100
+ Conv2d(
1101
+ 128,
1102
+ 512,
1103
+ (kernel_size, 1),
1104
+ (stride, 1),
1105
+ padding=(get_padding(kernel_size, 1), 0),
1106
+ )
1107
+ ),
1108
+ norm_f(
1109
+ Conv2d(
1110
+ 512,
1111
+ 1024,
1112
+ (kernel_size, 1),
1113
+ (stride, 1),
1114
+ padding=(get_padding(kernel_size, 1), 0),
1115
+ )
1116
+ ),
1117
+ norm_f(
1118
+ Conv2d(
1119
+ 1024,
1120
+ 1024,
1121
+ (kernel_size, 1),
1122
+ 1,
1123
+ padding=(get_padding(kernel_size, 1), 0),
1124
+ )
1125
+ ),
1126
+ ]
1127
+ )
1128
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1129
+
1130
+ def forward(self, x):
1131
+ fmap = []
1132
+
1133
+ # 1d to 2d
1134
+ b, c, t = x.shape
1135
+ if t % self.period != 0: # pad first
1136
+ n_pad = self.period - (t % self.period)
1137
+ x = F.pad(x, (0, n_pad), "reflect")
1138
+ t = t + n_pad
1139
+ x = x.view(b, c, t // self.period, self.period)
1140
+
1141
+ for l in self.convs:
1142
+ x = l(x)
1143
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1144
+ fmap.append(x)
1145
+ x = self.conv_post(x)
1146
+ fmap.append(x)
1147
+ x = torch.flatten(x, 1, -1)
1148
+
1149
+ return x, fmap
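The four synthesizer classes above differ along two axes: the text-encoder input width (256- vs 768-dimensional HuBERT features) and whether pitch is used (the *_nono variants drop the f0 inputs). A hedged selection helper, written here only for illustration and not part of this upload; the "v1"/"v2" labels are an assumption mirroring how such checkpoints are commonly tagged.

from app.rvc.infer_pack import models

def pick_synthesizer(version: str, if_f0: bool):
    # Hypothetical mapping: "v1"/"v2" stand for 256- vs 768-dim feature checkpoints.
    table = {
        ("v1", True): models.SynthesizerTrnMs256NSFsid,
        ("v1", False): models.SynthesizerTrnMs256NSFsid_nono,
        ("v2", True): models.SynthesizerTrnMs768NSFsid,
        ("v2", False): models.SynthesizerTrnMs768NSFsid_nono,
    }
    return table[(version, if_f0)]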
app/rvc/infer_pack/modules.py ADDED
@@ -0,0 +1,529 @@
1
+ """
2
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
3
+
4
+ copyright: RVC-Project
5
+ license: MIT
6
+ """
7
+
8
+ import copy
9
+ import math
10
+ import numpy as np
11
+ import scipy
12
+ import torch
13
+ from torch import nn
14
+ from torch.nn import functional as F
15
+
16
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
17
+ from torch.nn.utils import weight_norm, remove_weight_norm
18
+
19
+ from app.rvc.infer_pack import commons
20
+ from app.rvc.infer_pack.commons import init_weights, get_padding
21
+ from app.rvc.infer_pack.transforms import piecewise_rational_quadratic_transform
22
+
23
+
24
+ LRELU_SLOPE = 0.1
25
+
26
+
27
+ class LayerNorm(nn.Module):
28
+ def __init__(self, channels, eps=1e-5):
29
+ super().__init__()
30
+ self.channels = channels
31
+ self.eps = eps
32
+
33
+ self.gamma = nn.Parameter(torch.ones(channels))
34
+ self.beta = nn.Parameter(torch.zeros(channels))
35
+
36
+ def forward(self, x):
37
+ x = x.transpose(1, -1)
38
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
39
+ return x.transpose(1, -1)
40
+
41
+
42
+ class ConvReluNorm(nn.Module):
43
+ def __init__(
44
+ self,
45
+ in_channels,
46
+ hidden_channels,
47
+ out_channels,
48
+ kernel_size,
49
+ n_layers,
50
+ p_dropout,
51
+ ):
52
+ super().__init__()
53
+ self.in_channels = in_channels
54
+ self.hidden_channels = hidden_channels
55
+ self.out_channels = out_channels
56
+ self.kernel_size = kernel_size
57
+ self.n_layers = n_layers
58
+ self.p_dropout = p_dropout
59
+ assert n_layers > 1, "Number of layers should be larger than 1."
60
+
61
+ self.conv_layers = nn.ModuleList()
62
+ self.norm_layers = nn.ModuleList()
63
+ self.conv_layers.append(
64
+ nn.Conv1d(
65
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
66
+ )
67
+ )
68
+ self.norm_layers.append(LayerNorm(hidden_channels))
69
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
70
+ for _ in range(n_layers - 1):
71
+ self.conv_layers.append(
72
+ nn.Conv1d(
73
+ hidden_channels,
74
+ hidden_channels,
75
+ kernel_size,
76
+ padding=kernel_size // 2,
77
+ )
78
+ )
79
+ self.norm_layers.append(LayerNorm(hidden_channels))
80
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
81
+ self.proj.weight.data.zero_()
82
+ self.proj.bias.data.zero_()
83
+
84
+ def forward(self, x, x_mask):
85
+ x_org = x
86
+ for i in range(self.n_layers):
87
+ x = self.conv_layers[i](x * x_mask)
88
+ x = self.norm_layers[i](x)
89
+ x = self.relu_drop(x)
90
+ x = x_org + self.proj(x)
91
+ return x * x_mask
92
+
93
+
94
+ class DDSConv(nn.Module):
95
+ """
96
+ Dilated and Depth-Separable Convolution
97
+ """
98
+
99
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
100
+ super().__init__()
101
+ self.channels = channels
102
+ self.kernel_size = kernel_size
103
+ self.n_layers = n_layers
104
+ self.p_dropout = p_dropout
105
+
106
+ self.drop = nn.Dropout(p_dropout)
107
+ self.convs_sep = nn.ModuleList()
108
+ self.convs_1x1 = nn.ModuleList()
109
+ self.norms_1 = nn.ModuleList()
110
+ self.norms_2 = nn.ModuleList()
111
+ for i in range(n_layers):
112
+ dilation = kernel_size**i
113
+ padding = (kernel_size * dilation - dilation) // 2
114
+ self.convs_sep.append(
115
+ nn.Conv1d(
116
+ channels,
117
+ channels,
118
+ kernel_size,
119
+ groups=channels,
120
+ dilation=dilation,
121
+ padding=padding,
122
+ )
123
+ )
124
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
125
+ self.norms_1.append(LayerNorm(channels))
126
+ self.norms_2.append(LayerNorm(channels))
127
+
128
+ def forward(self, x, x_mask, g=None):
129
+ if g is not None:
130
+ x = x + g
131
+ for i in range(self.n_layers):
132
+ y = self.convs_sep[i](x * x_mask)
133
+ y = self.norms_1[i](y)
134
+ y = F.gelu(y)
135
+ y = self.convs_1x1[i](y)
136
+ y = self.norms_2[i](y)
137
+ y = F.gelu(y)
138
+ y = self.drop(y)
139
+ x = x + y
140
+ return x * x_mask
141
+
142
+
143
+ class WN(torch.nn.Module):
144
+ def __init__(
145
+ self,
146
+ hidden_channels,
147
+ kernel_size,
148
+ dilation_rate,
149
+ n_layers,
150
+ gin_channels=0,
151
+ p_dropout=0,
152
+ ):
153
+ super(WN, self).__init__()
154
+ assert kernel_size % 2 == 1
155
+ self.hidden_channels = hidden_channels
156
+ self.kernel_size = (kernel_size,)
157
+ self.dilation_rate = dilation_rate
158
+ self.n_layers = n_layers
159
+ self.gin_channels = gin_channels
160
+ self.p_dropout = p_dropout
161
+
162
+ self.in_layers = torch.nn.ModuleList()
163
+ self.res_skip_layers = torch.nn.ModuleList()
164
+ self.drop = nn.Dropout(p_dropout)
165
+
166
+ if gin_channels != 0:
167
+ cond_layer = torch.nn.Conv1d(
168
+ gin_channels, 2 * hidden_channels * n_layers, 1
169
+ )
170
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
171
+
172
+ for i in range(n_layers):
173
+ dilation = dilation_rate**i
174
+ padding = int((kernel_size * dilation - dilation) / 2)
175
+ in_layer = torch.nn.Conv1d(
176
+ hidden_channels,
177
+ 2 * hidden_channels,
178
+ kernel_size,
179
+ dilation=dilation,
180
+ padding=padding,
181
+ )
182
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
183
+ self.in_layers.append(in_layer)
184
+
185
+ # last one is not necessary
186
+ if i < n_layers - 1:
187
+ res_skip_channels = 2 * hidden_channels
188
+ else:
189
+ res_skip_channels = hidden_channels
190
+
191
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
192
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
193
+ self.res_skip_layers.append(res_skip_layer)
194
+
195
+ def forward(self, x, x_mask, g=None, **kwargs):
196
+ output = torch.zeros_like(x)
197
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
198
+
199
+ if g is not None:
200
+ g = self.cond_layer(g)
201
+
202
+ for i in range(self.n_layers):
203
+ x_in = self.in_layers[i](x)
204
+ if g is not None:
205
+ cond_offset = i * 2 * self.hidden_channels
206
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
207
+ else:
208
+ g_l = torch.zeros_like(x_in)
209
+
210
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
211
+ acts = self.drop(acts)
212
+
213
+ res_skip_acts = self.res_skip_layers[i](acts)
214
+ if i < self.n_layers - 1:
215
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
216
+ x = (x + res_acts) * x_mask
217
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
218
+ else:
219
+ output = output + res_skip_acts
220
+ return output * x_mask
221
+
222
+ def remove_weight_norm(self):
223
+ if self.gin_channels != 0:
224
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
225
+ for l in self.in_layers:
226
+ torch.nn.utils.remove_weight_norm(l)
227
+ for l in self.res_skip_layers:
228
+ torch.nn.utils.remove_weight_norm(l)
229
+
230
+
231
+ class ResBlock1(torch.nn.Module):
232
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
233
+ super(ResBlock1, self).__init__()
234
+ self.convs1 = nn.ModuleList(
235
+ [
236
+ weight_norm(
237
+ Conv1d(
238
+ channels,
239
+ channels,
240
+ kernel_size,
241
+ 1,
242
+ dilation=dilation[0],
243
+ padding=get_padding(kernel_size, dilation[0]),
244
+ )
245
+ ),
246
+ weight_norm(
247
+ Conv1d(
248
+ channels,
249
+ channels,
250
+ kernel_size,
251
+ 1,
252
+ dilation=dilation[1],
253
+ padding=get_padding(kernel_size, dilation[1]),
254
+ )
255
+ ),
256
+ weight_norm(
257
+ Conv1d(
258
+ channels,
259
+ channels,
260
+ kernel_size,
261
+ 1,
262
+ dilation=dilation[2],
263
+ padding=get_padding(kernel_size, dilation[2]),
264
+ )
265
+ ),
266
+ ]
267
+ )
268
+ self.convs1.apply(init_weights)
269
+
270
+         self.convs2 = nn.ModuleList(
+             [
+                 weight_norm(
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=1,
+                         padding=get_padding(kernel_size, 1),
+                     )
+                 ),
+                 weight_norm(
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=1,
+                         padding=get_padding(kernel_size, 1),
+                     )
+                 ),
+                 weight_norm(
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=1,
+                         padding=get_padding(kernel_size, 1),
+                     )
+                 ),
+             ]
+         )
+         self.convs2.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c1, c2 in zip(self.convs1, self.convs2):
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c1(xt)
+             xt = F.leaky_relu(xt, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c2(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs1:
+             remove_weight_norm(l)
+         for l in self.convs2:
+             remove_weight_norm(l)
+
+
+ class ResBlock2(torch.nn.Module):
+     def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+         super(ResBlock2, self).__init__()
+         self.convs = nn.ModuleList(
+             [
+                 weight_norm(
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=dilation[0],
+                         padding=get_padding(kernel_size, dilation[0]),
+                     )
+                 ),
+                 weight_norm(
+                     Conv1d(
+                         channels,
+                         channels,
+                         kernel_size,
+                         1,
+                         dilation=dilation[1],
+                         padding=get_padding(kernel_size, dilation[1]),
+                     )
+                 ),
+             ]
+         )
+         self.convs.apply(init_weights)
+
+     def forward(self, x, x_mask=None):
+         for c in self.convs:
+             xt = F.leaky_relu(x, LRELU_SLOPE)
+             if x_mask is not None:
+                 xt = xt * x_mask
+             xt = c(xt)
+             x = xt + x
+         if x_mask is not None:
+             x = x * x_mask
+         return x
+
+     def remove_weight_norm(self):
+         for l in self.convs:
+             remove_weight_norm(l)
+
+
+ class Log(nn.Module):
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
+             logdet = torch.sum(-y, [1, 2])
+             return y, logdet
+         else:
+             x = torch.exp(x) * x_mask
+             return x
+
+
+ class Flip(nn.Module):
+     def forward(self, x, *args, reverse=False, **kwargs):
+         x = torch.flip(x, [1])
+         if not reverse:
+             logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+             return x, logdet
+         else:
+             return x
+
+
+ class ElementwiseAffine(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.channels = channels
+         self.m = nn.Parameter(torch.zeros(channels, 1))
+         self.logs = nn.Parameter(torch.zeros(channels, 1))
+
+     def forward(self, x, x_mask, reverse=False, **kwargs):
+         if not reverse:
+             y = self.m + torch.exp(self.logs) * x
+             y = y * x_mask
+             logdet = torch.sum(self.logs * x_mask, [1, 2])
+             return y, logdet
+         else:
+             x = (x - self.m) * torch.exp(-self.logs) * x_mask
+             return x
+
+
+ class ResidualCouplingLayer(nn.Module):
+     def __init__(
+         self,
+         channels,
+         hidden_channels,
+         kernel_size,
+         dilation_rate,
+         n_layers,
+         p_dropout=0,
+         gin_channels=0,
+         mean_only=False,
+     ):
+         assert channels % 2 == 0, "channels should be divisible by 2"
+         super().__init__()
+         self.channels = channels
+         self.hidden_channels = hidden_channels
+         self.kernel_size = kernel_size
+         self.dilation_rate = dilation_rate
+         self.n_layers = n_layers
+         self.half_channels = channels // 2
+         self.mean_only = mean_only
+
+         self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+         self.enc = WN(
+             hidden_channels,
+             kernel_size,
+             dilation_rate,
+             n_layers,
+             p_dropout=p_dropout,
+             gin_channels=gin_channels,
+         )
+         self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+         self.post.weight.data.zero_()
+         self.post.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0) * x_mask
+         h = self.enc(h, x_mask, g=g)
+         stats = self.post(h) * x_mask
+         if not self.mean_only:
+             m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+         else:
+             m = stats
+             logs = torch.zeros_like(m)
+
+         if not reverse:
+             x1 = m + x1 * torch.exp(logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             logdet = torch.sum(logs, [1, 2])
+             return x, logdet
+         else:
+             x1 = (x1 - m) * torch.exp(-logs) * x_mask
+             x = torch.cat([x0, x1], 1)
+             return x
+
+     def remove_weight_norm(self):
+         self.enc.remove_weight_norm()
+
+
+ class ConvFlow(nn.Module):
+     def __init__(
+         self,
+         in_channels,
+         filter_channels,
+         kernel_size,
+         n_layers,
+         num_bins=10,
+         tail_bound=5.0,
+     ):
+         super().__init__()
+         self.in_channels = in_channels
+         self.filter_channels = filter_channels
+         self.kernel_size = kernel_size
+         self.n_layers = n_layers
+         self.num_bins = num_bins
+         self.tail_bound = tail_bound
+         self.half_channels = in_channels // 2
+
+         self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
+         self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
+         self.proj = nn.Conv1d(
+             filter_channels, self.half_channels * (num_bins * 3 - 1), 1
+         )
+         self.proj.weight.data.zero_()
+         self.proj.bias.data.zero_()
+
+     def forward(self, x, x_mask, g=None, reverse=False):
+         x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+         h = self.pre(x0)
+         h = self.convs(h, x_mask, g=g)
+         h = self.proj(h) * x_mask
+
+         b, c, t = x0.shape
+         h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]
+
+         unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
+         unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
+             self.filter_channels
+         )
+         unnormalized_derivatives = h[..., 2 * self.num_bins :]
+
+         x1, logabsdet = piecewise_rational_quadratic_transform(
+             x1,
+             unnormalized_widths,
+             unnormalized_heights,
+             unnormalized_derivatives,
+             inverse=reverse,
+             tails="linear",
+             tail_bound=self.tail_bound,
+         )
+
+         x = torch.cat([x0, x1], 1) * x_mask
+         logdet = torch.sum(logabsdet * x_mask, [1, 2])
+         if not reverse:
+             return x, logdet
+         else:
+             return x
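Editor's note: the flow blocks above (Log, Flip, ElementwiseAffine, ResidualCouplingLayer, ConvFlow) share one calling convention — the forward direction returns (y, logdet), while reverse=True returns only the recovered input. A minimal smoke test of that convention, not part of this repository, assuming the ElementwiseAffine class above is importable:

import torch

# Hypothetical round-trip check: forward returns (y, logdet); reverse inverts it.
layer = ElementwiseAffine(channels=4)
x = torch.randn(1, 4, 8)          # [batch, channels, time]
x_mask = torch.ones(1, 1, 8)      # all frames valid
y, logdet = layer(x, x_mask)      # forward pass
x_rec = layer(y, x_mask, reverse=True)
assert torch.allclose(x, x_rec, atol=1e-6)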
app/rvc/infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,23 @@
+ """
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
+
+ copyright: RVC-Project
+ license: MIT
+ """
+
+ class F0Predictor(object):
+     def compute_f0(self, wav, p_len):
+         """
+         input: wav:[signal_length]
+                p_len:int
+         output: f0:[signal_length//hop_length]
+         """
+         pass
+
+     def compute_f0_uv(self, wav, p_len):
+         """
+         input: wav:[signal_length]
+                p_len:int
+         output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
+         """
+         pass
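Editor's note: the docstrings above define the contract every pitch predictor in this package follows — given a waveform and a target frame count p_len, return one f0 value (and optionally a voiced/unvoiced flag) per hop. A hypothetical subclass, not part of this repository, that satisfies the interface with a constant pitch track:

import numpy as np

class ConstantF0Predictor(F0Predictor):
    """Illustrative only: returns a flat 120 Hz pitch track."""

    def compute_f0(self, wav, p_len):
        # one f0 value per hop, as required by the base-class contract
        return np.full(p_len, 120.0, dtype=np.float32)

    def compute_f0_uv(self, wav, p_len):
        f0 = self.compute_f0(wav, p_len)
        uv = np.ones(p_len, dtype=np.float32)  # mark every frame as voiced
        return f0, uv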
app/rvc/infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
app/rvc/infer_pack/transforms.py ADDED
@@ -0,0 +1,216 @@
+ """
+ https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/tree/86ed98aacaa8b2037aad795abd11cdca122cf39f/lib/infer_pack
+
+ copyright: RVC-Project
+ license: MIT
+ """
+
+ import torch
+ from torch.nn import functional as F
+
+ import numpy as np
+
+
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
+ DEFAULT_MIN_DERIVATIVE = 1e-3
+
+
+ def piecewise_rational_quadratic_transform(
+     inputs,
+     unnormalized_widths,
+     unnormalized_heights,
+     unnormalized_derivatives,
+     inverse=False,
+     tails=None,
+     tail_bound=1.0,
+     min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+     min_derivative=DEFAULT_MIN_DERIVATIVE,
+ ):
+     if tails is None:
+         spline_fn = rational_quadratic_spline
+         spline_kwargs = {}
+     else:
+         spline_fn = unconstrained_rational_quadratic_spline
+         spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
+
+     outputs, logabsdet = spline_fn(
+         inputs=inputs,
+         unnormalized_widths=unnormalized_widths,
+         unnormalized_heights=unnormalized_heights,
+         unnormalized_derivatives=unnormalized_derivatives,
+         inverse=inverse,
+         min_bin_width=min_bin_width,
+         min_bin_height=min_bin_height,
+         min_derivative=min_derivative,
+         **spline_kwargs
+     )
+     return outputs, logabsdet
+
+
+ def searchsorted(bin_locations, inputs, eps=1e-6):
+     bin_locations[..., -1] += eps
+     return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
+
+
+ def unconstrained_rational_quadratic_spline(
+     inputs,
+     unnormalized_widths,
+     unnormalized_heights,
+     unnormalized_derivatives,
+     inverse=False,
+     tails="linear",
+     tail_bound=1.0,
+     min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+     min_derivative=DEFAULT_MIN_DERIVATIVE,
+ ):
+     inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
+     outside_interval_mask = ~inside_interval_mask
+
+     outputs = torch.zeros_like(inputs)
+     logabsdet = torch.zeros_like(inputs)
+
+     if tails == "linear":
+         unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
+         constant = np.log(np.exp(1 - min_derivative) - 1)
+         unnormalized_derivatives[..., 0] = constant
+         unnormalized_derivatives[..., -1] = constant
+
+         outputs[outside_interval_mask] = inputs[outside_interval_mask]
+         logabsdet[outside_interval_mask] = 0
+     else:
+         raise RuntimeError("{} tails are not implemented.".format(tails))
+
+     (
+         outputs[inside_interval_mask],
+         logabsdet[inside_interval_mask],
+     ) = rational_quadratic_spline(
+         inputs=inputs[inside_interval_mask],
+         unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
+         unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
+         unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
+         inverse=inverse,
+         left=-tail_bound,
+         right=tail_bound,
+         bottom=-tail_bound,
+         top=tail_bound,
+         min_bin_width=min_bin_width,
+         min_bin_height=min_bin_height,
+         min_derivative=min_derivative,
+     )
+
+     return outputs, logabsdet
+
+
+ def rational_quadratic_spline(
+     inputs,
+     unnormalized_widths,
+     unnormalized_heights,
+     unnormalized_derivatives,
+     inverse=False,
+     left=0.0,
+     right=1.0,
+     bottom=0.0,
+     top=1.0,
+     min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+     min_derivative=DEFAULT_MIN_DERIVATIVE,
+ ):
+     if torch.min(inputs) < left or torch.max(inputs) > right:
+         raise ValueError("Input to a transform is not within its domain")
+
+     num_bins = unnormalized_widths.shape[-1]
+
+     if min_bin_width * num_bins > 1.0:
+         raise ValueError("Minimal bin width too large for the number of bins")
+     if min_bin_height * num_bins > 1.0:
+         raise ValueError("Minimal bin height too large for the number of bins")
+
+     widths = F.softmax(unnormalized_widths, dim=-1)
+     widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+     cumwidths = torch.cumsum(widths, dim=-1)
+     cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+     cumwidths = (right - left) * cumwidths + left
+     cumwidths[..., 0] = left
+     cumwidths[..., -1] = right
+     widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+     derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+     heights = F.softmax(unnormalized_heights, dim=-1)
+     heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+     cumheights = torch.cumsum(heights, dim=-1)
+     cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+     cumheights = (top - bottom) * cumheights + bottom
+     cumheights[..., 0] = bottom
+     cumheights[..., -1] = top
+     heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+     if inverse:
+         bin_idx = searchsorted(cumheights, inputs)[..., None]
+     else:
+         bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+     input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+     input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+     input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+     delta = heights / widths
+     input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+     input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+     input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+     input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+     if inverse:
+         a = (inputs - input_cumheights) * (
+             input_derivatives + input_derivatives_plus_one - 2 * input_delta
+         ) + input_heights * (input_delta - input_derivatives)
+         b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+             input_derivatives + input_derivatives_plus_one - 2 * input_delta
+         )
+         c = -input_delta * (inputs - input_cumheights)
+
+         discriminant = b.pow(2) - 4 * a * c
+         assert (discriminant >= 0).all()
+
+         root = (2 * c) / (-b - torch.sqrt(discriminant))
+         outputs = root * input_bin_widths + input_cumwidths
+
+         theta_one_minus_theta = root * (1 - root)
+         denominator = input_delta + (
+             (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+             * theta_one_minus_theta
+         )
+         derivative_numerator = input_delta.pow(2) * (
+             input_derivatives_plus_one * root.pow(2)
+             + 2 * input_delta * theta_one_minus_theta
+             + input_derivatives * (1 - root).pow(2)
+         )
+         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+         return outputs, -logabsdet
+     else:
+         theta = (inputs - input_cumwidths) / input_bin_widths
+         theta_one_minus_theta = theta * (1 - theta)
+
+         numerator = input_heights * (
+             input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+         )
+         denominator = input_delta + (
+             (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+             * theta_one_minus_theta
+         )
+         outputs = input_cumheights + numerator / denominator
+
+         derivative_numerator = input_delta.pow(2) * (
+             input_derivatives_plus_one * theta.pow(2)
+             + 2 * input_delta * theta_one_minus_theta
+             + input_derivatives * (1 - theta).pow(2)
+         )
+         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+         return outputs, logabsdet
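Editor's note: piecewise_rational_quadratic_transform is the invertible spline ConvFlow calls — inverse=False warps the inputs and reports the log-determinant, and the same parameters with inverse=True undo the warp. A rough round-trip check, not part of this repository; the shapes follow the ConvFlow usage (num_bins widths and heights, num_bins - 1 derivatives per element):

import torch

b, c, t, num_bins = 2, 4, 16, 10
x = torch.randn(b, c, t)
widths = torch.randn(b, c, t, num_bins)
heights = torch.randn(b, c, t, num_bins)
derivs = torch.randn(b, c, t, num_bins - 1)

y, logdet = piecewise_rational_quadratic_transform(
    x, widths, heights, derivs, inverse=False, tails="linear", tail_bound=5.0
)
x_rec, _ = piecewise_rational_quadratic_transform(
    y, widths, heights, derivs, inverse=True, tails="linear", tail_bound=5.0
)
assert torch.allclose(x, x_rec, atol=1e-4)  # the spline is invertible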
app/rvc/lib/__init__.py ADDED
File without changes
app/rvc/lib/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (152 Bytes)