XiaoHei Studio committed on
Commit c82bb46
1 Parent(s): 07e9a8a

Upload 18 files

CppDataProcess/F0Preprocess.cpp ADDED
@@ -0,0 +1,153 @@
+ #include "F0Preprocess.hpp"
+
+
+ void F0PreProcess::compute_f0(const double* audio, int64_t len)
+ {
+     DioOption Doption;
+     InitializeDioOption(&Doption);
+     Doption.f0_ceil = 800;
+     Doption.frame_period = 1000.0 * hop / fs;
+     f0Len = GetSamplesForDIO(fs, (int)len, Doption.frame_period);
+     const auto tp = new double[f0Len];
+     const auto tmpf0 = new double[f0Len];
+     delete[] rf0; // avoid leaking a buffer from a previous call
+     rf0 = new double[f0Len];
+     Dio(audio, (int)len, fs, &Doption, tp, tmpf0);
+     StoneMask(audio, (int)len, fs, tp, tmpf0, (int)f0Len, rf0);
+     delete[] tmpf0;
+     delete[] tp;
+ }
+
+ std::vector<double> arange(double start, double end, double step = 1.0, double div = 1.0)
+ {
+     std::vector<double> output;
+     while (start < end)
+     {
+         output.push_back(start / div);
+         start += step;
+     }
+     return output;
+ }
+
+ void F0PreProcess::InterPf0(int64_t len)
+ {
+     const auto xi = arange(0.0, (double)f0Len * (double)len, (double)f0Len, (double)len);
+     const auto tmp = new double[xi.size() + 1];
+     interp1(arange(0, (double)f0Len).data(), rf0, static_cast<int>(f0Len), xi.data(), (int)xi.size(), tmp);
+     for (size_t i = 0; i < xi.size(); i++)
+         if (isnan(tmp[i]))
+             tmp[i] = 0.0;
+     delete[] rf0;
+     rf0 = tmp;
+     f0Len = (int64_t)xi.size();
+ }
+
+ long long* F0PreProcess::f0Log()
+ {
+     const auto tmp = new long long[f0Len];
+     const auto f0_mel = new double[f0Len];
+     for (long long i = 0; i < f0Len; i++)
+     {
+         f0_mel[i] = 1127 * log(1.0 + rf0[i] / 700.0);
+         if (f0_mel[i] > 0.0)
+             f0_mel[i] = (f0_mel[i] - f0_mel_min) * (f0_bin - 2.0) / (f0_mel_max - f0_mel_min) + 1.0;
+         if (f0_mel[i] < 1.0)
+             f0_mel[i] = 1;
+         if (f0_mel[i] > f0_bin - 1)
+             f0_mel[i] = f0_bin - 1;
+         tmp[i] = (long long)round(f0_mel[i]);
+     }
+     delete[] f0_mel;
+     delete[] rf0;
+     rf0 = nullptr;
+     return tmp;
+ }
+
+ std::vector<long long> F0PreProcess::GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran)
+ {
+     compute_f0(audio, audioLen);
+     for (int64_t i = 0; i < f0Len; ++i)
+     {
+         rf0[i] = rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0);
+         if (rf0[i] < 0.001)
+             rf0[i] = NAN;
+     }
+     InterPf0(hubLen);
+     const auto O0f = f0Log();
+     std::vector<long long> Of0(O0f, O0f + f0Len);
+     delete[] O0f;
+     return Of0;
+ }
+
+ std::vector<long long> getAligments(size_t specLen, size_t hubertLen)
+ {
+     std::vector<long long> mel2ph(specLen + 1, 0);
+
+     size_t startFrame = 0;
+     const double ph_durs = static_cast<double>(specLen) / static_cast<double>(hubertLen);
+     for (size_t iph = 0; iph < hubertLen; ++iph)
+     {
+         const auto endFrame = static_cast<size_t>(round(static_cast<double>(iph) * ph_durs + ph_durs));
+         for (auto j = startFrame; j < endFrame + 1; ++j)
+             mel2ph[j] = static_cast<long long>(iph) + 1;
+         startFrame = endFrame + 1;
+     }
+
+     return mel2ph;
+ }
+
+ std::vector<float> F0PreProcess::GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran)
+ {
+     compute_f0(audio, audioLen);
+     for (int64_t i = 0; i < f0Len; ++i)
+     {
+         rf0[i] = log2(rf0[i] * pow(2.0, static_cast<double>(tran) / 12.0));
+         if (rf0[i] < 0.001)
+             rf0[i] = NAN;
+     }
+     const int64_t specLen = audioLen / hop;
+     InterPf0(specLen);
+
+     std::vector<float> Of0(specLen, 0.0);
+
+     double last_value = 0.0;
+     for (int64_t i = 0; i < specLen; ++i)
+     {
+         if (rf0[i] <= 0.0)
+         {
+             int64_t j = i + 1;
+             for (; j < specLen; ++j)
+             {
+                 if (rf0[j] > 0.0)
+                     break;
+             }
+             if (j < specLen - 1)
+             {
+                 if (last_value > 0.0)
+                 {
+                     const auto step = (rf0[j] - rf0[i - 1]) / double(j - i);
+                     for (int64_t k = i; k < j; ++k)
+                         Of0[k] = float(rf0[i - 1] + step * double(k - i + 1));
+                 }
+                 else
+                     for (int64_t k = i; k < j; ++k)
+                         Of0[k] = float(rf0[j]);
+                 i = j;
+             }
+             else
+             {
+                 for (int64_t k = i; k < specLen; ++k)
+                     Of0[k] = float(last_value);
+                 i = specLen;
+             }
+         }
+         else
+         {
+             Of0[i] = float(i > 0 ? rf0[i - 1] : rf0[i]); // guard the i == 0 case; the upload read rf0[-1] here
+             last_value = rf0[i];
+         }
+     }
+     delete[] rf0;
+     rf0 = nullptr;
+     return Of0;
+ }
CppDataProcess/F0Preprocess.hpp ADDED
@@ -0,0 +1,36 @@
+ #pragma once // the uploaded header had no include guard
+ #include "world/dio.h"
+ #include "world/stonemask.h"
+ #include "world/matlabfunctions.h"
+ #include <cstdint>
+ #include <cmath>
+ #include <string>
+ #include <vector>
+
+ // C++ F0 preprocessing
+
+ class F0PreProcess
+ {
+ public:
+     int fs;
+     short hop;
+     const int f0_bin = 256;
+     const double f0_max = 1100.0;
+     const double f0_min = 50.0;
+     const double f0_mel_min = 1127.0 * log(1.0 + f0_min / 700.0);
+     const double f0_mel_max = 1127.0 * log(1.0 + f0_max / 700.0);
+     F0PreProcess(int sr = 16000, short h = 160) : fs(sr), hop(h) {}
+     ~F0PreProcess()
+     {
+         delete[] rf0;
+         rf0 = nullptr;
+     }
+     void compute_f0(const double* audio, int64_t len);
+     void InterPf0(int64_t len);
+     long long* f0Log();
+     int64_t getLen() const { return f0Len; }
+     std::vector<long long> GetF0AndOtherInput(const double* audio, int64_t audioLen, int64_t hubLen, int64_t tran);
+     std::vector<float> GetF0AndOtherInputF0(const double* audio, int64_t audioLen, int64_t tran);
+ private:
+     double* rf0 = nullptr;
+     int64_t f0Len = 0;
+ };
+
+ std::vector<long long> getAligments(size_t specLen, size_t hubertLen);
CppDataProcess/Slicer.hpp ADDED
@@ -0,0 +1,82 @@
+ #pragma once
+ #include <cmath>
+ #include <string>
+ #include <vector>
+ #include "Wav.hpp"
+
+ struct SliceResult
+ {
+     std::vector<unsigned long long> SliceOffset;
+     std::vector<bool> SliceTag;
+     // The upload named this constructor "cutResult", which does not compile.
+     SliceResult(std::vector<unsigned long long>&& O, std::vector<bool>&& T) : SliceOffset(O), SliceTag(T) {}
+ };
+
+ // Running mean of the absolute sample values over [start, end].
+ double getAvg(const short* start, const short* end)
+ {
+     const auto size = end - start + 1;
+     auto avg = fabs((double)(*start));
+     for (auto i = 1; i < size; i++)
+     {
+         avg = avg + (fabs((double)start[i]) - avg) / (double)(i + 1ull);
+     }
+     return avg;
+ }
+
+ inline SliceResult SliceWav(Wav& input, double threshold, unsigned long minLen, unsigned short frame_len, unsigned short frame_shift)
+ {
+     const auto header = input.getHeader();
+     if (header.Subchunk2Size < minLen * header.bytesPerSec)
+         return { { 0, header.Subchunk2Size }, { true } };
+     auto ptr = input.getData();
+     std::vector<unsigned long long> output;
+     std::vector<bool> tag;
+     auto n = (header.Subchunk2Size / frame_shift) - 2 * (frame_len / frame_shift);
+     unsigned long nn = 0;
+     bool cutTag = true;
+     output.emplace_back(0);
+     while (n--)
+     {
+         if (cutTag)
+         {
+             const auto vol = getAvg((short*)ptr, (short*)ptr + frame_len);
+             if (vol < threshold)
+             {
+                 cutTag = false;
+                 if (nn > minLen * header.bytesPerSec)
+                 {
+                     nn = 0;
+                     output.emplace_back((ptr - input.getData()) + (frame_len / 2));
+                 }
+             }
+             else
+             {
+                 cutTag = true;
+             }
+         }
+         else
+         {
+             const auto vol = getAvg((short*)ptr, (short*)ptr + frame_len);
+             if (vol < threshold)
+             {
+                 cutTag = false;
+             }
+             else
+             {
+                 cutTag = true;
+                 if (nn > minLen * header.bytesPerSec)
+                 {
+                     nn = 0;
+                     output.emplace_back((ptr - input.getData()) + (frame_len / 2));
+                 }
+             }
+         }
+         nn += frame_shift;
+         ptr += frame_shift;
+     }
+     output.push_back(header.Subchunk2Size);
+     for (size_t i = 1; i < output.size(); i++)
+     {
+         tag.push_back(getAvg((short*)(input.getData() + output[i - 1]), (short*)(input.getData() + output[i])) > threshold);
+     }
+     return { std::move(output), std::move(tag) };
+ }
CppDataProcess/Wav.cpp ADDED
@@ -0,0 +1,151 @@
+ #include <cstdio>
+ #include <exception>
+ #include "Wav.hpp"
+
+ // HEAD_LENGTH is assumed to be 1024 (the size of buf); drop this if another header in the upload defines it.
+ #ifndef HEAD_LENGTH
+ #define HEAD_LENGTH 1024
+ #endif
+
+ Wav::Wav(const wchar_t* Path) : header(WAV_HEADER()) {
+     char buf[1024];
+     FILE* stream;
+     _wfreopen_s(&stream, Path, L"rb", stderr);
+     if (stream == nullptr) {
+         throw (std::exception("File does not exist"));
+     }
+     fread(buf, 1, HEAD_LENGTH, stream);
+     int pos = 0;
+     while (pos < HEAD_LENGTH) {
+         if ((buf[pos] == 'R') && (buf[pos + 1] == 'I') && (buf[pos + 2] == 'F') && (buf[pos + 3] == 'F')) {
+             pos += 4;
+             break;
+         }
+         ++pos;
+     }
+     if (pos >= HEAD_LENGTH)
+         throw (std::exception("Don't order fried rice (annoyed)"));
+     header.ChunkSize = *(int*)&buf[pos];
+     pos += 8;
+     while (pos < HEAD_LENGTH) {
+         if ((buf[pos] == 'f') && (buf[pos + 1] == 'm') && (buf[pos + 2] == 't')) {
+             pos += 4;
+             break;
+         }
+         ++pos;
+     }
+     if (pos >= HEAD_LENGTH)
+         throw (std::exception("Don't order fried rice (annoyed)"));
+     header.Subchunk1Size = *(int*)&buf[pos];
+     pos += 4;
+     header.AudioFormat = *(short*)&buf[pos];
+     pos += 2;
+     header.NumOfChan = *(short*)&buf[pos];
+     pos += 2;
+     header.SamplesPerSec = *(int*)&buf[pos];
+     pos += 4;
+     header.bytesPerSec = *(int*)&buf[pos];
+     pos += 4;
+     header.blockAlign = *(short*)&buf[pos];
+     pos += 2;
+     header.bitsPerSample = *(short*)&buf[pos];
+     pos += 2;
+     while (pos < HEAD_LENGTH) {
+         if ((buf[pos] == 'd') && (buf[pos + 1] == 'a') && (buf[pos + 2] == 't') && (buf[pos + 3] == 'a')) {
+             pos += 4;
+             break;
+         }
+         ++pos;
+     }
+     if (pos >= HEAD_LENGTH)
+         throw (std::exception("Don't order fried rice (annoyed)"));
+     header.Subchunk2Size = *(int*)&buf[pos];
+     pos += 4;
+     StartPos = pos;
+     Data = new char[header.Subchunk2Size + 1];
+     fseek(stream, StartPos, SEEK_SET);
+     fread(Data, 1, header.Subchunk2Size, stream);
+     if (stream != nullptr) {
+         fclose(stream);
+     }
+     SData = reinterpret_cast<int16_t*>(Data);
+     dataSize = header.Subchunk2Size / 2;
+ }
+
+ Wav::Wav(const Wav& input) : header(WAV_HEADER()) {
+     Data = new char[(input.header.Subchunk2Size + 1)];
+     if (Data == nullptr) { throw std::exception("OOM"); }
+     memcpy(header.RIFF, input.header.RIFF, 4);
+     memcpy(header.fmt, input.header.fmt, 4);
+     memcpy(header.WAVE, input.header.WAVE, 4);
+     memcpy(header.Subchunk2ID, input.header.Subchunk2ID, 4);
+     header.ChunkSize = input.header.ChunkSize;
+     header.Subchunk1Size = input.header.Subchunk1Size;
+     header.AudioFormat = input.header.AudioFormat;
+     header.NumOfChan = input.header.NumOfChan;
+     header.SamplesPerSec = input.header.SamplesPerSec;
+     header.bytesPerSec = input.header.bytesPerSec;
+     header.blockAlign = input.header.blockAlign;
+     header.bitsPerSample = input.header.bitsPerSample;
+     header.Subchunk2Size = input.header.Subchunk2Size;
+     StartPos = input.StartPos;
+     memcpy(Data, input.Data, input.header.Subchunk2Size);
+     SData = reinterpret_cast<int16_t*>(Data);
+     dataSize = header.Subchunk2Size / 2;
+ }
+
+ Wav::Wav(Wav&& input) noexcept
+ {
+     Data = input.Data;
+     input.Data = nullptr;
+     memcpy(header.RIFF, input.header.RIFF, 4);
+     memcpy(header.fmt, input.header.fmt, 4);
+     memcpy(header.WAVE, input.header.WAVE, 4);
+     memcpy(header.Subchunk2ID, input.header.Subchunk2ID, 4);
+     header.ChunkSize = input.header.ChunkSize;
+     header.Subchunk1Size = input.header.Subchunk1Size;
+     header.AudioFormat = input.header.AudioFormat;
+     header.NumOfChan = input.header.NumOfChan;
+     header.SamplesPerSec = input.header.SamplesPerSec;
+     header.bytesPerSec = input.header.bytesPerSec;
+     header.blockAlign = input.header.blockAlign;
+     header.bitsPerSample = input.header.bitsPerSample;
+     header.Subchunk2Size = input.header.Subchunk2Size;
+     StartPos = input.StartPos;
+     SData = reinterpret_cast<int16_t*>(Data);
+     dataSize = header.Subchunk2Size / 2;
+ }
+
+ Wav& Wav::operator=(Wav&& input) noexcept
+ {
+     destory();
+     Data = input.Data;
+     input.Data = nullptr;
+     memcpy(header.RIFF, input.header.RIFF, 4);
+     memcpy(header.fmt, input.header.fmt, 4);
+     memcpy(header.WAVE, input.header.WAVE, 4);
+     memcpy(header.Subchunk2ID, input.header.Subchunk2ID, 4);
+     header.ChunkSize = input.header.ChunkSize;
+     header.Subchunk1Size = input.header.Subchunk1Size;
+     header.AudioFormat = input.header.AudioFormat;
+     header.NumOfChan = input.header.NumOfChan;
+     header.SamplesPerSec = input.header.SamplesPerSec;
+     header.bytesPerSec = input.header.bytesPerSec;
+     header.blockAlign = input.header.blockAlign;
+     header.bitsPerSample = input.header.bitsPerSample;
+     header.Subchunk2Size = input.header.Subchunk2Size;
+     StartPos = input.StartPos;
+     SData = reinterpret_cast<int16_t*>(Data);
+     dataSize = header.Subchunk2Size / 2;
+     return *this;
+ }
+
+ Wav& Wav::cat(const Wav& input)
+ {
+     if (header.AudioFormat != 1) return *this;
+     // The upload compared SamplesPerSec against bitsPerSample; compare sample rates instead.
+     if (header.SamplesPerSec != input.header.SamplesPerSec || header.NumOfChan != input.header.NumOfChan) return *this;
+     char* buffer = new char[(int64_t)header.Subchunk2Size + (int64_t)input.header.Subchunk2Size + 1];
+     if (buffer == nullptr) return *this;
+     memcpy(buffer, Data, header.Subchunk2Size);
+     memcpy(buffer + header.Subchunk2Size, input.Data, input.header.Subchunk2Size);
+     header.ChunkSize += input.header.Subchunk2Size;
+     header.Subchunk2Size += input.header.Subchunk2Size;
+     delete[] Data;
+     Data = buffer;
+     SData = reinterpret_cast<int16_t*>(Data);
+     dataSize = header.Subchunk2Size / 2;
+     return *this;
+ }
CppDataProcess/Wav.hpp ADDED
@@ -0,0 +1,99 @@
+ #pragma once
+ #include <cstdint>
+ #include <cstring>
+
+ class Wav {
+ public:
+
+     struct WAV_HEADER {
+         char RIFF[4] = { 'R','I','F','F' };        // RIFF identifier
+         unsigned long ChunkSize;                   // file size - 8
+         char WAVE[4] = { 'W','A','V','E' };        // WAVE chunk
+         char fmt[4] = { 'f','m','t',' ' };         // fmt chunk
+         unsigned long Subchunk1Size;               // fmt chunk size
+         unsigned short AudioFormat;                // encoding format
+         unsigned short NumOfChan;                  // number of channels
+         unsigned long SamplesPerSec;               // sample rate
+         unsigned long bytesPerSec;                 // bytes per second
+         unsigned short blockAlign;                 // bytes per sample frame
+         unsigned short bitsPerSample;              // bits per sample
+         char Subchunk2ID[4] = { 'd','a','t','a' }; // data chunk
+         unsigned long Subchunk2Size;               // data chunk size
+         WAV_HEADER(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) : ChunkSize(cs), Subchunk1Size(sc1s), AudioFormat(af), NumOfChan(nc), SamplesPerSec(sr), bytesPerSec(bps), blockAlign(ba), bitsPerSample(bips), Subchunk2Size(sc2s) {}
+     };
+     using iterator = int16_t*;
+     Wav(unsigned long cs = 36, unsigned long sc1s = 16, unsigned short af = 1, unsigned short nc = 1, unsigned long sr = 22050, unsigned long bps = 44100, unsigned short ba = 2, unsigned short bips = 16, unsigned long sc2s = 0) : header({
+         cs,
+         sc1s,
+         af,
+         nc,
+         sr,
+         bps,
+         ba,
+         bips,
+         sc2s
+     }), Data(nullptr), StartPos(44) {
+         dataSize = 0;
+         SData = nullptr;
+     }
+     Wav(unsigned long sr, unsigned long length, const void* data) : header({
+         36,
+         16,
+         1,
+         1,
+         sr,
+         sr * 2,
+         2,
+         16,
+         length
+     }), Data(new char[length + 1]), StartPos(44)
+     {
+         header.ChunkSize = 36 + length;
+         memcpy(Data, data, length);
+         SData = reinterpret_cast<int16_t*>(Data);
+         dataSize = length / 2;
+     }
+     Wav(const wchar_t* Path);
+     Wav(const Wav& input);
+     Wav(Wav&& input) noexcept;
+     Wav& operator=(const Wav& input) = delete;
+     Wav& operator=(Wav&& input) noexcept;
+     ~Wav() { destory(); }
+     Wav& cat(const Wav& input);
+     bool isEmpty() const { return this->header.Subchunk2Size == 0; }
+     const char* getData() const { return Data; }
+     char* getData() { return Data; }
+     WAV_HEADER getHeader() const { return header; }
+     WAV_HEADER& Header() { return header; }
+     void destory() const { delete[] Data; }
+     void changeData(const void* indata, long length, int sr)
+     {
+         delete[] Data;
+         Data = new char[length];
+         memcpy(Data, indata, length);
+         header.ChunkSize = 36 + length;
+         header.Subchunk2Size = length;
+         header.SamplesPerSec = sr;
+         header.bytesPerSec = 2 * sr;
+     }
+     int16_t& operator[](const size_t index) const
+     {
+         if (index < dataSize)
+             return *(SData + index);
+         return *(SData + dataSize - 1);
+     }
+     iterator begin() const
+     {
+         return reinterpret_cast<int16_t*>(Data);
+     }
+     iterator end() const
+     {
+         return reinterpret_cast<int16_t*>(Data + header.Subchunk2Size);
+     }
+     int64_t getDataLen() const
+     {
+         return static_cast<int64_t>(dataSize);
+     }
+ private:
+     WAV_HEADER header;
+     char* Data;
+     int16_t* SData;
+     size_t dataSize;
+     int StartPos;
+ };
CppDataProcess/readme.md ADDED
@@ -0,0 +1,8 @@
+ ## F0Preprocess
+ Go to https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder, download the PyWorld source code, build it as a static library, and link it into your project; then include this header.
+
+ ## Slicer
+ A simple slicer.
+
+ ---
+ ~~Everything above was lifted straight out of the MoeSS code and can serve as a drop-in replacement for the built-in preprocessing()~~
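For reference, `compute_f0` above follows the same DIO + StoneMask chain that PyWorld exposes from Python. A minimal sketch of the equivalent Python calls (assuming the `pyworld` and `numpy` packages are installed; the sample rate and hop size mirror `F0PreProcess`'s defaults, and `x` is a stand-in waveform):

    import numpy as np
    import pyworld as pw

    fs, hop = 16000, 160                # F0PreProcess(sr=16000, h=160)
    x = np.zeros(fs, dtype=np.float64)  # stand-in for one second of mono audio
    # Coarse F0 from DIO, then StoneMask refinement, as in compute_f0.
    f0, t = pw.dio(x, fs, f0_ceil=800.0, frame_period=1000.0 * hop / fs)
    f0 = pw.stonemask(x, f0, t, fs)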
cluster/__init__.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+ from sklearn.cluster import KMeans
+
+
+ def get_cluster_model(ckpt_path):
+     checkpoint = torch.load(ckpt_path)
+     kmeans_dict = {}
+     for spk, ckpt in checkpoint.items():
+         km = KMeans(ckpt["n_features_in_"])
+         km.__dict__["n_features_in_"] = ckpt["n_features_in_"]
+         km.__dict__["_n_threads"] = ckpt["_n_threads"]
+         km.__dict__["cluster_centers_"] = ckpt["cluster_centers_"]
+         kmeans_dict[spk] = km
+     return kmeans_dict
+
+ def get_cluster_result(model, x, speaker):
+     """
+     x: np.array [t, 256]
+     return cluster class result
+     """
+     return model[speaker].predict(x)
+
+ def get_cluster_center_result(model, x, speaker):
+     """x: np.array [t, 256]"""
+     predict = model[speaker].predict(x)
+     return model[speaker].cluster_centers_[predict]
+
+ def get_center(model, x, speaker):
+     return model[speaker].cluster_centers_[x]
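A minimal sketch of how these helpers fit together at inference time (the checkpoint path and speaker name are hypothetical; per the docstrings, `x` is a [t, 256] feature matrix):

    import numpy as np
    from cluster import get_cluster_model, get_cluster_center_result

    model = get_cluster_model("logs/44k/kmeans_10000.pt")   # one rebuilt KMeans per speaker
    x = np.zeros((100, 256), dtype=np.float32)              # stand-in content features
    centers = get_cluster_center_result(model, x, "nyaru")  # snap each frame to its nearest centroid
    print(centers.shape)                                    # (100, 256)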
cluster/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (1.09 kB).
cluster/__pycache__/kmeans.cpython-38.pyc ADDED
Binary file (6.95 kB).
cluster/km_train.py ADDED
@@ -0,0 +1,80 @@
+ import argparse
+ import logging
+ import os
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ import tqdm
+ from sklearn.cluster import KMeans, MiniBatchKMeans
+
+ from cluster.kmeans import KMeansGPU
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
+     # GPU minibatch performs poorly; the library supports it, but it is not used here.
+     logger.info(f"Loading features from {in_dir}")
+     features = []
+     nums = 0
+     for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
+         features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
+     features = np.concatenate(features, axis=0)
+     print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
+     features = features.astype(np.float32)
+     logger.info(f"Clustering features of shape: {features.shape}")
+     t = time.time()
+     if use_gpu is False:
+         if use_minibatch:
+             kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
+         else:
+             kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
+     else:
+         kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
+         features = torch.from_numpy(features)
+         labels = kmeans.fit_predict(features)
+
+     print(time.time() - t, "s")
+
+     x = {
+         "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],  # was shape[0] (the sample count); shape[1] is the feature dimension
+         "_n_threads": kmeans._n_threads if use_gpu is False else 4,
+         "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
+     }
+     print("end")
+
+     return x
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--dataset', type=Path, default="./dataset/44k",
+                         help='path of training data directory')
+     parser.add_argument('--output', type=Path, default="logs/44k",
+                         help='path of model output directory')
+
+     args = parser.parse_args()
+
+     checkpoint_dir = args.output
+     dataset = args.dataset
+     n_clusters = 1000
+
+     ckpt = {}
+     for spk in os.listdir(dataset):
+         if os.path.isdir(dataset/spk):
+             print(f"train kmeans for {spk}...")
+             in_dir = dataset/spk
+             x = train_cluster(in_dir, n_clusters, use_minibatch=False, verbose=False, use_gpu=True)
+             ckpt[spk] = x
+
+     checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
+     checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
+     torch.save(
+         ckpt,
+         checkpoint_path,
+     )
cluster/kmeans.py ADDED
@@ -0,0 +1,204 @@
+ from time import time
+
+ import numpy as np
+ import pynvml
+ import torch
+ from torch.nn.functional import normalize
+
+
+ def _kpp(data: torch.Tensor, k: int, sample_size: int = -1):
+     """ Picks k points in the data based on the kmeans++ method.
+
+     Parameters
+     ----------
+     data : torch.Tensor
+         Expect a rank 1 or 2 array. Rank 1 is assumed to describe 1-D
+         data, rank 2 multidimensional data, in which case one
+         row is one observation.
+     k : int
+         Number of samples to generate.
+     sample_size : int
+         sample data to avoid memory overflow during calculation
+
+     Returns
+     -------
+     init : ndarray
+         A 'k' by 'N' array containing the initial centroids.
+
+     References
+     ----------
+     .. [1] D. Arthur and S. Vassilvitskii, "k-means++: the advantages of
+        careful seeding", Proceedings of the Eighteenth Annual ACM-SIAM Symposium
+        on Discrete Algorithms, 2007.
+     .. [2] scipy/cluster/vq.py: _kpp
+     """
+     batch_size = data.shape[0]
+     if batch_size > sample_size:
+         data = data[torch.randint(0, batch_size, [sample_size], device=data.device)]
+     dims = data.shape[1] if len(data.shape) > 1 else 1
+     init = torch.zeros((k, dims)).to(data.device)
+     r = torch.distributions.uniform.Uniform(0, 1)
+     for i in range(k):
+         if i == 0:
+             init[i, :] = data[torch.randint(data.shape[0], [1])]
+         else:
+             D2 = torch.cdist(init[:i, :][None, :], data[None, :], p=2)[0].amin(dim=0)
+             probs = D2 / torch.sum(D2)
+             cumprobs = torch.cumsum(probs, dim=0)
+             init[i, :] = data[torch.searchsorted(cumprobs, r.sample([1]).to(data.device))]
+     return init
+
+
+ class KMeansGPU:
+     '''
+     Kmeans clustering algorithm implemented with PyTorch
+
+     Parameters:
+       n_clusters: int,
+         Number of clusters
+
+       max_iter: int, default: 100
+         Maximum number of iterations
+
+       tol: float, default: 0.0001
+         Tolerance
+
+       verbose: int, default: 0
+         Verbosity
+
+       mode: {'euclidean', 'cosine'}, default: 'euclidean'
+         Type of distance measure
+
+       init_method: {'random', 'point', '++'}
+         Type of initialization
+
+       minibatch: {None, int}, default: None
+         Batch size of MinibatchKmeans algorithm
+         if None perform full KMeans algorithm
+
+     Attributes:
+       centroids: torch.Tensor, shape: [n_clusters, n_features]
+         cluster centroids
+     '''
+     def __init__(self, n_clusters, max_iter=200, tol=1e-4, verbose=0, mode="euclidean", device=torch.device("cuda:0")):
+         self.n_clusters = n_clusters
+         self.max_iter = max_iter
+         self.tol = tol
+         self.verbose = verbose
+         self.mode = mode
+         self.device = device
+         pynvml.nvmlInit()
+         gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(device.index)
+         info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle)
+         self.minibatch = int(33e6 / self.n_clusters * info.free / 1024 / 1024 / 1024)
+         print("free_mem/GB:", info.free / 1024 / 1024 / 1024, "minibatch:", self.minibatch)
+
+     @staticmethod
+     def cos_sim(a, b):
+         """
+         Compute cosine similarity of 2 sets of vectors
+
+         Parameters:
+           a: torch.Tensor, shape: [m, n_features]
+
+           b: torch.Tensor, shape: [n, n_features]
+         """
+         return normalize(a, dim=-1) @ normalize(b, dim=-1).transpose(-2, -1)
+
+     @staticmethod
+     def euc_sim(a, b):
+         """
+         Compute euclidean similarity of 2 sets of vectors
+         Parameters:
+           a: torch.Tensor, shape: [m, n_features]
+           b: torch.Tensor, shape: [n, n_features]
+         """
+         return 2 * a @ b.transpose(-2, -1) - (a**2).sum(dim=1)[..., :, None] - (b**2).sum(dim=1)[..., None, :]
+
+     def max_sim(self, a, b):
+         """
+         Compute maximum similarity (or minimum distance) of each vector
+         in a with all of the vectors in b
+         Parameters:
+           a: torch.Tensor, shape: [m, n_features]
+           b: torch.Tensor, shape: [n, n_features]
+         """
+         if self.mode == 'cosine':
+             sim_func = self.cos_sim
+         elif self.mode == 'euclidean':
+             sim_func = self.euc_sim
+         sim = sim_func(a, b)
+         max_sim_v, max_sim_i = sim.max(dim=-1)
+         return max_sim_v, max_sim_i
+
+     def fit_predict(self, X):
+         """
+         Combination of fit() and predict() methods.
+         This is faster than calling fit() and predict() separately.
+         Parameters:
+           X: torch.Tensor, shape: [n_samples, n_features]
+           centroids: {torch.Tensor, None}, default: None
+             if given, centroids will be initialized with given tensor
+             if None, centroids will be randomly chosen from X
+         Return:
+           labels: torch.Tensor, shape: [n_samples]
+
+         mini_=33kk/k*remain
+         mini=min(mini_,fea_shape)
+         offset=log2(k/1000)*1.5
+         kpp_all=min(mini_*10/offset,fea_shape)
+         kpp_sample=min(mini_/12/offset,fea_shape)
+         """
+         assert isinstance(X, torch.Tensor), "input must be torch.Tensor"
+         assert X.dtype in [torch.half, torch.float, torch.double], "input must be floating point"
+         assert X.ndim == 2, "input must be a 2d tensor with shape: [n_samples, n_features]"
+
+         offset = np.power(1.5, np.log(self.n_clusters / 1000)) / np.log(2)
+         with torch.no_grad():
+             batch_size = X.shape[0]
+             start_time = time()
+             if self.minibatch * 10 // offset < batch_size:
+                 x = X[torch.randint(0, batch_size, [int(self.minibatch * 10 / offset)])].to(self.device)
+             else:
+                 x = X.to(self.device)
+             self.centroids = _kpp(x, self.n_clusters, min(int(self.minibatch / 12 / offset), batch_size))
+             del x
+             torch.cuda.empty_cache()
+             num_points_in_clusters = torch.ones(self.n_clusters, device=self.device, dtype=X.dtype)  # all ones
+             closest = None
+             if self.minibatch >= batch_size // 2 and self.minibatch < batch_size:
+                 X = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
+             elif self.minibatch >= batch_size:
+                 X = X.to(self.device)
+             for i in range(self.max_iter):
+                 iter_time = time()
+                 if self.minibatch < batch_size // 2:  # usable minibatch is too small: copy from RAM to VRAM every iteration
+                     x = X[torch.randint(0, batch_size, [self.minibatch])].to(self.device)
+                 else:  # otherwise everything is already cached on the device
+                     x = X
+
+                 closest = self.max_sim(a=x, b=self.centroids)[1].to(torch.int16)  # cluster id per sample, 0..n_clusters-1
+                 matched_clusters, counts = closest.unique(return_counts=True)
+                 expanded_closest = closest[None].expand(self.n_clusters, -1)
+                 mask = (expanded_closest == torch.arange(self.n_clusters, device=self.device)[:, None]).to(X.dtype)
+                 c_grad = mask @ x / mask.sum(-1)[..., :, None]
+                 c_grad[c_grad != c_grad] = 0  # remove NaNs
+                 error = (c_grad - self.centroids).pow(2).sum()
+                 if self.minibatch is not None:
+                     lr = 1 / num_points_in_clusters[:, None] * 0.9 + 0.1
+                 else:
+                     lr = 1
+                 matched_clusters = matched_clusters.long()  # index tensors must be long
+                 num_points_in_clusters[matched_clusters] += counts
+                 self.centroids = self.centroids * (1 - lr) + c_grad * lr
+                 if self.verbose >= 2:
+                     print('iter:', i, 'error:', error.item(), 'time spent:', round(time() - iter_time, 4))
+                 if error <= self.tol:
+                     break
+
+         if self.verbose >= 1:
+             print(f'used {i+1} iterations ({round(time()-start_time, 4)}s) to cluster {batch_size} items into {self.n_clusters} clusters')
+         return closest
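A minimal sketch of driving `KMeansGPU` directly (assumes a CUDA device, since the constructor sizes its minibatch from free GPU memory via pynvml; the feature shapes are illustrative):

    import torch
    from cluster.kmeans import KMeansGPU

    feats = torch.randn(100000, 256)        # [n_samples, n_features], kept on CPU until needed
    km = KMeansGPU(n_clusters=1000, mode="euclidean", verbose=1, max_iter=500, tol=1e-2)
    labels = km.fit_predict(feats)          # int16 cluster id per (sampled) row
    centroids = km.centroids.cpu().numpy()  # what train_cluster.py stores in its checkpoint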
cluster/train_cluster.py ADDED
@@ -0,0 +1,85 @@
+ import argparse
+ import logging
+ import os
+ import time
+ from pathlib import Path
+
+ import numpy as np
+ import torch
+ import tqdm
+ from kmeans import KMeansGPU
+ from sklearn.cluster import KMeans, MiniBatchKMeans
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False, use_gpu=False):
+     # GPU minibatch performs poorly; the library supports it, but it is not used here.
+     if str(in_dir).endswith(".ipynb_checkpoints"):
+         logger.info(f"Ignore {in_dir}")
+         return None  # skip notebook checkpoint directories (the upload logged this but fell through)
+
+     logger.info(f"Loading features from {in_dir}")
+     features = []
+     nums = 0
+     for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
+         features.append(torch.load(path, map_location="cpu").squeeze(0).numpy().T)
+     features = np.concatenate(features, axis=0)
+     print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
+     features = features.astype(np.float32)
+     logger.info(f"Clustering features of shape: {features.shape}")
+     t = time.time()
+     if use_gpu is False:
+         if use_minibatch:
+             kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
+         else:
+             kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
+     else:
+         kmeans = KMeansGPU(n_clusters=n_clusters, mode='euclidean', verbose=2 if verbose else 0, max_iter=500, tol=1e-2)
+         features = torch.from_numpy(features)
+         kmeans.fit_predict(features)
+
+     print(time.time() - t, "s")
+
+     x = {
+         "n_features_in_": kmeans.n_features_in_ if use_gpu is False else features.shape[1],
+         "_n_threads": kmeans._n_threads if use_gpu is False else 4,
+         "cluster_centers_": kmeans.cluster_centers_ if use_gpu is False else kmeans.centroids.cpu().numpy(),
+     }
+     print("end")
+
+     return x
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--dataset', type=Path, default="./dataset/44k",
+                         help='path of training data directory')
+     parser.add_argument('--output', type=Path, default="logs/44k",
+                         help='path of model output directory')
+     parser.add_argument('--gpu', action='store_true', default=False,
+                         help='to use GPU')
+
+     args = parser.parse_args()
+
+     checkpoint_dir = args.output
+     dataset = args.dataset
+     use_gpu = args.gpu
+     n_clusters = 10000
+
+     ckpt = {}
+     for spk in os.listdir(dataset):
+         if os.path.isdir(dataset/spk):
+             print(f"train kmeans for {spk}...")
+             in_dir = dataset/spk
+             x = train_cluster(in_dir, n_clusters, use_minibatch=False, verbose=False, use_gpu=use_gpu)
+             ckpt[spk] = x
+
+     checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
+     checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
+     torch.save(
+         ckpt,
+         checkpoint_path,
+     )
configs/config.json ADDED
@@ -0,0 +1,94 @@
+ {
+   "train": {
+     "log_interval": 20,
+     "eval_interval": 20,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 0.0001,
+     "betas": [0.8, 0.99],
+     "eps": 1e-09,
+     "batch_size": 6,
+     "fp16_run": false,
+     "lr_decay": 0.999875,
+     "segment_size": 10240,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "use_sr": true,
+     "max_speclen": 512,
+     "port": "8001",
+     "keep_ckpts": 3
+   },
+   "data": {
+     "training_files": "filelists/train.txt",
+     "validation_files": "filelists/val.txt",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": 22050
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "n_layers_q": 3,
+     "use_spectral_norm": false,
+     "gin_channels": 256,
+     "ssl_dim": 256,
+     "n_speakers": 200,
+     "speech_encoder": "vec256l9"
+   },
+   "spk": {
+     "Shengshuyan": 0
+   }
+ }
configs/diffusion.yaml ADDED
@@ -0,0 +1,48 @@
+ data:
+   sampling_rate: 44100
+   block_size: 512  # equal to hop_length
+   duration: 2  # audio duration during training; must be less than the shortest clip in the dataset
+   encoder: 'vec768l12'  # 'hubertsoft', 'vec256l9', 'vec768l12'
+   cnhubertsoft_gate: 10
+   encoder_sample_rate: 16000
+   encoder_hop_size: 320
+   encoder_out_channels: 768  # 256 if using 'hubertsoft'
+   training_files: "filelists/train.txt"
+   validation_files: "filelists/val.txt"
+   extensions:  # list of extensions to include in the data collection
+     - wav
+ model:
+   type: 'Diffusion'
+   n_layers: 20
+   n_chans: 512
+   n_hidden: 256
+   use_pitch_aug: true
+   n_spk: 1  # max number of different speakers
+   device: cuda
+ vocoder:
+   type: 'nsf-hifigan'
+   ckpt: 'pretrain/nsf_hifigan/model'
+ infer:
+   speedup: 10
+   method: 'dpm-solver'  # 'pndm' or 'dpm-solver'
+ env:
+   expdir: logs/44k/diffusion
+   gpu_id: 0
+ train:
+   num_workers: 2  # if your CPU and GPU are both very strong, 0 may be faster
+   amp_dtype: fp32  # fp32, fp16 or bf16 (fp16 or bf16 may be faster if your GPU supports them)
+   batch_size: 48
+   cache_all_data: true  # false saves RAM/VRAM, but may be slow
+   cache_device: 'cpu'  # set to 'cuda' to cache the data in GPU memory; fastest on a strong GPU
+   cache_fp16: true
+   epochs: 100000
+   interval_log: 10
+   interval_val: 2000
+   interval_force_save: 10000
+   lr: 0.0002
+   decay_step: 100000
+   gamma: 0.5
+   weight_decay: 0
+   save_opt: false
+ spk:
+   'nyaru': 0
configs_template/config_template.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 800,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 0.0001,
+     "betas": [0.8, 0.99],
+     "eps": 1e-09,
+     "batch_size": 6,
+     "fp16_run": false,
+     "half_type": "fp16",
+     "lr_decay": 0.999875,
+     "segment_size": 10240,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "use_sr": true,
+     "max_speclen": 512,
+     "port": "8001",
+     "keep_ckpts": 3,
+     "all_in_mem": false,
+     "vol_aug": false
+   },
+   "data": {
+     "training_files": "filelists/train.txt",
+     "validation_files": "filelists/val.txt",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": 22050,
+     "unit_interpolate_mode": "nearest"
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 768,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_initial_channel": 512,
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "n_layers_q": 3,
+     "n_flow_layer": 4,
+     "use_spectral_norm": false,
+     "gin_channels": 768,
+     "ssl_dim": 768,
+     "n_speakers": 200,
+     "vocoder_name": "nsf-hifigan",
+     "speech_encoder": "vec768l12",
+     "speaker_embedding": false,
+     "vol_embedding": false,
+     "use_depthwise_conv": false,
+     "flow_share_parameter": false,
+     "use_automatic_f0_prediction": true
+   },
+   "spk": {
+     "nyaru": 0,
+     "huiyu": 1,
+     "nen": 2,
+     "paimon": 3,
+     "yunhao": 4
+   }
+ }
configs_template/config_tiny_template.json ADDED
@@ -0,0 +1,77 @@
+ {
+   "train": {
+     "log_interval": 200,
+     "eval_interval": 800,
+     "seed": 1234,
+     "epochs": 10000,
+     "learning_rate": 0.0001,
+     "betas": [0.8, 0.99],
+     "eps": 1e-09,
+     "batch_size": 6,
+     "fp16_run": false,
+     "half_type": "fp16",
+     "lr_decay": 0.999875,
+     "segment_size": 10240,
+     "init_lr_ratio": 1,
+     "warmup_epochs": 0,
+     "c_mel": 45,
+     "c_kl": 1.0,
+     "use_sr": true,
+     "max_speclen": 512,
+     "port": "8001",
+     "keep_ckpts": 3,
+     "all_in_mem": false,
+     "vol_aug": false
+   },
+   "data": {
+     "training_files": "filelists/train.txt",
+     "validation_files": "filelists/val.txt",
+     "max_wav_value": 32768.0,
+     "sampling_rate": 44100,
+     "filter_length": 2048,
+     "hop_length": 512,
+     "win_length": 2048,
+     "n_mel_channels": 80,
+     "mel_fmin": 0.0,
+     "mel_fmax": 22050,
+     "unit_interpolate_mode": "nearest"
+   },
+   "model": {
+     "inter_channels": 192,
+     "hidden_channels": 192,
+     "filter_channels": 512,
+     "n_heads": 2,
+     "n_layers": 6,
+     "kernel_size": 3,
+     "p_dropout": 0.1,
+     "resblock": "1",
+     "resblock_kernel_sizes": [3, 7, 11],
+     "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+     "upsample_rates": [8, 8, 2, 2, 2],
+     "upsample_initial_channel": 400,
+     "upsample_kernel_sizes": [16, 16, 4, 4, 4],
+     "n_layers_q": 3,
+     "n_flow_layer": 4,
+     "use_spectral_norm": false,
+     "gin_channels": 768,
+     "ssl_dim": 768,
+     "n_speakers": 200,
+     "vocoder_name": "nsf-hifigan",
+     "speech_encoder": "vec768l12",
+     "speaker_embedding": false,
+     "vol_embedding": false,
+     "use_depthwise_conv": true,
+     "flow_share_parameter": true,
+     "use_automatic_f0_prediction": true
+   },
+   "spk": {
+     "nyaru": 0,
+     "huiyu": 1,
+     "nen": 2,
+     "paimon": 3,
+     "yunhao": 4
+   }
+ }
configs_template/diffusion_template.yaml ADDED
@@ -0,0 +1,51 @@
+ data:
+   sampling_rate: 44100
+   block_size: 512  # equal to hop_length
+   duration: 2  # audio duration during training; must be less than the shortest clip in the dataset
+   encoder: 'vec768l12'  # 'hubertsoft', 'vec256l9', 'vec768l12'
+   cnhubertsoft_gate: 10
+   encoder_sample_rate: 16000
+   encoder_hop_size: 320
+   encoder_out_channels: 768  # 256 if using 'hubertsoft'
+   training_files: "filelists/train.txt"
+   validation_files: "filelists/val.txt"
+   extensions:  # list of extensions to include in the data collection
+     - wav
+   unit_interpolate_mode: "nearest"
+ model:
+   type: 'Diffusion'
+   n_layers: 20
+   n_chans: 512
+   n_hidden: 256
+   use_pitch_aug: true
+   timesteps: 1000
+   k_step_max: 0  # must be <= timesteps; if 0, train all steps
+   n_spk: 1  # max number of different speakers
+   device: cuda
+ vocoder:
+   type: 'nsf-hifigan'
+   ckpt: 'pretrain/nsf_hifigan/model'
+ infer:
+   speedup: 10
+   method: 'dpm-solver++'  # 'pndm', 'dpm-solver', 'ddim', 'unipc' or 'dpm-solver++'
+ env:
+   expdir: logs/44k/diffusion
+   gpu_id: 0
+ train:
+   num_workers: 4  # if your CPU and GPU are both very strong, 0 may be faster
+   amp_dtype: fp32  # fp32, fp16 or bf16 (fp16 or bf16 may be faster if your GPU supports them)
+   batch_size: 48
+   cache_all_data: true  # false saves RAM/VRAM, but may be slow
+   cache_device: 'cpu'  # set to 'cuda' to cache the data in GPU memory; fastest on a strong GPU
+   cache_fp16: true
+   epochs: 100000
+   interval_log: 10
+   interval_val: 2000
+   interval_force_save: 5000
+   lr: 0.0001
+   decay_step: 100000
+   gamma: 0.5
+   weight_decay: 0
+   save_opt: false
+ spk:
+   'nyaru': 0
dataset_raw/wav_structure.txt ADDED
@@ -0,0 +1,20 @@
+ Dataset preparation
+
+ raw
+ ├───speaker0
+ │    ├───xxx1-xxx1.wav
+ │    ├───...
+ │    └───Lxx-0xx8.wav
+ └───speaker1
+      ├───xx2-0xxx2.wav
+      ├───...
+      └───xxx7-xxx007.wav
+
+ In addition, edit config.json:
+
+ "n_speakers": 10
+
+ "spk": {
+     "speaker0": 0,
+     "speaker1": 1
+ }