Ritori committed
Commit a722365
1 Parent(s): 31e4d8c

Upload 72 files

This view is limited to 50 files because it contains too many changes.
.gitattributes CHANGED
@@ -1,36 +1,64 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- Yui_TrapGenesis_0[[:space:]](1).8991 filter=lfs diff=lfs merge=lfs -text
1
+ ###############################################################################
2
+ # Set default behavior to automatically normalize line endings.
3
+ ###############################################################################
4
+ * text=auto
5
+
6
+ ###############################################################################
7
+ # Set default behavior for command prompt diff.
8
+ #
9
+ # This is needed for earlier builds of msysgit that do not have it on by
10
+ # default for csharp files.
11
+ # Note: This is only used by command line
12
+ ###############################################################################
13
+ #*.cs diff=csharp
14
+
15
+ ###############################################################################
16
+ # Set the merge driver for project and solution files
17
+ #
18
+ # Merging from the command prompt will add diff markers to the files if there
19
+ # are conflicts (Merging from VS is not affected by the settings below, in VS
20
+ # the diff markers are never inserted). Diff markers may cause the following
21
+ # file extensions to fail to load in VS. An alternative would be to treat
22
+ # these files as binary and thus will always conflict and require user
23
+ # intervention with every merge. To do so, just uncomment the entries below
24
+ ###############################################################################
25
+ #*.sln merge=binary
26
+ #*.csproj merge=binary
27
+ #*.vbproj merge=binary
28
+ #*.vcxproj merge=binary
29
+ #*.vcproj merge=binary
30
+ #*.dbproj merge=binary
31
+ #*.fsproj merge=binary
32
+ #*.lsproj merge=binary
33
+ #*.wixproj merge=binary
34
+ #*.modelproj merge=binary
35
+ #*.sqlproj merge=binary
36
+ #*.wwaproj merge=binary
37
+
38
+ ###############################################################################
39
+ # behavior for image files
40
+ #
41
+ # image files are treated as binary by default.
42
+ ###############################################################################
43
+ #*.jpg binary
44
+ #*.png binary
45
+ #*.gif binary
46
+
47
+ ###############################################################################
48
+ # diff behavior for common document formats
49
+ #
50
+ # Convert binary document formats to text before diffing them. This feature
51
+ # is only available from the command line. Turn it on by uncommenting the
52
+ # entries below.
53
+ ###############################################################################
54
+ #*.doc diff=astextplain
55
+ #*.DOC diff=astextplain
56
+ #*.docx diff=astextplain
57
+ #*.DOCX diff=astextplain
58
+ #*.dot diff=astextplain
59
+ #*.DOT diff=astextplain
60
+ #*.pdf diff=astextplain
61
+ #*.PDF diff=astextplain
62
+ #*.rtf diff=astextplain
63
+ #*.RTF diff=astextplain
64
+ Yui_TrapGenesis filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,374 @@
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.rsuser
8
+ *.suo
9
+ *.user
10
+ *.userosscache
11
+ *.sln.docstates
12
+
13
+ # User-specific files (MonoDevelop/Xamarin Studio)
14
+ *.userprefs
15
+
16
+ # Mono auto generated files
17
+ mono_crash.*
18
+
19
+ # Build results
20
+ [Dd]ebug/
21
+ [Dd]ebugPublic/
22
+ [Rr]elease/
23
+ [Rr]eleases/
24
+ x64/
25
+ x86/
26
+ [Ww][Ii][Nn]32/
27
+ [Aa][Rr][Mm]/
28
+ [Aa][Rr][Mm]64/
29
+ bld/
30
+ [Bb]in/
31
+ [Oo]bj/
32
+ [Oo]ut/
33
+ [Ll]og/
34
+ [Ll]ogs/
35
+
36
+ # Visual Studio 2015/2017 cache/options directory
37
+ .vs/
38
+ # Uncomment if you have tasks that create the project's static files in wwwroot
39
+ #wwwroot/
40
+
41
+ # Visual Studio 2017 auto generated files
42
+ Generated\ Files/
43
+
44
+ # MSTest test Results
45
+ [Tt]est[Rr]esult*/
46
+ [Bb]uild[Ll]og.*
47
+
48
+ # NUnit
49
+ *.VisualState.xml
50
+ TestResult.xml
51
+ nunit-*.xml
52
+
53
+ # Build Results of an ATL Project
54
+ [Dd]ebugPS/
55
+ [Rr]eleasePS/
56
+ dlldata.c
57
+
58
+ # Benchmark Results
59
+ BenchmarkDotNet.Artifacts/
60
+
61
+ # .NET Core
62
+ project.lock.json
63
+ project.fragment.lock.json
64
+ artifacts/
65
+
66
+ # ASP.NET Scaffolding
67
+ ScaffoldingReadMe.txt
68
+
69
+ # StyleCop
70
+ StyleCopReport.xml
71
+
72
+ # Files built by Visual Studio
73
+ *_i.c
74
+ *_p.c
75
+ *_h.h
76
+ *.ilk
77
+ *.meta
78
+ *.obj
79
+ *.iobj
80
+ *.pch
81
+ *.pdb
82
+ *.ipdb
83
+ *.pgc
84
+ *.pgd
85
+ *.rsp
86
+ *.sbr
87
+ *.tlb
88
+ *.tli
89
+ *.tlh
90
+ *.tmp
91
+ *.tmp_proj
92
+ *_wpftmp.csproj
93
+ *.log
94
+ *.vspscc
95
+ *.vssscc
96
+ .builds
97
+ *.pidb
98
+ *.svclog
99
+ *.scc
100
+
101
+ # Chutzpah Test files
102
+ _Chutzpah*
103
+
104
+ # Visual C++ cache files
105
+ ipch/
106
+ *.aps
107
+ *.ncb
108
+ *.opendb
109
+ *.opensdf
110
+ *.sdf
111
+ *.cachefile
112
+ *.VC.db
113
+ *.VC.VC.opendb
114
+
115
+ # Visual Studio profiler
116
+ *.psess
117
+ *.vsp
118
+ *.vspx
119
+ *.sap
120
+
121
+ # Visual Studio Trace Files
122
+ *.e2e
123
+
124
+ # TFS 2012 Local Workspace
125
+ $tf/
126
+
127
+ # Guidance Automation Toolkit
128
+ *.gpState
129
+
130
+ # ReSharper is a .NET coding add-in
131
+ _ReSharper*/
132
+ *.[Rr]e[Ss]harper
133
+ *.DotSettings.user
134
+
135
+ # TeamCity is a build add-in
136
+ _TeamCity*
137
+
138
+ # DotCover is a Code Coverage Tool
139
+ *.dotCover
140
+
141
+ # AxoCover is a Code Coverage Tool
142
+ .axoCover/*
143
+ !.axoCover/settings.json
144
+
145
+ # Coverlet is a free, cross platform Code Coverage Tool
146
+ coverage*.json
147
+ coverage*.xml
148
+ coverage*.info
149
+
150
+ # Visual Studio code coverage results
151
+ *.coverage
152
+ *.coveragexml
153
+
154
+ # NCrunch
155
+ _NCrunch_*
156
+ .*crunch*.local.xml
157
+ nCrunchTemp_*
158
+
159
+ # MightyMoose
160
+ *.mm.*
161
+ AutoTest.Net/
162
+
163
+ # Web workbench (sass)
164
+ .sass-cache/
165
+
166
+ # Installshield output folder
167
+ [Ee]xpress/
168
+
169
+ # DocProject is a documentation generator add-in
170
+ DocProject/buildhelp/
171
+ DocProject/Help/*.HxT
172
+ DocProject/Help/*.HxC
173
+ DocProject/Help/*.hhc
174
+ DocProject/Help/*.hhk
175
+ DocProject/Help/*.hhp
176
+ DocProject/Help/Html2
177
+ DocProject/Help/html
178
+
179
+ # Click-Once directory
180
+ publish/
181
+
182
+ # Publish Web Output
183
+ *.[Pp]ublish.xml
184
+ *.azurePubxml
185
+ # Note: Comment the next line if you want to checkin your web deploy settings,
186
+ # but database connection strings (with potential passwords) will be unencrypted
187
+ *.pubxml
188
+ *.publishproj
189
+
190
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
191
+ # checkin your Azure Web App publish settings, but sensitive information contained
192
+ # in these scripts will be unencrypted
193
+ PublishScripts/
194
+
195
+ # NuGet Packages
196
+ *.nupkg
197
+ # NuGet Symbol Packages
198
+ *.snupkg
199
+ # The packages folder can be ignored because of Package Restore
200
+ **/[Pp]ackages/*
201
+ # except build/, which is used as an MSBuild target.
202
+ !**/[Pp]ackages/build/
203
+ # Uncomment if necessary however generally it will be regenerated when needed
204
+ #!**/[Pp]ackages/repositories.config
205
+ # NuGet v3's project.json files produces more ignorable files
206
+ *.nuget.props
207
+ *.nuget.targets
208
+
209
+ # Microsoft Azure Build Output
210
+ csx/
211
+ *.build.csdef
212
+
213
+ # Microsoft Azure Emulator
214
+ ecf/
215
+ rcf/
216
+
217
+ # Windows Store app package directories and files
218
+ AppPackages/
219
+ BundleArtifacts/
220
+ Package.StoreAssociation.xml
221
+ _pkginfo.txt
222
+ *.appx
223
+ *.appxbundle
224
+ *.appxupload
225
+
226
+ # Visual Studio cache files
227
+ # files ending in .cache can be ignored
228
+ *.[Cc]ache
229
+ # but keep track of directories ending in .cache
230
+ !?*.[Cc]ache/
231
+
232
+ # Others
233
+ ClientBin/
234
+ ~$*
235
+ *~
236
+ *.dbmdl
237
+ *.dbproj.schemaview
238
+ *.jfm
239
+ *.pfx
240
+ *.publishsettings
241
+ orleans.codegen.cs
242
+
243
+ # Including strong name files can present a security risk
244
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245
+ #*.snk
246
+
247
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
248
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249
+ #bower_components/
250
+
251
+ # RIA/Silverlight projects
252
+ Generated_Code/
253
+
254
+ # Backup & report files from converting an old project file
255
+ # to a newer Visual Studio version. Backup files are not needed,
256
+ # because we have git ;-)
257
+ _UpgradeReport_Files/
258
+ Backup*/
259
+ UpgradeLog*.XML
260
+ UpgradeLog*.htm
261
+ ServiceFabricBackup/
262
+ *.rptproj.bak
263
+
264
+ # SQL Server files
265
+ *.mdf
266
+ *.ldf
267
+ *.ndf
268
+
269
+ # Business Intelligence projects
270
+ *.rdl.data
271
+ *.bim.layout
272
+ *.bim_*.settings
273
+ *.rptproj.rsuser
274
+ *- [Bb]ackup.rdl
275
+ *- [Bb]ackup ([0-9]).rdl
276
+ *- [Bb]ackup ([0-9][0-9]).rdl
277
+
278
+ # Microsoft Fakes
279
+ FakesAssemblies/
280
+
281
+ # GhostDoc plugin setting file
282
+ *.GhostDoc.xml
283
+
284
+ # Node.js Tools for Visual Studio
285
+ .ntvs_analysis.dat
286
+ node_modules/
287
+
288
+ # Visual Studio 6 build log
289
+ *.plg
290
+
291
+ # Visual Studio 6 workspace options file
292
+ *.opt
293
+
294
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295
+ *.vbw
296
+
297
+ # Visual Studio LightSwitch build output
298
+ **/*.HTMLClient/GeneratedArtifacts
299
+ **/*.DesktopClient/GeneratedArtifacts
300
+ **/*.DesktopClient/ModelManifest.xml
301
+ **/*.Server/GeneratedArtifacts
302
+ **/*.Server/ModelManifest.xml
303
+ _Pvt_Extensions
304
+
305
+ # Paket dependency manager
306
+ .paket/paket.exe
307
+ paket-files/
308
+
309
+ # FAKE - F# Make
310
+ .fake/
311
+
312
+ # CodeRush personal settings
313
+ .cr/personal
314
+
315
+ # Python Tools for Visual Studio (PTVS)
316
+ __pycache__/
317
+ *.pyc
318
+
319
+ # Cake - Uncomment if you are using it
320
+ # tools/**
321
+ # !tools/packages.config
322
+
323
+ # Tabs Studio
324
+ *.tss
325
+
326
+ # Telerik's JustMock configuration file
327
+ *.jmconfig
328
+
329
+ # BizTalk build output
330
+ *.btp.cs
331
+ *.btm.cs
332
+ *.odx.cs
333
+ *.xsd.cs
334
+
335
+ # OpenCover UI analysis results
336
+ OpenCover/
337
+
338
+ # Azure Stream Analytics local run output
339
+ ASALocalRun/
340
+
341
+ # MSBuild Binary and Structured Log
342
+ *.binlog
343
+
344
+ # NVidia Nsight GPU debugger configuration file
345
+ *.nvuser
346
+
347
+ # MFractors (Xamarin productivity tool) working folder
348
+ .mfractor/
349
+
350
+ # Local History for Visual Studio
351
+ .localhistory/
352
+
353
+ # BeatPulse healthcheck temp database
354
+ healthchecksdb
355
+
356
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
357
+ MigrationBackup/
358
+
359
+ # Ionide (cross platform F# VS Code tools) working folder
360
+ .ionide/
361
+
362
+ # Fody - auto-generated XML schema
363
+ FodyWeavers.xsd
364
+
365
+ # models
366
+ /ayachi_*
367
+ /inaba_*
368
+ /tomotake_*
369
+ /murasame_*
370
+ /arihara_*
371
+ /waveglow_*
372
+
373
+ # jupyter cache
374
+ /.ipynb_checkpoints
.gitmodules ADDED
@@ -0,0 +1,4 @@
1
+ [submodule "waveglow"]
2
+ path = waveglow
3
+ url = https://github.com/NVIDIA/waveglow
4
+ branch = master
Dockerfile ADDED
@@ -0,0 +1,10 @@
1
+ FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
2
+ ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
3
+
4
+ RUN apt-get update -y
5
+
6
+ RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 pillow jupyter
7
+
8
+ ADD apex /apex/
9
+ WORKDIR /apex/
10
+ RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
LICENSE ADDED
@@ -0,0 +1,29 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2018, NVIDIA Corporation
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ * Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ * Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ * Neither the name of the copyright holder nor the names of its
17
+ contributors may be used to endorse or promote products derived from
18
+ this software without specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
README.md CHANGED
@@ -1,3 +1,43 @@
1
  ---
2
- pipeline_tag: text-to-speech
3
- ---
1
  ---
2
+ title: TTS_Yui
3
+ app_file: Yue_gradio_cpu.py
4
+ sdk: gradio
5
+ sdk_version: 3.36.1
6
+ ---
7
+ # Tacotron2-Japanese
8
+ - Tacotron2 implementation for Japanese
9
+ ## Links
10
+ * Reference: [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2)
11
+ * [Pre-trained Tacotron2 models](https://github.com/CjangCjengh/TTSModels)
12
+ * [Latest changes can be viewed in this repository](https://github.com/StarxSky/tacotron2-JP)
13
+
14
+ ## How to use
15
+ 1. Put raw Japanese texts in ./filelists
16
+ 2. Put WAV files in ./wav
17
+ 3. (Optional) Download NVIDIA's [pretrained model](https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing)
18
+ 4. Open ./train.ipynb to install requirements and start training
19
+ 5. Download NVIDIA's [WaveGlow model](https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF)
20
+ 6. Open ./inference.ipynb to generate voice
21
+
22
+ ## Cleaners
23
+ Set the cleaner in ./hparams.py, line 30
24
+ ### 1. 'japanese_cleaners'
25
+ #### Before
26
+ 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
27
+ #### After
28
+ nanikaacltaraitsudemohanashItekudasai.gakuiNnokotojanaku,shijinikaNsurukotodemonanidemo.
29
+ ### 2. 'japanese_tokenization_cleaners'
30
+ #### Before
31
+ 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
32
+ #### After
33
+ nani ka acl tara itsu demo hanashi te kudasai. gakuiN no koto ja naku, shiji nikaNsuru koto de mo naNdemo.
34
+ ### 3. 'japanese_accent_cleaners'
35
+ #### Before
36
+ 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
37
+ #### After
38
+ :na)nika a)cltara i)tsudemo ha(na)shIte ku(dasa)i.:ga(kuiNno ko(to)janaku,:shi)jini ka(Nsu)ru ko(to)demo na)nidemo.
39
+ ### 4. 'japanese_phrase_cleaners'
40
+ #### Before
41
+ 何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
42
+ #### After
43
+ nanika acltara itsudemo hanashIte kudasai. gakuiNno kotojanaku, shijini kaNsuru kotodemo nanidemo.
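For reference, a minimal sketch of how the cleaner choice above maps to code. It follows the `text_to_sequence` call used in Yue_gradio.py and data_utils.py in this upload; the exact symbol IDs printed depend on the repository's symbol table.

```python
# Minimal sketch: selecting a cleaner and converting Japanese text to a symbol sequence,
# mirroring the call pattern in Yue_gradio.py / data_utils.py.
from text import text_to_sequence  # module provided by this repository

text_cleaner = 'japanese_phrase_cleaners'  # any of the four cleaners listed above
text = '何かあったらいつでも話して下さい。'

# text_to_sequence takes the raw text and a list of cleaner names
sequence = text_to_sequence(text, [text_cleaner])
print(sequence[:10])  # integer symbol IDs fed to the Tacotron2 encoder
```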
Yue_gradio.py ADDED
@@ -0,0 +1,243 @@
1
+ # This version works well
2
+
3
+ import os
4
+ os.system('pip install -U tensorflow')
5
+ os.system('pip install -q unidecode tensorboardX')
6
+ os.system('pip install librosa==0.8.0')
7
+ os.system('pip install pysoundfile==0.9.0.post1')
8
+ os.system('pip install unidecode==1.3.4')
9
+ os.system('pip install pyopenjtalk --no-build-isolation')
10
+ os.system('pip install inflect==5.6.2')
11
+ os.system('pip install janome==0.4.2')
12
+ os.system('pip install tqdm -q')
13
+ os.system('pip install gdown')
14
+ os.system('pip install -q librosa unidecode')
15
+
16
+ os.system('pip install ipython')
17
+ os.system('pip install --upgrade jupyter ipywidgets')
18
+ os.system('jupyter nbextension enable --py widgetsnbextension')
19
+ os.system('pip uninstall -y tqdm')
20
+ os.system('pip install tqdm')
21
+
22
+ import time
23
+ import pyopenjtalk
24
+ import soundfile as sf
25
+ import gradio as gr
26
+ import torch
27
+ import IPython.display as ipd
28
+ import numpy as np
29
+ import torch
30
+ import json
31
+ from hparams import create_hparams
32
+ from model import Tacotron2
33
+ from layers import TacotronSTFT
34
+ from audio_processing import griffin_lim
35
+ from text import text_to_sequence
36
+ from env import AttrDict
37
+ from meldataset import MAX_WAV_VALUE
38
+ from models import Generator
39
+
40
+ #@title Configure and run
41
+
42
+ # International HiFi-GAN model (sounds a bit robotic): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
43
+ #@markdown Put the path to your trained tacotron2 model in `Tacotron2_Model`
44
+ Tacotron2_Model = '/content/Yui_TrapGenesis'#@param {type:"string"}
45
+ TACOTRON2_ID = Tacotron2_Model
46
+ HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
47
+ #@markdown Choose the cleaner used to preprocess the text
48
+ text_cleaner = 'japanese_phrase_cleaners'#@param {type:"string"}
49
+ import pyopenjtalk
50
+ import soundfile as sf
51
+ import gradio as gr
52
+
53
+ # Global variable declarations
54
+ model = None
55
+ hparams = None
56
+ hifigan = None
57
+ thisdict = None
58
+ pronounciation_dictionary = False
59
+ show_graphs = False # add the show_graphs variable with a default value
60
+
61
+ # Initialization function
62
+ def initialize():
63
+ global model, hparams, hifigan, thisdict, pronounciation_dictionary
64
+
65
+ # Check whether initialization has already been done
66
+ try:
67
+ initialized
68
+ except NameError:
69
+ print("Setting up, please wait.\n")
70
+
71
+ from tqdm.notebook import tqdm
72
+ with tqdm(total=5, leave=False) as pbar:
73
+ import os
74
+ from os.path import exists, join, basename, splitext
75
+ git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git'
76
+ project_name = splitext(basename(git_repo_url))[0]
77
+ if not exists(project_name):
78
+ # clone and install
79
+ os.system(f'git clone -q --recursive {git_repo_url}')
80
+ os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')
81
+
82
+ pbar.update(1) # downloaded TT2 and HiFi-GAN
83
+ import sys
84
+ sys.path.append('hifi-gan')
85
+ sys.path.append(project_name)
86
+ import time
87
+ import matplotlib
88
+ import matplotlib.pylab as plt
89
+ import gdown
90
+ d = 'https://drive.google.com/uc?id='
91
+
92
+ # %matplotlib inline
93
+ import IPython.display as ipd
94
+ import numpy as np
95
+ import torch
96
+ import json
97
+ from hparams import create_hparams
98
+ from model import Tacotron2
99
+ from layers import TacotronSTFT
100
+ from audio_processing import griffin_lim
101
+ from text import text_to_sequence
102
+ from env import AttrDict
103
+ from meldataset import MAX_WAV_VALUE
104
+ from models import Generator
105
+
106
+ pbar.update(1) # initialized dependencies
107
+
108
+ graph_width = 900
109
+ graph_height = 360
110
+ def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
111
+ # %matplotlib inline
112
+ fig, axes = plt.subplots(1, len(data), figsize=figsize)
113
+ for i in range(len(data)):
114
+ axes[i].imshow(data[i], aspect='auto', origin='upper',
115
+ interpolation='none', cmap='inferno')
116
+ fig.canvas.draw()
117
+ plt.show()
118
+
119
+ # Set up pronunciation dictionary
120
+ os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')
121
+ thisdict = {}
122
+ for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
123
+ thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()
124
+
125
+ pbar.update(1) # Downloaded and set up pronunciation dictionary
126
+
127
+ def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
128
+ out = ''
129
+ for word_ in text.split(" "):
130
+ word=word_; end_chars = ''
131
+ while any(elem in word for elem in punctuation) and len(word) > 1:
132
+ if word[-1] in punctuation: end_chars = word[-1] + end_chars; word = word[:-1]
133
+ else: break
134
+ try:
135
+ word_arpa = thisdict[word.upper()]
136
+ word = "{" + str(word_arpa) + "}"
137
+ except KeyError: pass
138
+ out = (out + " " + word + end_chars).strip()
139
+ if EOS_Token and out[-1] != ";": out += ";"
140
+ return out
141
+
142
+ def get_hifigan(MODEL_ID):
143
+ # Download HiFi-GAN
144
+ hifigan_pretrained_model = 'hifimodel'
145
+ gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
146
+ if not exists(hifigan_pretrained_model):
147
+ raise Exception("HiFi-GAN model failed to download!")
148
+
149
+ # Load HiFi-GAN
150
+ conf = os.path.join("hifi-gan", "config_v1.json")
151
+ with open(conf) as f:
152
+ json_config = json.loads(f.read())
153
+ h = AttrDict(json_config)
154
+ torch.manual_seed(h.seed)
155
+ hifigan = Generator(h).to(torch.device("cuda"))
156
+ state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
157
+ hifigan.load_state_dict(state_dict_g["generator"])
158
+ hifigan.eval()
159
+ hifigan.remove_weight_norm()
160
+ return hifigan, h
161
+
162
+ hifigan, h = get_hifigan(HIFIGAN_ID)
163
+ pbar.update(1) # Downloaded and Set up HiFi-GAN
164
+
165
+ def has_MMI(STATE_DICT):
166
+ return any(True for x in STATE_DICT.keys() if "mi." in x)
167
+
168
+ def get_Tactron2(MODEL_ID):
169
+ # Download Tacotron2
170
+ tacotron2_pretrained_model = TACOTRON2_ID
171
+ if not exists(tacotron2_pretrained_model):
172
+ raise Exception("Tacotron2 model failed to download!")
173
+ # Load Tacotron2 and Config
174
+ hparams = create_hparams()
175
+ hparams.sampling_rate = 22050
176
+ hparams.max_decoder_steps = 2000 # Max Duration
177
+ hparams.gate_threshold = 0.80 # Model must be 80% sure the clip is over before ending generation
178
+ model = Tacotron2(hparams)
179
+ state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
180
+ if has_MMI(state_dict):
181
+ raise Exception("ERROR: This notebook does not currently support MMI models.")
182
+ model.load_state_dict(state_dict)
183
+ _ = model.cuda().eval().half()
184
+ return model, hparams
185
+
186
+ model, hparams = get_Tactron2(TACOTRON2_ID)
187
+ previous_tt2_id = TACOTRON2_ID
188
+
189
+ pbar.update(1) # Downloaded and Set up Tacotron2
190
+
191
+ # Run initialization
192
+ initialize()
193
+
194
+ import soundfile as sf
195
+
196
+ def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
197
+ audio = None # variable used to store the generated audio data
198
+ for i in [x for x in text.split("\n") if len(x)]:
199
+ if not pronounciation_dictionary:
200
+ if i[-1] != ";":
201
+ i = i + ";"
202
+ else:
203
+ i = ARPA(i)
204
+ with torch.no_grad():
205
+ sequence = np.array(text_to_sequence(i, [text_cleaner]))[None, :]
206
+ sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
207
+ mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
208
+ if show_graphs:
209
+ plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
210
+ alignments.float().data.cpu().numpy()[0].T))
211
+ y_g_hat = hifigan(mel_outputs_postnet.float())
212
+ audio = y_g_hat.squeeze()
213
+ audio = audio * MAX_WAV_VALUE
214
+ output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
215
+ sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
216
+ print(f"Audio saved as {output_filename}")
217
+ print("")
218
+ ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
219
+ return audio # return the audio data
220
+
221
+ # Text-to-speech conversion function
222
+ def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):
223
+ global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs
224
+
225
+ hparams.max_decoder_steps = max_decoder_steps
226
+ hparams.gate_threshold = gate_threshold
227
+ output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
228
+ audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs)
229
+ if audio is not None:
230
+ sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
231
+ return output_filename
232
+ else:
233
+ return None
234
+
235
+ # Gradio interface
236
+ inputs = [
237
+ gr.inputs.Textbox(lines=3, label="Input text"),
238
+ gr.inputs.Slider(minimum=100, maximum=5000, default=2000, step=100, label="Max decoder steps"),
239
+ gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.5, step=0.05, label="Gate threshold")
240
+ ]
241
+ outputs = gr.outputs.File(label="Download the generated audio")
242
+
243
+ gr.Interface(fn=text_to_speech, inputs=inputs, outputs=outputs).launch(debug=True,share=True)
Yui_TrapGenesis ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c902e404953f4e52dae8cfc9e63bc673ef7654a4779fc3a461f290f3acaa43c
3
+ size 338428823
__pycache__/audio_processing.cpython-310.pyc ADDED
Binary file (2.77 kB).
 
__pycache__/env.cpython-310.pyc ADDED
Binary file (797 Bytes).
 
__pycache__/hifiutils.cpython-310.pyc ADDED
Binary file (2.01 kB).
 
__pycache__/hparams.cpython-310.pyc ADDED
Binary file (1.9 kB).
 
__pycache__/layers.cpython-310.pyc ADDED
Binary file (3.37 kB).
 
__pycache__/meldataset.cpython-310.pyc ADDED
Binary file (5.34 kB).
 
__pycache__/model.cpython-310.pyc ADDED
Binary file (14.9 kB).
 
__pycache__/models.cpython-310.pyc ADDED
Binary file (8.64 kB).
 
__pycache__/stft.cpython-310.pyc ADDED
Binary file (4.77 kB).
 
__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.48 kB).
 
audio_processing.py ADDED
@@ -0,0 +1,93 @@
1
+ import torch
2
+ import numpy as np
3
+ from scipy.signal import get_window
4
+ import librosa.util as librosa_util
5
+
6
+
7
+ def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
8
+ n_fft=800, dtype=np.float32, norm=None):
9
+ """
10
+ # from librosa 0.6
11
+ Compute the sum-square envelope of a window function at a given hop length.
12
+
13
+ This is used to estimate modulation effects induced by windowing
14
+ observations in short-time fourier transforms.
15
+
16
+ Parameters
17
+ ----------
18
+ window : string, tuple, number, callable, or list-like
19
+ Window specification, as in `get_window`
20
+
21
+ n_frames : int > 0
22
+ The number of analysis frames
23
+
24
+ hop_length : int > 0
25
+ The number of samples to advance between frames
26
+
27
+ win_length : [optional]
28
+ The length of the window function. By default, this matches `n_fft`.
29
+
30
+ n_fft : int > 0
31
+ The length of each analysis frame.
32
+
33
+ dtype : np.dtype
34
+ The data type of the output
35
+
36
+ Returns
37
+ -------
38
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
39
+ The sum-squared envelope of the window function
40
+ """
41
+ if win_length is None:
42
+ win_length = n_fft
43
+
44
+ n = n_fft + hop_length * (n_frames - 1)
45
+ x = np.zeros(n, dtype=dtype)
46
+
47
+ # Compute the squared window at the desired length
48
+ win_sq = get_window(window, win_length, fftbins=True)
49
+ win_sq = librosa_util.normalize(win_sq, norm=norm)**2
50
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
51
+
52
+ # Fill the envelope
53
+ for i in range(n_frames):
54
+ sample = i * hop_length
55
+ x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
56
+ return x
57
+
58
+
59
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
60
+ """
61
+ PARAMS
62
+ ------
63
+ magnitudes: spectrogram magnitudes
64
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
65
+ """
66
+
67
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
68
+ angles = angles.astype(np.float32)
69
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
70
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
71
+
72
+ for i in range(n_iters):
73
+ _, angles = stft_fn.transform(signal)
74
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
75
+ return signal
76
+
77
+
78
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
79
+ """
80
+ PARAMS
81
+ ------
82
+ C: compression factor
83
+ """
84
+ return torch.log(torch.clamp(x, min=clip_val) * C)
85
+
86
+
87
+ def dynamic_range_decompression(x, C=1):
88
+ """
89
+ PARAMS
90
+ ------
91
+ C: compression factor used to compress
92
+ """
93
+ return torch.exp(x) / C
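The compression/decompression pair above moves mel spectrograms into and out of log space; a small round-trip sketch (values are illustrative, only `torch` and this file are assumed):

```python
# Round-trip check for the dynamic range helpers defined above.
import torch
from audio_processing import dynamic_range_compression, dynamic_range_decompression

mel = torch.rand(80, 100) * 5.0                    # fake mel magnitudes: 80 bands x 100 frames
log_mel = dynamic_range_compression(mel)           # log(clamp(x, min=1e-5) * C), with C = 1
restored = dynamic_range_decompression(log_mel)    # exp(x) / C
print(torch.allclose(mel, restored, atol=1e-4))    # True for values above the clip threshold
```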
colab-train-zh-cn.ipynb ADDED
The diff for this file is too large to render.
 
colab.ipynb ADDED
The diff for this file is too large to render.
 
data_utils.py ADDED
@@ -0,0 +1,111 @@
1
+ import random
2
+ import numpy as np
3
+ import torch
4
+ import torch.utils.data
5
+
6
+ import layers
7
+ from utils import load_wav_to_torch, load_filepaths_and_text
8
+ from text import text_to_sequence
9
+
10
+
11
+ class TextMelLoader(torch.utils.data.Dataset):
12
+ """
13
+ 1) loads audio,text pairs
14
+ 2) normalizes text and converts them to sequences of one-hot vectors
15
+ 3) computes mel-spectrograms from audio files.
16
+ """
17
+ def __init__(self, audiopaths_and_text, hparams):
18
+ self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
19
+ self.text_cleaners = hparams.text_cleaners
20
+ self.max_wav_value = hparams.max_wav_value
21
+ self.sampling_rate = hparams.sampling_rate
22
+ self.load_mel_from_disk = hparams.load_mel_from_disk
23
+ self.stft = layers.TacotronSTFT(
24
+ hparams.filter_length, hparams.hop_length, hparams.win_length,
25
+ hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
26
+ hparams.mel_fmax)
27
+ random.seed(hparams.seed)
28
+ random.shuffle(self.audiopaths_and_text)
29
+
30
+ def get_mel_text_pair(self, audiopath_and_text):
31
+ # separate filename and text
32
+ audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
33
+ text = self.get_text(text)
34
+ mel = self.get_mel(audiopath)
35
+ return (text, mel)
36
+
37
+ def get_mel(self, filename):
38
+ if not self.load_mel_from_disk:
39
+ audio, sampling_rate = load_wav_to_torch(filename)
40
+ if sampling_rate != self.stft.sampling_rate:
41
+ raise ValueError("{} SR doesn't match target {} SR".format(
42
+ sampling_rate, self.stft.sampling_rate))
43
+ audio_norm = audio / self.max_wav_value
44
+ audio_norm = audio_norm.unsqueeze(0)
45
+ audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
46
+ melspec = self.stft.mel_spectrogram(audio_norm)
47
+ melspec = torch.squeeze(melspec, 0)
48
+ else:
49
+ melspec = torch.from_numpy(np.load(filename))
50
+ assert melspec.size(0) == self.stft.n_mel_channels, (
51
+ 'Mel dimension mismatch: given {}, expected {}'.format(
52
+ melspec.size(0), self.stft.n_mel_channels))
53
+
54
+ return melspec
55
+
56
+ def get_text(self, text):
57
+ text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
58
+ return text_norm
59
+
60
+ def __getitem__(self, index):
61
+ return self.get_mel_text_pair(self.audiopaths_and_text[index])
62
+
63
+ def __len__(self):
64
+ return len(self.audiopaths_and_text)
65
+
66
+
67
+ class TextMelCollate():
68
+ """ Zero-pads model inputs and targets based on number of frames per step
69
+ """
70
+ def __init__(self, n_frames_per_step):
71
+ self.n_frames_per_step = n_frames_per_step
72
+
73
+ def __call__(self, batch):
74
+ """Collates training batch from normalized text and mel-spectrogram
75
+ PARAMS
76
+ ------
77
+ batch: [text_normalized, mel_normalized]
78
+ """
79
+ # Right zero-pad all one-hot text sequences to max input length
80
+ input_lengths, ids_sorted_decreasing = torch.sort(
81
+ torch.LongTensor([len(x[0]) for x in batch]),
82
+ dim=0, descending=True)
83
+ max_input_len = input_lengths[0]
84
+
85
+ text_padded = torch.LongTensor(len(batch), max_input_len)
86
+ text_padded.zero_()
87
+ for i in range(len(ids_sorted_decreasing)):
88
+ text = batch[ids_sorted_decreasing[i]][0]
89
+ text_padded[i, :text.size(0)] = text
90
+
91
+ # Right zero-pad mel-spec
92
+ num_mels = batch[0][1].size(0)
93
+ max_target_len = max([x[1].size(1) for x in batch])
94
+ if max_target_len % self.n_frames_per_step != 0:
95
+ max_target_len += self.n_frames_per_step - max_target_len % self.n_frames_per_step
96
+ assert max_target_len % self.n_frames_per_step == 0
97
+
98
+ # include mel padded and gate padded
99
+ mel_padded = torch.FloatTensor(len(batch), num_mels, max_target_len)
100
+ mel_padded.zero_()
101
+ gate_padded = torch.FloatTensor(len(batch), max_target_len)
102
+ gate_padded.zero_()
103
+ output_lengths = torch.LongTensor(len(batch))
104
+ for i in range(len(ids_sorted_decreasing)):
105
+ mel = batch[ids_sorted_decreasing[i]][1]
106
+ mel_padded[i, :, :mel.size(1)] = mel
107
+ gate_padded[i, mel.size(1)-1:] = 1
108
+ output_lengths[i] = mel.size(1)
109
+
110
+ return text_padded, input_lengths, mel_padded, gate_padded, \
111
+ output_lengths
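A hedged sketch of how TextMelLoader and TextMelCollate are typically wired into a DataLoader; the training script itself is not part of this view, and the hparams fields (`batch_size`, `n_frames_per_step`) are assumed to exist in create_hparams() as in the reference Tacotron2 code:

```python
# Hypothetical training-side usage of the dataset and collate function above.
from torch.utils.data import DataLoader
from hparams import create_hparams
from data_utils import TextMelLoader, TextMelCollate

hparams = create_hparams()
trainset = TextMelLoader('filelists/transcript_train.txt', hparams)  # filelist uploaded in this commit
collate_fn = TextMelCollate(hparams.n_frames_per_step)

loader = DataLoader(trainset, batch_size=hparams.batch_size, shuffle=True,
                    collate_fn=collate_fn, drop_last=True)
text_padded, input_lengths, mel_padded, gate_padded, output_lengths = next(iter(loader))
```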
demo.wav ADDED
Binary file (148 kB).
 
distributed.py ADDED
@@ -0,0 +1,173 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+ from torch.nn.modules import Module
4
+ from torch.autograd import Variable
5
+
6
+ def _flatten_dense_tensors(tensors):
7
+ """Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
8
+ same dense type.
9
+ Since inputs are dense, the resulting tensor will be a concatenated 1D
10
+ buffer. Element-wise operation on this buffer will be equivalent to
11
+ operating individually.
12
+ Arguments:
13
+ tensors (Iterable[Tensor]): dense tensors to flatten.
14
+ Returns:
15
+ A contiguous 1D buffer containing input tensors.
16
+ """
17
+ if len(tensors) == 1:
18
+ return tensors[0].contiguous().view(-1)
19
+ flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
20
+ return flat
21
+
22
+ def _unflatten_dense_tensors(flat, tensors):
23
+ """View a flat buffer using the sizes of tensors. Assume that tensors are of
24
+ same dense type, and that flat is given by _flatten_dense_tensors.
25
+ Arguments:
26
+ flat (Tensor): flattened dense tensors to unflatten.
27
+ tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
28
+ unflatten flat.
29
+ Returns:
30
+ Unflattened dense tensors with sizes same as tensors and values from
31
+ flat.
32
+ """
33
+ outputs = []
34
+ offset = 0
35
+ for tensor in tensors:
36
+ numel = tensor.numel()
37
+ outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
38
+ offset += numel
39
+ return tuple(outputs)
40
+
41
+
42
+ '''
43
+ This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
44
+ launcher included with this example. It assumes that your run is using multiprocess with 1
45
+ GPU/process, that the model is on the correct device, and that torch.set_device has been
46
+ used to set the device.
47
+
48
+ Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
49
+ and will be allreduced at the finish of the backward pass.
50
+ '''
51
+ class DistributedDataParallel(Module):
52
+
53
+ def __init__(self, module):
54
+ super(DistributedDataParallel, self).__init__()
55
+ #fallback for PyTorch 0.3
56
+ if not hasattr(dist, '_backend'):
57
+ self.warn_on_half = True
58
+ else:
59
+ self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
60
+
61
+ self.module = module
62
+
63
+ for p in self.module.state_dict().values():
64
+ if not torch.is_tensor(p):
65
+ continue
66
+ dist.broadcast(p, 0)
67
+
68
+ def allreduce_params():
69
+ if(self.needs_reduction):
70
+ self.needs_reduction = False
71
+ buckets = {}
72
+ for param in self.module.parameters():
73
+ if param.requires_grad and param.grad is not None:
74
+ tp = type(param.data)
75
+ if tp not in buckets:
76
+ buckets[tp] = []
77
+ buckets[tp].append(param)
78
+ if self.warn_on_half:
79
+ if torch.cuda.HalfTensor in buckets:
80
+ print("WARNING: gloo dist backend for half parameters may be extremely slow." +
81
+ " It is recommended to use the NCCL backend in this case. This currently requires" +
82
+ "PyTorch built from top of tree master.")
83
+ self.warn_on_half = False
84
+
85
+ for tp in buckets:
86
+ bucket = buckets[tp]
87
+ grads = [param.grad.data for param in bucket]
88
+ coalesced = _flatten_dense_tensors(grads)
89
+ dist.all_reduce(coalesced)
90
+ coalesced /= dist.get_world_size()
91
+ for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
92
+ buf.copy_(synced)
93
+
94
+ for param in list(self.module.parameters()):
95
+ def allreduce_hook(*unused):
96
+ param._execution_engine.queue_callback(allreduce_params)
97
+ if param.requires_grad:
98
+ param.register_hook(allreduce_hook)
99
+
100
+ def forward(self, *inputs, **kwargs):
101
+ self.needs_reduction = True
102
+ return self.module(*inputs, **kwargs)
103
+
104
+ '''
105
+ def _sync_buffers(self):
106
+ buffers = list(self.module._all_buffers())
107
+ if len(buffers) > 0:
108
+ # cross-node buffer sync
109
+ flat_buffers = _flatten_dense_tensors(buffers)
110
+ dist.broadcast(flat_buffers, 0)
111
+ for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
112
+ buf.copy_(synced)
113
+ def train(self, mode=True):
114
+ # Clear NCCL communicator and CUDA event cache of the default group ID,
115
+ # These cache will be recreated at the later call. This is currently a
116
+ # work-around for a potential NCCL deadlock.
117
+ if dist._backend == dist.dist_backend.NCCL:
118
+ dist._clear_group_cache()
119
+ super(DistributedDataParallel, self).train(mode)
120
+ self.module.train(mode)
121
+ '''
122
+ '''
123
+ Modifies existing model to do gradient allreduce, but doesn't change class
124
+ so you don't need "module"
125
+ '''
126
+ def apply_gradient_allreduce(module):
127
+ if not hasattr(dist, '_backend'):
128
+ module.warn_on_half = True
129
+ else:
130
+ module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
131
+
132
+ for p in module.state_dict().values():
133
+ if not torch.is_tensor(p):
134
+ continue
135
+ dist.broadcast(p, 0)
136
+
137
+ def allreduce_params():
138
+ if(module.needs_reduction):
139
+ module.needs_reduction = False
140
+ buckets = {}
141
+ for param in module.parameters():
142
+ if param.requires_grad and param.grad is not None:
143
+ tp = param.data.dtype
144
+ if tp not in buckets:
145
+ buckets[tp] = []
146
+ buckets[tp].append(param)
147
+ if module.warn_on_half:
148
+ if torch.cuda.HalfTensor in buckets:
149
+ print("WARNING: gloo dist backend for half parameters may be extremely slow." +
150
+ " It is recommended to use the NCCL backend in this case. This currently requires" +
151
+ " PyTorch built from top of tree master.")
152
+ module.warn_on_half = False
153
+
154
+ for tp in buckets:
155
+ bucket = buckets[tp]
156
+ grads = [param.grad.data for param in bucket]
157
+ coalesced = _flatten_dense_tensors(grads)
158
+ dist.all_reduce(coalesced)
159
+ coalesced /= dist.get_world_size()
160
+ for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
161
+ buf.copy_(synced)
162
+
163
+ for param in list(module.parameters()):
164
+ def allreduce_hook(*unused):
165
+ Variable._execution_engine.queue_callback(allreduce_params)
166
+ if param.requires_grad:
167
+ param.register_hook(allreduce_hook)
168
+
169
+ def set_needs_reduction(self, input, output):
170
+ self.needs_reduction = True
171
+
172
+ module.register_forward_hook(set_needs_reduction)
173
+ return module
env.py ADDED
@@ -0,0 +1,15 @@
1
+ import os
2
+ import shutil
3
+
4
+
5
+ class AttrDict(dict):
6
+ def __init__(self, *args, **kwargs):
7
+ super(AttrDict, self).__init__(*args, **kwargs)
8
+ self.__dict__ = self
9
+
10
+
11
+ def build_env(config, config_name, path):
12
+ t_path = os.path.join(path, config_name)
13
+ if config != t_path:
14
+ os.makedirs(path, exist_ok=True)
15
+ shutil.copyfile(config, os.path.join(path, config_name))
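AttrDict simply exposes dictionary keys as attributes, which is how Yue_gradio.py consumes the HiFi-GAN config; a short sketch (the config path is illustrative):

```python
# AttrDict usage as in Yue_gradio.py: load a JSON config and read keys as attributes.
import json
from env import AttrDict

with open('hifi-gan/config_v1.json') as f:
    h = AttrDict(json.loads(f.read()))

print(h.seed)     # attribute access...
print(h['seed'])  # ...and plain dict access return the same value
```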
filelists/transcript_train.txt ADDED
The diff for this file is too large to render.
 
filelists/transcript_val.txt ADDED
@@ -0,0 +1,426 @@
1
+ wav/nen001_001.wav|はい?呼びました?
2
+ wav/nen001_012.wav|ほしな君も
3
+ wav/nen001_024.wav|さすがに白蛇占いはできませんよ
4
+ wav/nen001_035.wav|わかりました。ありがとう……ございます
5
+ wav/nen001_049.wav|んっ、んっ、んくっ……ひっ、あっ、ぁっ、ぁっ、んんーーッ……
6
+ wav/nen001_060.wav|あああぁぁ……今は、止められなくて……じゅる……はぁ、はぁぁ……あぁぁぁあぁ……
7
+ wav/nen001_072.wav|ほしな君。珍しいところで会いますね。図書室で何か調べ物ですか?こんな時間まで大変ですね
8
+ wav/nen002_004.wav|そうですか
9
+ wav/nen002_018.wav|そうですか……
10
+ wav/nen002_036.wav|あっ、あれは、違うんです。別に悩みとか、ストレスじゃなくて……じ……事情が……ありまして
11
+ wav/nen002_051.wav|そんな風に光るなんて、私も初めて見ました一体何をしたんですか?
12
+ wav/nen002_062.wav|ど、どうして……?一体どこに……欠片が……今まで集めた欠片が……やっぱりさっきの羽根は……
13
+ wav/nen002_074.wav|それで、あの……気分はどうですか?
14
+ wav/nen002_089.wav|占ったんです、その高安先輩の交際相手である女の子の恋愛運を
15
+ wav/nen002_100.wav|そうですね。そういう人も含まれると思います
16
+ wav/nen002_113.wav|はい。私に、“心の欠片”のことを教えてくれた人に
17
+ wav/nen003_007.wav|はい、私の知り合いが営んでる喫茶店です
18
+ wav/nen003_019.wav|諦め?受け入れる?
19
+ wav/nen003_031.wav|あの……一つ、思ったことがあるんですが……
20
+ wav/nen003_047.wav|ななおは人間じゃないんです。私が契約を結んだ、アルプなんです
21
+ wav/nen003_059.wav|楽しそうにしてる時でも、どこか楽しみきれていないと言いますか。そういう気持ちは、私にもありますから
22
+ wav/nen004_011.wav|は、はい?
23
+ wav/nen005_008.wav|いえ、まだです。おそらく何もないとは思うんですが、万が一ということもあります
24
+ wav/nen005_022.wav|おこです。激おこです
25
+ wav/nen005_035.wav|いえ、困っていることがあって、私に力になれることでしたらお手伝いさせてもらいます
26
+ wav/nen005_049.wav|そう言われても事実なので
27
+ wav/nen005_060.wav|そうですね……私では解決できないような依頼も、いくつかありましたね
28
+ wav/nen005_071.wav|まあ、気は進まないんですけどね……はぁ……
29
+ wav/nen005_089.wav|ごめんなさい。でも、これが欠片の回収方法なんです
30
+ wav/nen005_100.wav|ななおに訊いても無駄ですよ。ななおは猫なんですから
31
+ wav/nen006_005.wav|私も同じです。一般的な意見なら、多少は集まりましたが……
32
+ wav/nen006_018.wav|わかりました~
33
+ wav/nen006_030.wav|じゃあ、続けますね
34
+ wav/nen006_043.wav|ありがとう、ございます……んっ、んんんっ、ひっ、ひっ、ふーーーぅ……ひっ、ひっ、ふーーーーぅ……
35
+ wav/nen006_056.wav|諦めるのはまだ早いと思います。ここにはまだテクニックが記されていますから
36
+ wav/nen006_068.wav|え?突然どうしたんですか?ほしな君に謝罪されるようなこと、ありましたか?
37
+ wav/nen007_006.wav|一部ということは……そうじゃない人には、受け入れてもらえた、ということですか?
38
+ wav/nen008_009.wav|折を見て、自分の分を買いに行こうと思ってます
39
+ wav/nen008_021.wav|そ、そうなんですか……?
40
+ wav/nen008_032.wav|もう1時間ぐらいしてますから
41
+ wav/nen008_044.wav|いなばさん……ありがとうございます。それでは、お言葉に甘えさせてもらってもいいですか?
42
+ wav/nen008_056.wav|私たちでリハーサル?
43
+ wav/nen009_004.wav|はい。川上君の悩みは、本当にデートのことでいいんでしょうか?
44
+ wav/nen009_016.wav|はい、大丈夫ですよ、時間はまだ10分ほど余裕がありますから
45
+ wav/nen009_027.wav|そうですね……ほしな君、川上君の予定では映画の後はどうなっていますか?
46
+ wav/nen009_039.wav|このことは、川上君にも伝えておいた方がいいですね
47
+ wav/nen009_052.wav|本当ですか?丁度いい機会ですから、いっそ買ってしまうのもいいですね
48
+ wav/nen009_063.wav|川上君はしっかりプランを組んだりしているんですから、むしろ川上君が嫌がるかもしれませんね
49
+ wav/nen009_074.wav|私は何でもいいですよ。嫌いな物も特にありませんから
50
+ wav/nen009_085.wav|あの、これってもう取っていいんですか?
51
+ wav/nen009_100.wav|あ、甘エビ~♪
52
+ wav/nen010_007.wav|確かにそうですね。お礼の言葉を言ってもらえたりするのも、とても嬉しいものですからね
53
+ wav/nen010_021.wav|は、はぁ……はぁ……あり、がとう、ございますぅ……ほしなくんんんっ……
54
+ wav/nen010_032.wav|はい、あと少し……もう少し……んっ、んひっ、あっ、あっ、あっ……はあぁぁぁー……
55
+ wav/nen010_045.wav|ふーーー……ふーーー……
56
+ wav/nen010_057.wav|いえ、そうじゃなくてですね、その………………スースー、しますから
57
+ wav/nen010_074.wav|はい、どうぞ
58
+ wav/nen010_086.wav|私たちのオカルト研究部も、元々は黒魔術だったみたいですよ
59
+ wav/nen010_100.wav|は、はい、大事になる前に誤解をときましょう
60
+ wav/nen011_008.wav|私が勧めたんです。更衣室で着替えるのを恥ずかしそうにしていたので
61
+ wav/nen011_019.wav|あ、ダ、ダメですよ、変なところ触っちゃくすぐったいですから
62
+ wav/nen011_033.wav|どうしたんですか?なにか連絡事項が?
63
+ wav/nen011_044.wav|ではとがくし先輩の相談は、越路さんを説得すること、でいいんですか?
64
+ wav/nen012_003.wav|それで、どうでしたか?
65
+ wav/nen012_014.wav|あの、ほしな君
66
+ wav/nen012_026.wav|あれだけ反応が弱い欠片ですと、特に
67
+ wav/nen013_001.wav|もし本当に私の他に魔女がいるとしたら……困ったことになりますね
68
+ wav/nen013_015.wav|はい、問題ありません
69
+ wav/nen013_028.wav|ロ、ローター……です………………ローターですよぅ……
70
+ wav/nen014_001.wav|そうなんですか?どうかしたんですか?
71
+ wav/nen015_071.wav|それじゃあ今後とも、よろしくお願いします
72
+ wav/nen015_004.wav|はぁ、それはわかりました。でも、一つだけ答えてくれませんか?気になる事があるんです
73
+ wav/nen015_016.wav|え?それって、どういうことですか?
74
+ wav/nen015_031.wav|そんな普通に可愛い服だなんて卑怯ですっ。私なんてこんな恥ずかしい恰好なのにぃ理不尽です~!
75
+ wav/nen015_043.wav|魔女の契約の代償……と言うことですか
76
+ wav/nen015_056.wav|しいばさんはああ言ってくれましたが、私は別にこの学院を自分の領土だなんて言うつもりはありません
77
+ wav/nen016_003.wav|はい。また何か困ったことがあれば、いつでもどうぞ
78
+ wav/nen016_014.wav|つまり、私たちはこの部室から出ていかなければいけない、ということですか?
79
+ wav/nen016_027.wav|とにかく運営のすべきことは、ほしな君が言ったことと、先生方との折衝もでしょうか?
80
+ wav/nen016_039.wav|そういうことでしたら……お願いできますか?
81
+ wav/nen016_050.wav|なにか問題がありましたか?
82
+ wav/nen017_002.wav|全員揃っていますね。それじゃ行きましょうか
83
+ wav/nen017_015.wav|それじゃあ……ここからここまでを、まず完璧に覚えましょう。ここの基礎を覚えてしまえば、次も覚えやすいですから
84
+ wav/nen017_028.wav|え?なんですか?
85
+ wav/nen017_041.wav|だ、大丈夫……大丈夫なはず……ええ、絶対大丈夫です……おそらく、きっと、多分
86
+ wav/nen017_052.wav|確かにそれぐらいの余裕はありますが……
87
+ wav/nen017_064.wav|はい、お疲れ様でした
88
+ wav/nen018_012.wav|それじゃあ、一体どうしてですか?
89
+ wav/nen018_023.wav|大きな欠伸ですね
90
+ wav/nen018_036.wav|ちょっと皮がむけちゃって、真っ赤になっちゃってますよ
91
+ wav/nen018_050.wav|ほしな君のことを、応援していますし……それが、応援になるというのでしたら……もう一度
92
+ wav/nen019_002.wav|はい。よろしくお願いします
93
+ wav/nen019_013.wav|そうですか、ありがとう……ございます
94
+ wav/nen019_026.wav|ありがとうございます、しいばさん……言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった
95
+ wav/nen019_037.wav|予想よりも多くの人に集まってもらえて、準備してきた者としては嬉しい限りです
96
+ wav/nen020_102.wav|ぷぁ、はぁぁ………………疲れました
97
+ wav/nen020_114.wav|それに私もほしな君と同じで、あくまで部活の一環ですからね
98
+ wav/nen020_127.wav|もぅ、どうしてそういうことを言わせるんですか!
99
+ wav/nen020_139.wav|こちらのことは気にしないでいいんですよ?……こうして欠片が戻ったということは、ほしな君も嫌に思ってるわけじゃないんですよね?
100
+ wav/nen020_151.wav|はい。ほしな君は気付いていないかもしれませんが、笑顔が以前とは比べ物にならないぐらい自然ですから
101
+ wav/nen020_162.wav|かもしれません。でも、そういう部活も楽しくていいものですよ
102
+ wav/nen020_404.wav|ありがとうございます
103
+ wav/nen101_010.wav|はぁ……
104
+ wav/nen101_024.wav|いえ、買い物ではないんです。今日は色々疲れてしまったので……
105
+ wav/nen101_036.wav|��いんですか?
106
+ wav/nen101_048.wav|ほしな君は、このお店に入ったことがあるんですか?
107
+ wav/nen101_059.wav|ですが……こうして呪文を唱えなきゃいけないんですよね?とりあえず、初心者はこう頼むべし、って書いてありましたけど
108
+ wav/nen101_071.wav|えっ……あの、それって……
109
+ wav/nen101_086.wav|あ、美味しいですね。これがラーメン
110
+ wav/nen101_099.wav|あのほしな君、早く行きましょう
111
+ wav/nen101_111.wav|え?いえそんな、お礼を言われるような、大層なことは出来ていませんから
112
+ wav/nen101_126.wav|はい、さようなら
113
+ wav/nen102_005.wav|あの……それで、どうしたんですか?突然電話だなんて
114
+ wav/nen102_018.wav|それにですね、今朝に比べると大分マシにはなっています。ですから、このまま大人しくしていれば平気ですよ
115
+ wav/nen102_033.wav|どうぞ
116
+ wav/nen102_046.wav|私が嘘を吐いていないのは、ほしな君ならわかりますよね?
117
+ wav/nen102_057.wav|ですから、む……夢精をしちゃうような……いやらしい夢を見たんじゃないかなっと
118
+ wav/nen102_072.wav|私は一人暮らしですから。そういう思い出とは縁遠い生活ですね
119
+ wav/nen102_087.wav|今度は、ほしな君がおまじないをかける側になって下さい。そしたらきっと、私の恥ずかしさがわかってもらえるはずです
120
+ wav/nen102_099.wav|ひっ、んっ、んん……ふぅ、ふぅ……んんっ、んふぅ……んん……
121
+ wav/nen102_111.wav|はい。約束です
122
+ wav/nen102_124.wav|でも……気分が少しマシになったかもしれない。あのおまじないは効くのかな?
123
+ wav/nen103_010.wav|だから熱く語らないで下さい、思い出しちゃダメー、手をニギニギさせるのもダメですってばっ
124
+ wav/nen103_025.wav|私に、む、夢精……とか言わせたくせに、教えてくれないなんてズルいですよぅ!
125
+ wav/nen103_042.wav|ところで話は変わりますが、何かあったんですか?みんな、普段と様子が違うみたいですが
126
+ wav/nen103_053.wav|はい。先生が男の人と一緒に歩いているところを見かけましたよ
127
+ wav/nen103_070.wav|本命の質問だけでなく、無関係なダミーの質問も織り交ぜれば、怪しさも薄くなりませんか?
128
+ wav/nen103_082.wav|わかりました
129
+ wav/nen103_095.wav|そうなんですが……見られていないとわかっていても、恥ずかしいんですよぅ、この恰好
130
+ wav/nen103_106.wav|というよりも……一緒に行っていいですか?実は私もまだ書いていなくて……
131
+ wav/nen103_123.wav|あ、いえ、その……
132
+ wav/nen103_141.wav|は、はい?
133
+ wav/nen103_161.wav|私は、怒られたくないです……
134
+ wav/nen103_175.wav|あの……正直に言います。最近の私は変なんです
135
+ wav/nen103_189.wav|い、いえ、そんな風には思っていませんから、平気ですっ
136
+ wav/nen103_200.wav|それに……こんな私のことを知りたいって言ってくれたこと……嬉しかったです
137
+ wav/nen103_212.wav|こ、子供っぽいですよね?
138
+ wav/nen103_227.wav|なぅぅぅぅぅ……ほしな君のことを思うと心が落ち着かない……
139
+ wav/nen103_240.wav|あっ、うあっ、あぁぁぁぁぁぁぁぁぁぁぁっ
140
+ wav/nen103_251.wav|はあ、はあ、はぁああぁぁ……なにこれ、こんなにすごいの、しらない……いつもと、全然違う……んっ、ふーっ……ふーっ……
141
+ wav/nen103_262.wav|はぁ、はぁ、はぁ、はぁはぁはぁぁぁぁぁんっ、ぅぅぅぅぅぅうっ!
142
+ wav/nen103_273.wav|ひゃんっ……あ、あ、あぁぁぁ……ヤダぁ、止まらない、止まりませんよぉ……あ、あ、はぁぁぁぁ……っ
143
+ wav/nen104_007.wav|はい?なにが……ですか?
144
+ wav/nen104_020.wav|だってほしな君が言わせたんじゃないですかぁ
145
+ wav/nen104_031.wav|そうです。その通りです。い、今でももうおかしくなっているのに、これ以上は……
146
+ wav/nen104_043.wav|本当にごめんなさい
147
+ wav/nen104_054.wav|別に大変と言うほどのことは
148
+ wav/nen104_066.wav|そうなんですか?どうしてこんなにすぐに……いつも通り過ごしていたはずなのに
149
+ wav/nen104_078.wav|それじゃあ、考えておきます
150
+ wav/nen104_092.wav|もしよければ、その相手の怪しい行動についても、教えてもらえますか?
151
+ wav/nen104_106.wav|あっ……ぅっ……
152
+ wav/nen104_121.wav|な、なんでもないですよぅ。眠れなかったというだけですから
153
+ wav/nen104_136.wav|普通は引きますよね。一晩中オナニーしちゃうような女の子なんて……
154
+ wav/nen104_148.wav|そ、それじゃあ皆さん……あっ、んんっ……私は、お先に、失礼させてもらいます、ね……んんっ
155
+ wav/nen104_161.wav|でも、ダメでした。ちょっ���……無理そうです。答えは出そうにありません
156
+ wav/nen104_173.wav|ほしな君は、私のことを好きって言ってくれてますが私には、ほしな君にも言ってないことが……あるんです
157
+ wav/nen104_190.wav|濡れて……ます……発情が止まらなくて……ぅぅ……そ、そんな、ヘンタイな私でも好きって言ってくれますか?
158
+ wav/nen104_203.wav|もう無理です。我慢できません。自分が抑えられなくて………………だから先に謝っておきますね。ごめんなさいっ
159
+ wav/nen104_214.wav|んふーッ……じゅる、ちゅるるる……れる、れろれろれる……ちゅ、ちゅ……んちゅ
160
+ wav/nen104_226.wav|んっ、んんんーーーー……ぷぁ、はぁぁぁーー……はぁーっ……はぁーっ……
161
+ wav/nen104_239.wav|はい。わかりました――んぅっ、あ……あっ、あっ……んんぁ
162
+ wav/nen104_252.wav|あっ、はぁ、はぁ、はぁ……んんっ、んんんっ……ふーっ、ふーっ……んっ、んんーーっ
163
+ wav/nen104_264.wav|それは……はぁ、はぁ……んんっ、好きな人にされる方が、気持ちよくて……好きです……
164
+ wav/nen104_275.wav|んっ、ひぃぃぁぁぁぁぁあああっ
165
+ wav/nen104_287.wav|あ、あ、あの……そんなに、じっくり見ないで下さい……恥ずかしいんですから……
166
+ wav/nen104_299.wav|えっ?それは、やっぱり私のそこ、変ってことですか?色々自分で弄っちゃってるから、変なんですか?
167
+ wav/nen104_310.wav|ひぃぁあっ!そっ、そこっ、は……んっ、んんんっ、あ、あ、あ、あ、ああああああっ
168
+ wav/nen104_322.wav|ほ、ほしな君は、どうですか?
169
+ wav/nen104_333.wav|はっ、はっ、ああぁァァああんっ、びりびり、するぅ……はぁ、はぁ、はぁ……奥まで、きてますぅ
170
+ wav/nen104_344.wav|あああっ、頭、くらくらします……はぁはぁはぁ、ん、んんぅぅーーーーーーッ、もっと呼んでぇ、もっと名前を呼んで下さいぃ
171
+ wav/nen104_355.wav|きゃ、ぅぁっ……はぁ、はぁ、すごい、出てます、ヌルヌルのが、いっぱいっ
172
+ wav/nen104_370.wav|あ、あの、それはまた、後日にお願いします
173
+ wav/nen105_010.wav|い、いいですいいです、そんな仰々しいことっ
174
+ wav/nen105_026.wav|はい。お疲れ様でした
175
+ wav/nen105_041.wav|一人暮らし用の冷蔵庫だと小さいですから。野菜室があるタイプに買い換えようかとずっと悩んでいるんですが……
176
+ wav/nen105_058.wav|それになによりも、好きな人と一緒にいられる時間は私も好きですから
177
+ wav/nen105_071.wav|はい。頑張って作りますね
178
+ wav/nen105_087.wav|ん、れろ……れる、えるれろれろ……れるん………んっ、ちゅぅぱ、はぁ、はぁ、はぁっ、あぁんっ
179
+ wav/nen105_100.wav|あの……別に、そういう行為が嫌というわけじゃないんです。さっき、キスの前に言ったのは本当のことですから
180
+ wav/nen105_112.wav|ウソツキ……私のしたいこと、ワガママを言ってもいいって……そう言ってくれたじゃないですか
181
+ wav/nen105_127.wav|あ……あの、もう一度触っていいですか?今度はちゃんと優しく、丁寧に触りますから
182
+ wav/nen105_139.wav|もし痛かったら言って下さいね。ちゅ、ちゅ……ん……ちゅぅ……んっ、んんっ
183
+ wav/nen105_150.wav|んちゅ、じゅる……ちゅ、ちゅ……んんんー、舐めても舐めても、全然綺麗になりませんね。むしろ、ますますベトベトになってるような……
184
+ wav/nen105_162.wav|んぶ……ンッ、ちゅばちゅば……ちゅぶっ、ちゅぶぶ……んんーーっ、じゅるっ……じゅるるるるっ
185
+ wav/nen105_173.wav|んーー……じゃあ、見えなくしちゃいます……ん、じゅる、じゅるるる……ちゅ、ちゅぅぅぅぅーーー……ッ
186
+ wav/nen105_184.wav|ん……ッッ!?んっ、ぅぅぅっ……ん、んんーーー……んふぅー……ふぅー……ん、んむぅ……んんっ
187
+ wav/nen105_195.wav|はぁ……はぁ……んっ、はぁぁぁ……気持ち、よかったですか?
188
+ wav/nen106_002.wav|ちょっと待って下さいね。私も、最近は確認をしていなかったので
189
+ wav/nen106_014.wav|でも、予定は大丈夫なんですか?
190
+ wav/nen106_026.wav|は、はい。そうですね
191
+ wav/nen106_038.wav|んふぅ、んっじゅるっ、ぬちゅくちゅ……んぁ、はぁ、はぁ、はい。もう少し……はぁぁ、あむぅ……れろれるん、れちょれちょ
192
+ wav/nen106_050.wav|さようなら。また明日
193
+ wav/nen106_065.wav|いつも歩いている道ですから。それに、なるべく明るくて人気のあるところを通ります。大丈夫ですよ
194
+ wav/nen106_078.wav|すみません、気を遣わせてしまいまして。でも、本当にそれだけなので、心配は必要ありませんよ
195
+ wav/nen106_090.wav|あの、ちょっと待って下さい
196
+ wav/nen106_101.wav|そう言ってもらえて嬉しいです
197
+ wav/nen106_116.wav|は、はい。もちろんです……私も、大好きな人とキスしたい、です……
198
+ wav/nen106_129.wav|え?
199
+ wav/nen106_150.wav|それじゃあ、今日は失礼しますね
200
+ wav/nen107_003.wav|それじゃあ、お疲れ様でした。さようなら
201
+ wav/nen107_019.wav|そう、ですよね……今みたいな状態を続けても……仕方ないですよね
202
+ wav/nen107_035.wav|でも、でも……
203
+ wav/nen107_051.wav|それよりも、結局どうなんですか?私の気持ち、ちゃんと感じてもらえてますか?
204
+ wav/nen107_063.wav|それは……うっ……ぅぅぅ~~~……恥ずかしい、ですけど……今は、この温もりに包まれていたいです。そっちの方が重要です
205
+ wav/nen108_011.wav|でも急に泊まってもらうことになって……親御さんにもご迷惑を……
206
+ wav/nen108_024.wav|そうなんですか?えっと……気付いていませんでした。むしろ、私の方が甘えちゃっていますから……
207
+ wav/nen108_044.wav|んんっ、ふーっ……ふーっ……
208
+ wav/nen108_056.wav|授業に身が入らなくて……ず、ずっと、考えてたら……はぁ……はぁ……だ、だから……はぁ、はぁ、はぁ、はぁ
209
+ wav/nen108_069.wav|それは、だから……下のお口、ですとか……他にもありませんか?
210
+ wav/nen108_080.wav|ひあぁぁああぁぁああっ、それ、しび、れる……からだ、痺れちゃうっ、あ、あ、あああああ、そこ、吸うの、あっ、あああっ
211
+ wav/nen108_091.wav|ひゃあああぁっ、そんな、おま●こ全部を吸われたらぁ……あ、あ、あ、あ、我慢できませんっ、もう熱いですぅ、身体が熱くて仕方ないんです
212
+ wav/nen108_103.wav|ぁぁ……はぁー、はぁー……あ、これぇ、奥まで感じます……んぁぁ、はぁー……はぁー……
213
+ wav/nen108_114.wav|ひっ、ひああぁぁぁああぁ、それ、それ凄いですぅ……はぁはぁはぁ、あああぁぁあああぁっ
214
+ wav/nen108_125.wav|あっ、ああっ……やだぁ、エッチな音、してます……私の、おま●こから、エッチな音が……あっあっあっ、でも、我慢できなくてっ
215
+ wav/nen108_136.wav|はぁ、はぁ……はぁぁぁ……もう、ドロドロですよ……
216
+ wav/nen108_152.wav|それは……はい。確かにそういう気持ちはあります……
217
+ wav/nen108_168.wav|浮かない表情をしていました……
218
+ wav/nen108_181.wav|それは……どういう意味ですか?
219
+ wav/nen109_011.wav|そうですね……カラオケに、ボウリング、プリクラも……
220
+ wav/nen109_025.wav|はい……それじゃあ、えっと、えっと……
221
+ wav/nen109_040.wav|ここがいいでしょうか……それともこっち?
222
+ wav/nen109_052.wav|まだ色々やりたいことはあります、それは尽きませんけど………………でも本当に、後悔はしてませんよ
223
+ wav/nen109_069.wav|はい。私、幸せになります。それで、しゅうじ君のことも幸せにしてみせます
224
+ wav/kne110_008.wav|メッセージ……
225
+ wav/kne110_026.wav|こんな……形だけにこだわった物じゃないんです……でも、それはもう……無理なんですよね
226
+ wav/kne110_044.wav|はい。優しそうな人ですから
227
+ wav/nen110_013.wav|ぅっ……ぁぁ……ダメ……泣いたり、しない
228
+ wav/nen111_006.wav|言いたいこと……ですか?
229
+ wav/nen111_019.wav|ギターが欲しいんですよね?
230
+ wav/nen111_033.wav|やっぱり、未来が変わっちゃってるんですよね……
231
+ wav/nen111_047.wav|それは、えっと………………
232
+ wav/nen111_062.wav|で、ですから……わ、私の……オナニー………………オナニーですっ
233
+ wav/nen111_079.wav|は、はい。大丈夫です。すみません、驚かせてしまいまして
234
+ wav/nen111_092.wav|ひぁっ、ぅぅぅ~~~
235
+ wav/nen111_105.wav|保健室に行きますか?
236
+ wav/nen111_124.wav|好き……好きです、大好きです……私は貴方のことが大好きです。愛しています。もう離れたりしません
237
+ wav/nen112_011.wav|ほしな君はちゃんと以前から、力になってくれていましたよ
238
+ wav/nen112_029.wav|私だって嬉しいです。ほしな君が一緒にいてくれて……その、単純に近い場所にいてくれるってことじゃなくてですね
239
+ wav/nen112_043.wav|それで、いなばさんは……相談でいいんですよね?
240
+ wav/nen112_054.wav|少し考える時間をもらえますか?
241
+ wav/nen112_068.wav|いえ、そんなことはありません。私も嬉しいですよ
242
+ wav/nen112_080.wav|あ、あの、なんだか凄い騒ぎになってるみたいですけど……
243
+ wav/nen112_094.wav|時と場所さえ考えてもらえれば……私も、や……やぶさかではありませんが……え?え?も、もしかして今日って、そういうことなんですか?
244
+ wav/nen112_108.wav|そ、そうですね。見つかったらデートできなくなってしまいますよね
245
+ wav/nen112_121.wav|もぅっ!そんなに連続して呼ばれたら、嬉しすぎておかしくなっちゃいますよぅ
246
+ wav/nen112_133.wav|だって美味しいじゃないですか。それにほら、見た目も可愛いです
247
+ wav/nen112_145.wav|はぁ……美味しかったです
248
+ wav/nen112_159.wav|自分の身体なんですから、当たってることぐらい気付いてます……わかってはいますが……抱きついていたいんです
249
+ wav/nen112_173.wav|前は私のしたいことするデートでしたが……今回はしゅうじ君が私のために計画してくれたデートで、どこに行くのかドキドキして
250
+ wav/nen112_188.wav|私はしゅうじ君のこと、嫌いになったりなんてしないのに
251
+ wav/nen112_204.wav|お、お邪魔します
252
+ wav/nen112_217.wav|それにしても、しゅうじ君はお父さんとあんな風に喋るんですね。ちょっと、意外でした
253
+ wav/nen112_229.wav|いえ、平気です
254
+ wav/nen112_243.wav|お、女の子だって興奮とか、期待とか、もにょもにょしちゃうものなんですよぅ……
255
+ wav/nen112_259.wav|わ、わかりました……
256
+ wav/nen112_270.wav|は、はい……ぅぅぅぁッ……はっ、はぁー、はぁー……お願いします、続けて下さい……もっと、触って
257
+ wav/nen112_281.wav|あ、あ、あ、また……やっ、そんなに強く捻っちゃ……ひぁっ、んぃぃ……ッッ
258
+ wav/nen112_293.wav|あぁぁ、んぁああぁぁ……ッッ、2回、2回です……んっ、んんぅぅぅぁぁぁあッ、あっ、あっ、あああッッ
259
+ wav/nen112_304.wav|だ、だって……5回だなんて……恥ずかしいです。凄くエッチですから……
260
+ wav/nen112_315.wav|ぅぅ……また、そうやって全部言わせて……本当にイジワルですよぅ……
261
+ wav/nen112_327.wav|熱くて……硬くて……はぁ、はぁ、ぁぁぁあっ……前より太くて、おっきい気が、しますぅっ
262
+ wav/nen112_339.wav|ちゅっ、んん、ふぅぅ……んっ、んっ、んんぅぅぅぅ……ぅぅーーッ
263
+ wav/nen112_350.wav|だって、だって……んっ、ぅぅぅっ……こ、こんなに、グリグリされたら、こんな声も出ちゃいますよぅ……あっ、はぁはぁはぁはぁ
264
+ wav/nen112_361.wav|あ、はぁぁぁむ、んちゅ……ちゅ、ちゅ、ちゅ、じゅるる……んちゅ、ちゅぅぅーー……ん、んむぅ、んっ、じゅるる
265
+ wav/nen112_372.wav|私も……こんなにイってしまったのは、初めてです……やっぱりオナニーとは、全然違いますね……はっ、はぁぁ……
266
+ wav/nen112_386.wav|いえ、もう起きます
267
+ wav/nen113_171.wav|わ……わかりました……それなら……私、命令通りに、オナニーします
268
+ wav/nen113_182.wav|ひぁぁ!は、はい、はいっ……んんっ、んんぅぅ……ぅぅあっ、あっ、あっ、あっ
269
+ wav/nen113_193.wav|ちがっ、違うんです……お漏らしじゃなくて……ああ、もう……どうしてこんなにビショビショなの?まだ、乳首を刺激してるだけなのに
270
+ wav/nen113_205.wav|はぁ、はぁ、こ、ここら辺ですか?もう当たりますか?
271
+ wav/nen113_216.wav|ぁっ、ぁっ、ぁっ、ぅぅあっ、なにこれ……ダメっ、ダメっ……あっ、あっ、あっ、ぁぁぁああぁぁあ、イく……イっちゃう
272
+ wav/nen113_227.wav|はぁーっ……はぁーっ……気持ちいい、です。クリトリス、気持ちいい……
273
+ wav/nen113_238.wav|んんんっ!んぁっ、んぁっ、ダメ……手が、震えて、あっ、あっ!ローター……当てていられない……あっ、あっ
274
+ wav/nen113_249.wav|はっ、はひっ、あっ、あっ、あっ!イ、イく……もう、わらひ、我慢できませんよ……ああっ、あっ、あーーーーッ!
275
+ wav/nen113_260.wav|え?あ、ちょっと待って下さい……あっ……
276
+ wav/nen113_272.wav|わかりました。それじゃあ遠慮せず、沢山イきますね……はぁ、はぁ……
277
+ wav/nen113_284.wav|それに動きたいんですよね?気持ちよくなりたいんですよね?さっきから、わたしの中でおち●ちんがビクビク、してますよ
278
+ wav/nen113_295.wav|はぁ……はぁ……はぁ……はぁ……あっ……あっ……あっ、ああああぁぁぁぁぁぁぁぁぁああああああーーーーー!!
279
+ wav/nen113_306.wav|んふぅ……んっんっんんぅぅぅぁああ!はぁ!はぁ!あああっ、んんんーーーーー……んんぁぁああっ!
280
+ wav/nen113_317.wav|んひっ、あっ!あっ!はぁぁ……まだ、出てる……あっ、あっ、あっ、はぁぁ……ん、んんっ……
281
+ wav/nen113_328.wav|ちょっと?
282
+ wav/nen113_006.wav|そんなことありませんよ。さあ、遠慮せずに中に入って下さい
283
+ wav/nen113_017.wav|あのー……
284
+ wav/nen113_031.wav|さ、参考……ですか?川上君が考えたデートプランを実際に試してみる、とかじゃなく?
285
+ wav/nen113_044.wav|私はゲームセンターも好きですよ。普段は全然入ったこともありませんから、むしろ楽しみなぐらいです
286
+ wav/nen113_056.wav|あっ、しゅうじ君。あっちにもほら、クマのぬいぐるみがありますよ
287
+ wav/nen113_067.wav|私のことを考えてくれたからこそ、思い出の方を優先してくれたんですよね?
288
+ wav/nen113_081.wav|そうですね。特別やレアって言われてしまうと、試しに頼んでみたくなりますね
289
+ wav/nen113_096.wav|なにか違うこと考えてます
290
+ wav/nen113_107.wav|いえ、もうジュースが無くなっちゃいましたから……
291
+ wav/nen113_118.wav|でも……いつもよりは、疲れましたよね?
292
+ wav/nen113_132.wav|た、確かに……そうですね
293
+ wav/nen113_145.wav|それならいいんですが……
294
+ wav/nen113_162.wav|どっ、どうやってって
295
+ wav/nen114_017.wav|いえ。むしろ、こちらこそすみません。不透明な活動ばかりで……もっと結果が残るような物があればご迷惑もおかけしなかったんですが……
296
+ wav/nen114_028.wav|それに、パーティーで演奏しないとかりやさんはギターを披露できず、モヤモヤしたままになりませんか?
297
+ wav/nen114_042.wav|そっ、その言い方は……卑怯ですよぅ
298
+ wav/nen114_056.wav|そこも気になる部分ではあるんですが……
299
+ wav/nen114_072.wav|しゅうじ君を待っていたんです。最近、一緒にいられる時間が少ない気がして……なんとかしたいなと思って、終わるのを待ってたんです
300
+ wav/nen114_083.wav|女の子同士でもですか?
301
+ wav/nen114_097.wav|はい、できました
302
+ wav/nen114_111.wav|物じゃなくてですね、あの……ですから……しゅうじ君の願い事を、なんでも叶えます、私が
303
+ wav/nen114_124.wav|ダメです
304
+ wav/nen114_135.wav|んっ、んんーーーッ……んふぅ、ふぅー……ふぅー……んっ、んんっ、んむぅ……んぅ……も、もっと……しゅうじ君、もっと……
305
+ wav/nen114_146.wav|んぷぁぁっ、はっ、はぁ……はぁ……はひっ、んぁぁあ……はぁぁぁ……
306
+ wav/nen115_007.wav|でもその前に、私たちの演奏を聞いて下さい。一生懸命練習してきましたから
307
+ wav/nen115_021.wav|しゅうじ君は……誰に投票したんですか?
308
+ wav/nen115_037.wav|はぁ、ぁぁむ……ん、んんっ、ちゅちゅ……じゅる、ちゅぱちゅる、んっ、んんんんんーーーーーー
309
+ wav/nen115_049.wav|ひっ!?あっ、あっ、あああぁぁーーっ!
310
+ wav/nen115_061.wav|ずっとオナニー我慢してて……ぁぁぁぁあっ!しゅうじくん、しゅうじくん……っ、はぁ、はぁ、はぁぁあぁっ
311
+ wav/nen115_072.wav|ふぇぇ……?はっ、はぁ、はぁ、はぁ……ど、どうかしたんですか……?
312
+ wav/nen115_083.wav|好き、あっ、あっ、あっ、ひゅきでひゅ……おち●ちんにグリグリされるの……あっ、あっ、ああぁぁああっ!
313
+ wav/nen115_094.wav|あーー……あはーーー……はひ、はひっ……んへぁぁ……私、こんな下品な声を出してイっちゃった……はぁーっ……はぁーっ……
314
+ wav/nen115_106.wav|んっ、んっ、んんーーーーっ!はひっ、はひっ、んっ、んんんーーーーッ!
315
+ wav/nen115_118.wav|んひっ、あっ、あっ、んんっ、んんぁぁあっ、はぁーっ……はぁーっ……あっ、あっ、はぁぁぁぁぁ……
316
+ wav/nen115_129.wav|それに……こんなの、まるでおち●ちんが、私から生えたみたいです。しかも硬いままで……
317
+ wav/nen115_144.wav|んー……こんなものでしょうか
318
+ wav/nen115_156.wav|お願い?
319
+ wav/nen115_168.wav|んっ、ぅうぅ……はぁ、はぁ……んんっ、んんん……
320
+ wav/nen115_180.wav|はっ、んっ、んんぁっ、んぁっ……ぁぁあぁああぁ……引っかかるの、気持ちいい、です……んんー……ッッ
321
+ wav/nen115_191.wav|はぁー……はぁー……はぁー……ぁぁぁ、んんんっ……
322
+ wav/nen115_202.wav|あっ、あっ、あーーーっ……中、中が切なくて……はぁ、はぁ、はぁ、あの、もうオナニーじゃなくなってもいいですか?
323
+ wav/nen115_214.wav|だって……んぁぁ、ずっと待ってたんです。欲しくて、我慢してたんです……だから、仕方ないんですよ、ぁぁぁ……
324
+ wav/nen115_225.wav|違う、違うのぉ……身体が勝手に……ん、ん、ん、んぁぁあーーーぁぁぁぁ……こひゅれてる、気持ちいいの、こひゅれてるぅ
325
+ wav/nen115_236.wav|おま●こですっ、おま●こに欲しい……んっ、んぁ……精液、こっちで飲みたいんです、んぁ、んぁ、んぁーーっ!
326
+ wav/nen115_249.wav|え、えぇぇ……ま、まだ足りないんですか?こんなにドロドロにしたのに……
327
+ wav/nen115_262.wav|そう言ってもらえると……ありがとうございます
328
+ wav/nen116_001.wav|はい
329
+ wav/nen116_012.wav|私は……別に流されてもいいのに……
330
+ wav/nen116_026.wav|私にできることがあるなら、何でもします。だから、1人で苦しまないで下さい
331
+ wav/nen116_039.wav|はい、大丈夫です
332
+ wav/nen117_007.wav|どうしてそういうことを言うんですか!私の好きな人なのに!
333
+ wav/nen117_020.wav|あとですね、せっかくですからお泊まり用具の他にも色々用意してきたんです
334
+ wav/nen117_034.wav|あの、お風呂頂きました。お……お待たせ……しました、しゅうじ君
335
+ wav/nen117_047.wav|せっかく気合いを入れて身体も綺麗にしたのに……先に寝ちゃうなんてひどいです
336
+ wav/nen117_058.wav|よかった、安心しました
337
+ wav/nen117_069.wav|んぅぅ……ちゅ、ちゅ、んんんんーーッ……嫌じゃないですよ?むしろ……私は濃い方が好きかもしれません……ん、じゅる、じゅるりっ
338
+ wav/nen117_080.wav|じゃあ、続けますね。ん、ちゅ、ちゅぶぶ……んっ、じゅるっ、じゅぽじゅぽ、ちゅ、ちゅるるっ
339
+ wav/nen117_091.wav|んふぅ……ほら、こうして正直に教えてくれます、気持ちいいって
340
+ wav/nen117_103.wav|はぁ、はぁ、はぁ……すごい、トロトロと匂いが、さっきから止まりません……ああ、全然綺麗にできない
341
+ wav/nen117_114.wav|んぐっ……んぶ、んぶ……ッ……んんんんーーーーーッ!ん、んんーーーー……コク……コク……ん、んんんむぅ
342
+ wav/nen117_125.wav|ひゃっ、たくさん……あつい精液、びゅーって飛んで……あ、きゃっ、ひゃっ
343
+ wav/nen117_136.wav|ん、ちゅば、ちゅば……んんっ、れろれろ……ンンッ……はぁ、はぁ……れりょれりょ
344
+ wav/nen117_147.wav|んっ、んんっ、あむあむ……ぢゅぷ、ぢゅるるる……んぽくぽ、じゅるるるっ
345
+ wav/nen117_158.wav|あっ!ダメですよ、これは罰なのに、あ、きゃぁぁぁッ
346
+ wav/nen117_171.wav|んっ、んんんぁぁぁぁーーーーーーーー……ッッ!
347
+ wav/nen117_183.wav|あ、あ、ああーーーーっ……はぁ、はぁ……あ、あ、あ、それ、すごい……すごいぃぃ……んんぁあッッ
348
+ wav/nen117_194.wav|あっ、ひっ、んひぃぃッ……あーっ、あーっ……もうらめぇ…あ、あ、あ、イく、いっっ……くぅぅぅぅーーーーーぅぅぅぅううううッッ!!
349
+ wav/nen117_206.wav|んっ、あっ、あっ、あっ、あっ……そうなんですか?わたし、もうちゃんと、覚えてるんですか?
350
+ wav/nen117_217.wav|イっちゃうっ、わたひまたイっちゃうぅぅ……ッ
351
+ wav/nen117_228.wav|かひっ、かっ、はぁ、はぁ……んんんっ……はぁ、はぁ、んんっ、んぁ……ぁぁぁぁ……
352
+ wav/nen117_239.wav|それは、ちがっ、えっと、あががががががががががが――
353
+ wav/nen203_010.wav|はい、それは残念ながら
354
+ wav/nen203_025.wav|ご協力ありがとうございます。それは思い至ってませんでした、助かりました
355
+ wav/nen203_040.wav|心を許しあえるような相手が出来れば、おそらくは
356
+ wav/nen203_053.wav|すみません……明日もこうでないといいんですが……
357
+ wav/nen203_065.wav|あの、どうかしたんですか?いなばさん
358
+ wav/nen203_080.wav|それはたぶん、昨日話をした、胸の痛みに関わることなんですよね
359
+ wav/nen203_095.wav|占いなんて、あくまでも切っ掛けみたいなものですから
360
+ wav/nen203_111.wav|あ、あの、優しくしてください……それと、電気を消して……お願いです……
361
+ wav/nen203_127.wav|せっかくですし、一緒に入りませんか?
362
+ wav/nen204_006.wav|では、今日はこの辺りで解散にしましょうか
363
+ wav/nen205_018.wav|それでですね、ほしな君
364
+ wav/nen206_007.wav|ええ、ちょっと
365
+ wav/nen206_022.wav|そうですね。少なくとも、自分のせいっていうのはいなばさんの誤解かも知れませんし
366
+ wav/nen206_033.wav|やりとりをオープンにした方が、互いに痛くもない腹を探り合わないで済むと思います
367
+ wav/nen206_048.wav|もし、木月さんの行方が知れなくなったのが、魔法や契約と絡むことなら――
368
+ wav/nen206_063.wav|だから学院にも、なにも……
369
+ wav/nen207_016.wav|座りましたっ
370
+ wav/nen207_031.wav|え?そ、それはもちろんですけど
371
+ wav/nen209_001.wav|こんにちは
372
+ wav/nen210_009.wav|とりあえず……ほしな君にその、想定外に下着まで見せてしまったんですよね?
373
+ wav/nen210_023.wav|放課後、ななおのところまで付き合ってもらえませんか?
374
+ wav/nen210_039.wav|お待たせしました
375
+ wav/nen211_004.wav|はい。ですからほしな君の中には今、魔女2人のものである欠片がそれぞれにあります
376
+ wav/nen211_015.wav|そして、こうなってしまったものは仕方がありませんし、回収不可能なわけでもないんですから
377
+ wav/nen212_001.wav|う、上手くいったんですか?
378
+ wav/nen212_015.wav|はい、おかげさまで
379
+ wav/nen213_011.wav|生まれつき備えてしまっていた、あの能力のせいで
380
+ wav/nen213_025.wav|はあ……せ、交尾ですか
381
+ wav/nen214_010.wav|い、いえっなんでもっ
382
+ wav/nen215_012.wav|それもわかりますけど
383
+ wav/nen217_006.wav|とがくし先輩、その――
384
+ wav/nen218_009.wav|そこはまた、ご協力いただければ助かります
385
+ wav/nen219_005.wav|ハッピーハロウィンですね、いなばさん
386
+ wav/nen301_006.wav|ええ。私の方は、あともう少しで溜まりますから
387
+ wav/nen301_017.wav|はい、頑張ります
388
+ wav/nen302_010.wav|知っている方なんですか、2人とも?
389
+ wav/nen303_003.wav|なるほど。だったら、しいばさんはあまり近づき過ぎない方がいいかもしれません
390
+ wav/nen303_014.wav|はい、どうやらほしな君の心の穴が広がってしまった可能性がありそうです
391
+ wav/nen303_030.wav|いいんです、ほしな君が吸収してしまった分なら、ほとんど回収した後ですし
392
+ wav/nen303_045.wav|ほしな君の心の穴を埋めるのも、しいばさんにお任せした方が効率的かもしれません
393
+ wav/nen305_004.wav|こ、交尾をされたわけではないですよね?
394
+ wav/nen307_005.wav|もっとも、ほしな君が誘ったのはしいばさんです。しいばさん次第だと思いますが
395
+ wav/nen308_007.wav|ですがしばらくの間、話し相手になることにしました
396
+ wav/nen310_006.wav|いいんじゃないでしょうか?
397
+ wav/nen312_003.wav|どうかしましたか?ほしな君もまだ来てないようですし、気になっていたんですが
398
+ wav/nen312_014.wav|いえ、私も何も聞いていませんが
399
+ wav/nen314_002.wav|ありがとうございます
400
+ wav/nen314_016.wav|魔女を常に見張る者が多いそうです、心当たりはありませんか?
401
+ wav/nen314_027.wav|すると心を強引に削り取った痕がみつかったんです!
402
+ wav/nen315_002.wav|え、ええ
403
+ wav/nen315_013.wav|はい、ですがこの場合、欠片は犯人から奪い返せばいいんです
404
+ wav/nen315_024.wav|見つけ出すだけでも、なかなか骨が折れそうですが
405
+ wav/nen316_003.wav|別のアルプがいるなら、匂いでわかるというのですが
406
+ wav/nen317_008.wav|いえ、厚真さんが預かっていた子犬も、行方がわからなくなっているのを思い出したんですが
407
+ wav/nen319_005.wav|人間に見えても、ぼんやりしないでしっかり警戒を
408
+ wav/nen401_006.wav|ふー……ふー………………はぁ、美味しい
409
+ wav/nen402_007.wav|はい
410
+ wav/nen402_020.wav|ちょっと思いつきませんね
411
+ wav/nen404_003.wav|もし何かあるなら休んでくれてもいいんですよ?
412
+ wav/nen404_014.wav|私に対する罪悪感といいますか、義務感と言いますか……それはきっと同情に近い感情ですから……
413
+ wav/nen405_002.wav|ほしな君。ああいうのは、どうかと思います
414
+ wav/nen405_013.wav|はい、何ですか?
415
+ wav/nen405_024.wav|いえ、今日は仕方ありませんよ。相談だけじゃなく、占いを希望する人も来ませんでしたからね
416
+ wav/nen406_010.wav|欠片が戻ってきたのは、ほしな君がとがくし先輩とお付き合いをするようになったからだと思うんです
417
+ wav/nen406_021.wav|それに……これはあくまで、責めるつもりではなく、色んな人の相談を受けて思った個人的な意見なんですが
418
+ wav/nen409_003.wav|あ、ほしな君
419
+ wav/nen409_014.wav|魔力の塊をぶつけることで、多少のショックを与えるかもしれないそうですが、先輩の心にひどい影響を与えるものじゃないそうです
420
+ wav/nen409_025.wav|私は、この弾丸を撃てばいいわけですね
421
+ wav/nen409_038.wav|それでは
422
+ wav/nen410_010.wav|それは、ほしな君がオカ研で頑張ってくれた分で相殺です。実際、今のこの欠片の量は、私がほしな君と出会う前より、ほんの少し少ないだけですから
423
+ wav/nen410_022.wav|学院内ではあれほどダメだって言ってるじゃないですか
424
+ wav/nen504_001.wav|ほしな君、調子はどうですか?
425
+ wav/nen505_008.wav|えっと……こ、ここは、励まし会とか開いた方がいいんでしょうか?
426
+ wav/nen507_009.wav|なのに、部活を続けたりしたら、擦れ違いですとか、そういうことが心配になって
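
Note: the entries above continue a "wav_path|transcript" filelist of the kind pointed to by training_files/validation_files in hparams.py further down. A minimal parsing sketch; the path 'filelists/transcript_train.txt' is taken from hparams.py and the helper name is illustrative, not part of this commit:

def read_filelist(path):
    # Each line is "relative/wav/path.wav|transcript"; split on the first '|' only.
    pairs = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                wav_path, text = line.split('|', 1)
                pairs.append((wav_path, text))
    return pairs

# e.g. pairs = read_filelist('filelists/transcript_train.txt')
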
hifiutils.py ADDED
@@ -0,0 +1,58 @@
1
+ import glob
2
+ import os
3
+ import matplotlib
4
+ import torch
5
+ from torch.nn.utils import weight_norm
6
+ matplotlib.use("Agg")
7
+ import matplotlib.pylab as plt
8
+
9
+
10
+ def plot_spectrogram(spectrogram):
11
+ fig, ax = plt.subplots(figsize=(10, 2))
12
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
13
+ interpolation='none')
14
+ plt.colorbar(im, ax=ax)
15
+
16
+ fig.canvas.draw()
17
+ plt.close()
18
+
19
+ return fig
20
+
21
+
22
+ def init_weights(m, mean=0.0, std=0.01):
23
+ classname = m.__class__.__name__
24
+ if classname.find("Conv") != -1:
25
+ m.weight.data.normal_(mean, std)
26
+
27
+
28
+ def apply_weight_norm(m):
29
+ classname = m.__class__.__name__
30
+ if classname.find("Conv") != -1:
31
+ weight_norm(m)
32
+
33
+
34
+ def get_padding(kernel_size, dilation=1):
35
+ return int((kernel_size*dilation - dilation)/2)
36
+
37
+
38
+ def load_checkpoint(filepath, device):
39
+ assert os.path.isfile(filepath)
40
+ print("Loading '{}'".format(filepath))
41
+ checkpoint_dict = torch.load(filepath, map_location=device)
42
+ print("Complete.")
43
+ return checkpoint_dict
44
+
45
+
46
+ def save_checkpoint(filepath, obj):
47
+ print("Saving checkpoint to {}".format(filepath))
48
+ torch.save(obj, filepath)
49
+ print("Complete.")
50
+
51
+
52
+ def scan_checkpoint(cp_dir, prefix):
53
+ pattern = os.path.join(cp_dir, prefix + '????????')
54
+ cp_list = glob.glob(pattern)
55
+ if len(cp_list) == 0:
56
+ return None
57
+ return sorted(cp_list)[-1]
58
+
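
For orientation, a minimal sketch of how the checkpoint helpers in hifiutils.py are typically used; the checkpoint directory, prefix, and the generator object are assumptions, not part of this commit:

import torch
from hifiutils import scan_checkpoint, load_checkpoint

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cp_g = scan_checkpoint('cp_hifigan', 'g_')        # assumed dir/prefix; matches files like cp_hifigan/g_00001000
if cp_g is not None:
    state_dict_g = load_checkpoint(cp_g, device)  # returns whatever dict was passed to save_checkpoint
    # generator.load_state_dict(state_dict_g['generator'])  # assuming a HiFi-GAN Generator and that key
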
hparams (1).py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+ from text import symbols
3
+
4
+
5
+ class create_hparams():
6
+ """Create model hyperparameters. Parse nondefault from given string."""
7
+ ################################
8
+ # CUDA Enable #
9
+ ################################
10
+ if torch.cuda.is_available() :
11
+ cuda_enabled = True
12
+ else :
13
+ cuda_enabled = False
14
+
15
+ ################################
16
+ # Experiment Parameters #
17
+ ################################
18
+ epochs = 100
19
+ iters_per_checkpoint = 500
20
+ seed= 1234
21
+ dynamic_loss_scaling = True
22
+ fp16_run = False
23
+ distributed_run = False
24
+ dist_backend = "nccl"
25
+ dist_url = "tcp://localhost:54321"
26
+ cudnn_enabled = True
27
+ cudnn_benchmark = False
28
+ ignore_layers = ['embedding.weight']
29
+
30
+ ################################
31
+ # Data Parameters #
32
+ ################################
33
+ load_mel_from_disk = False
34
+ training_files = 'filelists/transcript_train.txt'
35
+ validation_files = 'filelists/transcript_val.txt'
36
+ text_cleaners = ['japanese_cleaners']
37
+
38
+ ################################
39
+ # Audio Parameters #
40
+ ################################
41
+ max_wav_value = 32768.0
42
+ sampling_rate = 22050
43
+ filter_length = 1024
44
+ hop_length = 256
45
+ win_length = 1024
46
+ n_mel_channels = 80
47
+ mel_fmin = 0.0
48
+ mel_fmax = 8000.0
49
+
50
+ ################################
51
+ # Model Parameters #
52
+ ################################
53
+ n_symbols = len(symbols)
54
+ symbols_embedding_dim = 512
55
+
56
+ # Encoder parameters
57
+ encoder_kernel_size = 5
58
+ encoder_n_convolutions = 3
59
+ encoder_embedding_dim = 512
60
+
61
+ # Decoder parameters
62
+ n_frames_per_step = 1 # currently only 1 is supported
63
+ decoder_rnn_dim = 1024
64
+ prenet_dim = 256
65
+ max_decoder_steps = 1000
66
+ gate_threshold = 0.5
67
+ p_attention_dropout = 0.1
68
+ p_decoder_dropout = 0.1
69
+
70
+ # Attention parameters
71
+ attention_rnn_dim = 1024
72
+ attention_dim = 128
73
+ # Location Layer parameters
74
+ attention_location_n_filters = 32
75
+ attention_location_kernel_size = 31
76
+
77
+ # Mel-post processing network parameters
78
+ postnet_embedding_dim = 512
79
+ postnet_kernel_size = 5
80
+ postnet_n_convolutions = 5
81
+
82
+ ################################
83
+ # Optimization Hyperparameters #
84
+ ################################
85
+ use_saved_learning_rate = False
86
+ learning_rate = 1e-3
87
+ weight_decay = 1e-6
88
+ grad_clip_thresh = 1.0
89
+ batch_size = 64
90
+ mask_padding = True # set model's padded outputs to padded values
91
+
92
+
93
+
94
+
hparams.py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+ from text import symbols
3
+
4
+
5
+ class create_hparams():
6
+ """Create model hyperparameters. Parse nondefault from given string."""
7
+ ################################
8
+ # CUDA Enable #
9
+ ################################
10
+ if torch.cuda.is_available() :
11
+ cuda_enabled = True
12
+ else :
13
+ cuda_enabled = False
14
+
15
+ ################################
16
+ # Experiment Parameters #
17
+ ################################
18
+ epochs = 100
19
+ iters_per_checkpoint = 500
20
+ seed= 1234
21
+ dynamic_loss_scaling = True
22
+ fp16_run = False
23
+ distributed_run = False
24
+ dist_backend = "nccl"
25
+ dist_url = "tcp://localhost:54321"
26
+ cudnn_enabled = True
27
+ cudnn_benchmark = False
28
+ ignore_layers = ['embedding.weight']
29
+
30
+ ################################
31
+ # Data Parameters #
32
+ ################################
33
+ load_mel_from_disk = False
34
+ training_files = 'filelists/transcript_train.txt'
35
+ validation_files = 'filelists/transcript_val.txt'
36
+ text_cleaners = ['japanese_cleaners']
37
+
38
+ ################################
39
+ # Audio Parameters #
40
+ ################################
41
+ max_wav_value = 32768.0
42
+ sampling_rate = 22050
43
+ filter_length = 1024
44
+ hop_length = 256
45
+ win_length = 1024
46
+ n_mel_channels = 80
47
+ mel_fmin = 0.0
48
+ mel_fmax = 8000.0
49
+
50
+ ################################
51
+ # Model Parameters #
52
+ ################################
53
+ n_symbols = len(symbols)
54
+ symbols_embedding_dim = 512
55
+
56
+ # Encoder parameters
57
+ encoder_kernel_size = 5
58
+ encoder_n_convolutions = 3
59
+ encoder_embedding_dim = 512
60
+
61
+ # Decoder parameters
62
+ n_frames_per_step = 1 # currently only 1 is supported
63
+ decoder_rnn_dim = 1024
64
+ prenet_dim = 256
65
+ max_decoder_steps = 1000
66
+ gate_threshold = 0.5
67
+ p_attention_dropout = 0.1
68
+ p_decoder_dropout = 0.1
69
+
70
+ # Attention parameters
71
+ attention_rnn_dim = 1024
72
+ attention_dim = 128
73
+ # Location Layer parameters
74
+ attention_location_n_filters = 32
75
+ attention_location_kernel_size = 31
76
+
77
+ # Mel-post processing network parameters
78
+ postnet_embedding_dim = 512
79
+ postnet_kernel_size = 5
80
+ postnet_n_convolutions = 5
81
+
82
+ ################################
83
+ # Optimization Hyperparameters #
84
+ ################################
85
+ use_saved_learning_rate = False
86
+ learning_rate = 1e-3
87
+ weight_decay = 1e-6
88
+ grad_clip_thresh = 1.0
89
+ batch_size = 64
90
+ mask_padding = True # set model's padded outputs to padded values
91
+
92
+
93
+
94
+
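
A minimal usage sketch for create_hparams as defined above; note it is written as a class whose attributes are the hyperparameters, so instantiating it simply exposes them (this assumes the repo's text/ package is importable, since n_symbols is computed from text.symbols at import time):

from hparams import create_hparams

hparams = create_hparams()
print(hparams.sampling_rate)   # 22050
print(hparams.n_mel_channels)  # 80
print(hparams.text_cleaners)   # ['japanese_cleaners']
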
inference.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
layers.py ADDED
@@ -0,0 +1,80 @@
1
+ import torch
2
+ from librosa.filters import mel as librosa_mel_fn
3
+ from audio_processing import dynamic_range_compression
4
+ from audio_processing import dynamic_range_decompression
5
+ from stft import STFT
6
+
7
+
8
+ class LinearNorm(torch.nn.Module):
9
+ def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
10
+ super(LinearNorm, self).__init__()
11
+ self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)
12
+
13
+ torch.nn.init.xavier_uniform_(
14
+ self.linear_layer.weight,
15
+ gain=torch.nn.init.calculate_gain(w_init_gain))
16
+
17
+ def forward(self, x):
18
+ return self.linear_layer(x)
19
+
20
+
21
+ class ConvNorm(torch.nn.Module):
22
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
23
+ padding=None, dilation=1, bias=True, w_init_gain='linear'):
24
+ super(ConvNorm, self).__init__()
25
+ if padding is None:
26
+ assert(kernel_size % 2 == 1)
27
+ padding = int(dilation * (kernel_size - 1) / 2)
28
+
29
+ self.conv = torch.nn.Conv1d(in_channels, out_channels,
30
+ kernel_size=kernel_size, stride=stride,
31
+ padding=padding, dilation=dilation,
32
+ bias=bias)
33
+
34
+ torch.nn.init.xavier_uniform_(
35
+ self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))
36
+
37
+ def forward(self, signal):
38
+ conv_signal = self.conv(signal)
39
+ return conv_signal
40
+
41
+
42
+ class TacotronSTFT(torch.nn.Module):
43
+ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
44
+ n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
45
+ mel_fmax=8000.0):
46
+ super(TacotronSTFT, self).__init__()
47
+ self.n_mel_channels = n_mel_channels
48
+ self.sampling_rate = sampling_rate
49
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
50
+ mel_basis = librosa_mel_fn(
51
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
52
+ mel_basis = torch.from_numpy(mel_basis).float()
53
+ self.register_buffer('mel_basis', mel_basis)
54
+
55
+ def spectral_normalize(self, magnitudes):
56
+ output = dynamic_range_compression(magnitudes)
57
+ return output
58
+
59
+ def spectral_de_normalize(self, magnitudes):
60
+ output = dynamic_range_decompression(magnitudes)
61
+ return output
62
+
63
+ def mel_spectrogram(self, y):
64
+ """Computes mel-spectrograms from a batch of waves
65
+ PARAMS
66
+ ------
67
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
68
+
69
+ RETURNS
70
+ -------
71
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
72
+ """
73
+ assert(torch.min(y.data) >= -1)
74
+ assert(torch.max(y.data) <= 1)
75
+
76
+ magnitudes, phases = self.stft_fn.transform(y)
77
+ magnitudes = magnitudes.data
78
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
79
+ mel_output = self.spectral_normalize(mel_output)
80
+ return mel_output
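
A minimal sketch of computing a mel-spectrogram with the TacotronSTFT module above, assuming a batch of mono waveforms already scaled to [-1, 1] and that the repo's stft.py and audio_processing.py modules (imported at the top of layers.py) are present; the positional librosa_mel_fn call above also implies an older librosa (< 0.10):

import torch
from layers import TacotronSTFT

stft = TacotronSTFT(filter_length=1024, hop_length=256, win_length=1024,
                    n_mel_channels=80, sampling_rate=22050,
                    mel_fmin=0.0, mel_fmax=8000.0)

wav = torch.randn(1, 22050).clamp(-1.0, 1.0)  # placeholder 1-second batch, values in [-1, 1]
mel = stft.mel_spectrogram(wav)               # -> (1, 80, T) log-compressed mel
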
logger.py ADDED
@@ -0,0 +1,48 @@
1
+ import random
2
+ import torch
3
+ from torch.utils.tensorboard import SummaryWriter
4
+ from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
5
+ from plotting_utils import plot_gate_outputs_to_numpy
6
+
7
+
8
+ class Tacotron2Logger(SummaryWriter):
9
+ def __init__(self, logdir):
10
+ super(Tacotron2Logger, self).__init__(logdir)
11
+
12
+ def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
13
+ iteration):
14
+ self.add_scalar("training.loss", reduced_loss, iteration)
15
+ self.add_scalar("grad.norm", grad_norm, iteration)
16
+ self.add_scalar("learning.rate", learning_rate, iteration)
17
+ self.add_scalar("duration", duration, iteration)
18
+
19
+ def log_validation(self, reduced_loss, model, y, y_pred, iteration):
20
+ self.add_scalar("validation.loss", reduced_loss, iteration)
21
+ _, mel_outputs, gate_outputs, alignments = y_pred
22
+ mel_targets, gate_targets = y
23
+
24
+ # plot distribution of parameters
25
+ for tag, value in model.named_parameters():
26
+ tag = tag.replace('.', '/')
27
+ self.add_histogram(tag, value.data.cpu().numpy(), iteration)
28
+
29
+ # plot alignment, mel target and predicted, gate target and predicted
30
+ idx = random.randint(0, alignments.size(0) - 1)
31
+ self.add_image(
32
+ "alignment",
33
+ plot_alignment_to_numpy(alignments[idx].data.cpu().numpy().T),
34
+ iteration, dataformats='HWC')
35
+ self.add_image(
36
+ "mel_target",
37
+ plot_spectrogram_to_numpy(mel_targets[idx].data.cpu().numpy()),
38
+ iteration, dataformats='HWC')
39
+ self.add_image(
40
+ "mel_predicted",
41
+ plot_spectrogram_to_numpy(mel_outputs[idx].data.cpu().numpy()),
42
+ iteration, dataformats='HWC')
43
+ self.add_image(
44
+ "gate",
45
+ plot_gate_outputs_to_numpy(
46
+ gate_targets[idx].data.cpu().numpy(),
47
+ torch.sigmoid(gate_outputs[idx]).data.cpu().numpy()),
48
+ iteration, dataformats='HWC')
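
A sketch of how Tacotron2Logger would be wired into a training loop; the log directory and loop variables are placeholders, not defined in this commit:

from logger import Tacotron2Logger

logger = Tacotron2Logger('outdir/logs')  # assumed TensorBoard log directory
# inside the training loop:
#   logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration)
# after each validation pass:
#   logger.log_validation(val_loss, model, y, y_pred, iteration)
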
loss_function.py ADDED
@@ -0,0 +1,19 @@
1
+ from torch import nn
2
+
3
+
4
+ class Tacotron2Loss(nn.Module):
5
+ def __init__(self):
6
+ super(Tacotron2Loss, self).__init__()
7
+
8
+ def forward(self, model_output, targets):
9
+ mel_target, gate_target = targets[0], targets[1]
10
+ mel_target.requires_grad = False
11
+ gate_target.requires_grad = False
12
+ gate_target = gate_target.view(-1, 1)
13
+
14
+ mel_out, mel_out_postnet, gate_out, _ = model_output
15
+ gate_out = gate_out.view(-1, 1)
16
+ mel_loss = nn.MSELoss()(mel_out, mel_target) + \
17
+ nn.MSELoss()(mel_out_postnet, mel_target)
18
+ gate_loss = nn.BCEWithLogitsLoss()(gate_out, gate_target)
19
+ return mel_loss + gate_loss
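
A minimal sketch of applying Tacotron2Loss during training, assuming `model` is the Tacotron2 module from model.py below and `batch` comes from a DataLoader built for it:

from loss_function import Tacotron2Loss

criterion = Tacotron2Loss()
# x, y = model.parse_batch(batch)   # x = inputs, y = (mel_padded, gate_padded)
# y_pred = model(x)                 # (mel_out, mel_out_postnet, gate_out, alignments)
# loss = criterion(y_pred, y)
# loss.backward()
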
loss_scaler.py ADDED
@@ -0,0 +1,131 @@
1
+ import torch
2
+
3
+ class LossScaler:
4
+
5
+ def __init__(self, scale=1):
6
+ self.cur_scale = scale
7
+
8
+ # `params` is a list / generator of torch.Variable
9
+ def has_overflow(self, params):
10
+ return False
11
+
12
+ # `x` is a torch.Tensor
13
+ def _has_inf_or_nan(x):
14
+ return False
15
+
16
+ # `overflow` is boolean indicating whether we overflowed in gradient
17
+ def update_scale(self, overflow):
18
+ pass
19
+
20
+ @property
21
+ def loss_scale(self):
22
+ return self.cur_scale
23
+
24
+ def scale_gradient(self, module, grad_in, grad_out):
25
+ return tuple(self.loss_scale * g for g in grad_in)
26
+
27
+ def backward(self, loss):
28
+ scaled_loss = loss*self.loss_scale
29
+ scaled_loss.backward()
30
+
31
+ class DynamicLossScaler:
32
+
33
+ def __init__(self,
34
+ init_scale=2**32,
35
+ scale_factor=2.,
36
+ scale_window=1000):
37
+ self.cur_scale = init_scale
38
+ self.cur_iter = 0
39
+ self.last_overflow_iter = -1
40
+ self.scale_factor = scale_factor
41
+ self.scale_window = scale_window
42
+
43
+ # `params` is a list / generator of torch.Variable
44
+ def has_overflow(self, params):
45
+ # return False
46
+ for p in params:
47
+ if p.grad is not None and DynamicLossScaler._has_inf_or_nan(p.grad.data):
48
+ return True
49
+
50
+ return False
51
+
52
+ # `x` is a torch.Tensor
53
+ def _has_inf_or_nan(x):
54
+ cpu_sum = float(x.float().sum())
55
+ if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
56
+ return True
57
+ return False
58
+
59
+ # `overflow` is boolean indicating whether we overflowed in gradient
60
+ def update_scale(self, overflow):
61
+ if overflow:
62
+ #self.cur_scale /= self.scale_factor
63
+ self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
64
+ self.last_overflow_iter = self.cur_iter
65
+ else:
66
+ if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
67
+ self.cur_scale *= self.scale_factor
68
+ # self.cur_scale = 1
69
+ self.cur_iter += 1
70
+
71
+ @property
72
+ def loss_scale(self):
73
+ return self.cur_scale
74
+
75
+ def scale_gradient(self, module, grad_in, grad_out):
76
+ return tuple(self.loss_scale * g for g in grad_in)
77
+
78
+ def backward(self, loss):
79
+ scaled_loss = loss*self.loss_scale
80
+ scaled_loss.backward()
81
+
82
+ ##############################################################
83
+ # Example usage below here -- assuming it's in a separate file
84
+ ##############################################################
85
+ if __name__ == "__main__":
86
+ import torch
87
+ from torch.autograd import Variable
88
+ from dynamic_loss_scaler import DynamicLossScaler
89
+
90
+ # N is batch size; D_in is input dimension;
91
+ # H is hidden dimension; D_out is output dimension.
92
+ N, D_in, H, D_out = 64, 1000, 100, 10
93
+
94
+ # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
95
+ x = Variable(torch.randn(N, D_in), requires_grad=False)
96
+ y = Variable(torch.randn(N, D_out), requires_grad=False)
97
+
98
+ w1 = Variable(torch.randn(D_in, H), requires_grad=True)
99
+ w2 = Variable(torch.randn(H, D_out), requires_grad=True)
100
+ parameters = [w1, w2]
101
+
102
+ learning_rate = 1e-6
103
+ optimizer = torch.optim.SGD(parameters, lr=learning_rate)
104
+ loss_scaler = DynamicLossScaler()
105
+
106
+ for t in range(500):
107
+ y_pred = x.mm(w1).clamp(min=0).mm(w2)
108
+ loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
109
+ print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
110
+ print('Iter {} scaled loss: {}'.format(t, loss.item()))
111
+ print('Iter {} unscaled loss: {}'.format(t, loss.item() / loss_scaler.loss_scale))
112
+
113
+ # Run backprop
114
+ optimizer.zero_grad()
115
+ loss.backward()
116
+
117
+ # Check for overflow
118
+ has_overflow = loss_scaler.has_overflow(parameters)
119
+
120
+ # If no overflow, unscale grad and update as usual
121
+ if not has_overflow:
122
+ for param in parameters:
123
+ param.grad.data.mul_(1. / loss_scaler.loss_scale)
124
+ optimizer.step()
125
+ # Otherwise, don't do anything -- ie, skip iteration
126
+ else:
127
+ print('OVERFLOW!')
128
+
129
+ # Update loss scale for next iteration
130
+ loss_scaler.update_scale(has_overflow)
131
+
meldataset.py ADDED
@@ -0,0 +1,168 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import torch
5
+ import torch.utils.data
6
+ import numpy as np
7
+ from librosa.util import normalize
8
+ from scipy.io.wavfile import read
9
+ from librosa.filters import mel as librosa_mel_fn
10
+
11
+ MAX_WAV_VALUE = 32768.0
12
+
13
+
14
+ def load_wav(full_path):
15
+ sampling_rate, data = read(full_path)
16
+ return data, sampling_rate
17
+
18
+
19
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
20
+ return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
21
+
22
+
23
+ def dynamic_range_decompression(x, C=1):
24
+ return np.exp(x) / C
25
+
26
+
27
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
28
+ return torch.log(torch.clamp(x, min=clip_val) * C)
29
+
30
+
31
+ def dynamic_range_decompression_torch(x, C=1):
32
+ return torch.exp(x) / C
33
+
34
+
35
+ def spectral_normalize_torch(magnitudes):
36
+ output = dynamic_range_compression_torch(magnitudes)
37
+ return output
38
+
39
+
40
+ def spectral_de_normalize_torch(magnitudes):
41
+ output = dynamic_range_decompression_torch(magnitudes)
42
+ return output
43
+
44
+
45
+ mel_basis = {}
46
+ hann_window = {}
47
+
48
+
49
+ def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
50
+ if torch.min(y) < -1.:
51
+ print('min value is ', torch.min(y))
52
+ if torch.max(y) > 1.:
53
+ print('max value is ', torch.max(y))
54
+
55
+ global mel_basis, hann_window
56
+ if str(fmax)+'_'+str(y.device) not in mel_basis:
57
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
58
+ mel_basis[str(fmax)+'_'+str(y.device)] = torch.from_numpy(mel).float().to(y.device)
59
+ hann_window[str(y.device)] = torch.hann_window(win_size).to(y.device)
60
+
61
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
62
+ y = y.squeeze(1)
63
+
64
+ spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[str(y.device)],
65
+ center=center, pad_mode='reflect', normalized=False, onesided=True)
66
+
67
+ spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))
68
+
69
+ spec = torch.matmul(mel_basis[str(fmax)+'_'+str(y.device)], spec)
70
+ spec = spectral_normalize_torch(spec)
71
+
72
+ return spec
73
+
74
+
75
+ def get_dataset_filelist(a):
76
+ with open(a.input_training_file, 'r', encoding='utf-8') as fi:
77
+ training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0])
78
+ for x in fi.read().split('\n') if len(x) > 0]
79
+
80
+ with open(a.input_validation_file, 'r', encoding='utf-8') as fi:
81
+ validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0])
82
+ for x in fi.read().split('\n') if len(x) > 0]
83
+ return training_files, validation_files
84
+
85
+
86
+ class MelDataset(torch.utils.data.Dataset):
87
+ def __init__(self, training_files, segment_size, n_fft, num_mels,
88
+ hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
89
+ device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
90
+ self.audio_files = training_files
91
+ random.seed(1234)
92
+ if shuffle:
93
+ random.shuffle(self.audio_files)
94
+ self.segment_size = segment_size
95
+ self.sampling_rate = sampling_rate
96
+ self.split = split
97
+ self.n_fft = n_fft
98
+ self.num_mels = num_mels
99
+ self.hop_size = hop_size
100
+ self.win_size = win_size
101
+ self.fmin = fmin
102
+ self.fmax = fmax
103
+ self.fmax_loss = fmax_loss
104
+ self.cached_wav = None
105
+ self.n_cache_reuse = n_cache_reuse
106
+ self._cache_ref_count = 0
107
+ self.device = device
108
+ self.fine_tuning = fine_tuning
109
+ self.base_mels_path = base_mels_path
110
+
111
+ def __getitem__(self, index):
112
+ filename = self.audio_files[index]
113
+ if self._cache_ref_count == 0:
114
+ audio, sampling_rate = load_wav(filename)
115
+ audio = audio / MAX_WAV_VALUE
116
+ if not self.fine_tuning:
117
+ audio = normalize(audio) * 0.95
118
+ self.cached_wav = audio
119
+ if sampling_rate != self.sampling_rate:
120
+ raise ValueError("{} SR doesn't match target {} SR".format(
121
+ sampling_rate, self.sampling_rate))
122
+ self._cache_ref_count = self.n_cache_reuse
123
+ else:
124
+ audio = self.cached_wav
125
+ self._cache_ref_count -= 1
126
+
127
+ audio = torch.FloatTensor(audio)
128
+ audio = audio.unsqueeze(0)
129
+
130
+ if not self.fine_tuning:
131
+ if self.split:
132
+ if audio.size(1) >= self.segment_size:
133
+ max_audio_start = audio.size(1) - self.segment_size
134
+ audio_start = random.randint(0, max_audio_start)
135
+ audio = audio[:, audio_start:audio_start+self.segment_size]
136
+ else:
137
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
138
+
139
+ mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
140
+ self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
141
+ center=False)
142
+ else:
143
+ mel = np.load(
144
+ os.path.join(self.base_mels_path, os.path.splitext(filename)[0] + '.npy'))
145
+ mel = torch.from_numpy(mel)
146
+
147
+ if len(mel.shape) < 3:
148
+ mel = mel.unsqueeze(0)
149
+
150
+ if self.split:
151
+ frames_per_seg = math.ceil(self.segment_size / self.hop_size)
152
+
153
+ if audio.size(1) >= self.segment_size:
154
+ mel_start = random.randint(0, mel.size(2) - frames_per_seg - 1)
155
+ mel = mel[:, :, mel_start:mel_start + frames_per_seg]
156
+ audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
157
+ else:
158
+ mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
159
+ audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')
160
+
161
+ mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
162
+ self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
163
+ center=False)
164
+
165
+ return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
166
+
167
+ def __len__(self):
168
+ return len(self.audio_files)
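
A minimal sketch of building this dataset and a DataLoader for HiFi-GAN style training; segment_size=8192 and the single wav path are illustrative assumptions (the path is taken from the filelist above), while the audio settings mirror hparams.py:

import torch
from meldataset import MelDataset

training_files = ['wav/nen104_239.wav']  # normally produced by get_dataset_filelist(args)

trainset = MelDataset(training_files, segment_size=8192, n_fft=1024, num_mels=80,
                      hop_size=256, win_size=1024, sampling_rate=22050,
                      fmin=0.0, fmax=8000.0, fmax_loss=None)
loader = torch.utils.data.DataLoader(trainset, batch_size=16, shuffle=False, num_workers=0)
# each item: (mel, audio, filename, mel_for_loss)
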
model.py ADDED
@@ -0,0 +1,529 @@
1
+ from math import sqrt
2
+ import torch
3
+ from torch.autograd import Variable
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from layers import ConvNorm, LinearNorm
7
+ from utils import to_gpu, get_mask_from_lengths
8
+
9
+
10
+ class LocationLayer(nn.Module):
11
+ def __init__(self, attention_n_filters, attention_kernel_size,
12
+ attention_dim):
13
+ super(LocationLayer, self).__init__()
14
+ padding = int((attention_kernel_size - 1) / 2)
15
+ self.location_conv = ConvNorm(2, attention_n_filters,
16
+ kernel_size=attention_kernel_size,
17
+ padding=padding, bias=False, stride=1,
18
+ dilation=1)
19
+ self.location_dense = LinearNorm(attention_n_filters, attention_dim,
20
+ bias=False, w_init_gain='tanh')
21
+
22
+ def forward(self, attention_weights_cat):
23
+ processed_attention = self.location_conv(attention_weights_cat)
24
+ processed_attention = processed_attention.transpose(1, 2)
25
+ processed_attention = self.location_dense(processed_attention)
26
+ return processed_attention
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
31
+ attention_location_n_filters, attention_location_kernel_size):
32
+ super(Attention, self).__init__()
33
+ self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
34
+ bias=False, w_init_gain='tanh')
35
+ self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
36
+ w_init_gain='tanh')
37
+ self.v = LinearNorm(attention_dim, 1, bias=False)
38
+ self.location_layer = LocationLayer(attention_location_n_filters,
39
+ attention_location_kernel_size,
40
+ attention_dim)
41
+ self.score_mask_value = -float("inf")
42
+
43
+ def get_alignment_energies(self, query, processed_memory,
44
+ attention_weights_cat):
45
+ """
46
+ PARAMS
47
+ ------
48
+ query: decoder output (batch, n_mel_channels * n_frames_per_step)
49
+ processed_memory: processed encoder outputs (B, T_in, attention_dim)
50
+ attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)
51
+
52
+ RETURNS
53
+ -------
54
+ alignment (batch, max_time)
55
+ """
56
+
57
+ processed_query = self.query_layer(query.unsqueeze(1))
58
+ processed_attention_weights = self.location_layer(attention_weights_cat)
59
+ energies = self.v(torch.tanh(
60
+ processed_query + processed_attention_weights + processed_memory))
61
+
62
+ energies = energies.squeeze(-1)
63
+ return energies
64
+
65
+ def forward(self, attention_hidden_state, memory, processed_memory,
66
+ attention_weights_cat, mask):
67
+ """
68
+ PARAMS
69
+ ------
70
+ attention_hidden_state: attention rnn last output
71
+ memory: encoder outputs
72
+ processed_memory: processed encoder outputs
73
+ attention_weights_cat: previous and cummulative attention weights
74
+ mask: binary mask for padded data
75
+ """
76
+ alignment = self.get_alignment_energies(
77
+ attention_hidden_state, processed_memory, attention_weights_cat)
78
+
79
+ if mask is not None:
80
+ alignment.data.masked_fill_(mask, self.score_mask_value)
81
+
82
+ attention_weights = F.softmax(alignment, dim=1)
83
+ attention_context = torch.bmm(attention_weights.unsqueeze(1), memory)
84
+ attention_context = attention_context.squeeze(1)
85
+
86
+ return attention_context, attention_weights
87
+
88
+
89
+ class Prenet(nn.Module):
90
+ def __init__(self, in_dim, sizes):
91
+ super(Prenet, self).__init__()
92
+ in_sizes = [in_dim] + sizes[:-1]
93
+ self.layers = nn.ModuleList(
94
+ [LinearNorm(in_size, out_size, bias=False)
95
+ for (in_size, out_size) in zip(in_sizes, sizes)])
96
+
97
+ def forward(self, x):
98
+ for linear in self.layers:
99
+ x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
100
+ return x
101
+
102
+
103
+ class Postnet(nn.Module):
104
+ """Postnet
105
+ - Five 1-d convolution with 512 channels and kernel size 5
106
+ """
107
+
108
+ def __init__(self, hparams):
109
+ super(Postnet, self).__init__()
110
+ self.convolutions = nn.ModuleList()
111
+
112
+ self.convolutions.append(
113
+ nn.Sequential(
114
+ ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
115
+ kernel_size=hparams.postnet_kernel_size, stride=1,
116
+ padding=int((hparams.postnet_kernel_size - 1) / 2),
117
+ dilation=1, w_init_gain='tanh'),
118
+ nn.BatchNorm1d(hparams.postnet_embedding_dim))
119
+ )
120
+
121
+ for i in range(1, hparams.postnet_n_convolutions - 1):
122
+ self.convolutions.append(
123
+ nn.Sequential(
124
+ ConvNorm(hparams.postnet_embedding_dim,
125
+ hparams.postnet_embedding_dim,
126
+ kernel_size=hparams.postnet_kernel_size, stride=1,
127
+ padding=int((hparams.postnet_kernel_size - 1) / 2),
128
+ dilation=1, w_init_gain='tanh'),
129
+ nn.BatchNorm1d(hparams.postnet_embedding_dim))
130
+ )
131
+
132
+ self.convolutions.append(
133
+ nn.Sequential(
134
+ ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
135
+ kernel_size=hparams.postnet_kernel_size, stride=1,
136
+ padding=int((hparams.postnet_kernel_size - 1) / 2),
137
+ dilation=1, w_init_gain='linear'),
138
+ nn.BatchNorm1d(hparams.n_mel_channels))
139
+ )
140
+
141
+ def forward(self, x):
142
+ for i in range(len(self.convolutions) - 1):
143
+ x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training)
144
+ x = F.dropout(self.convolutions[-1](x), 0.5, self.training)
145
+
146
+ return x
147
+
148
+
149
+ class Encoder(nn.Module):
150
+ """Encoder module:
151
+ - Three 1-d convolution banks
152
+ - Bidirectional LSTM
153
+ """
154
+ def __init__(self, hparams):
155
+ super(Encoder, self).__init__()
156
+
157
+ convolutions = []
158
+ for _ in range(hparams.encoder_n_convolutions):
159
+ conv_layer = nn.Sequential(
160
+ ConvNorm(hparams.encoder_embedding_dim,
161
+ hparams.encoder_embedding_dim,
162
+ kernel_size=hparams.encoder_kernel_size, stride=1,
163
+ padding=int((hparams.encoder_kernel_size - 1) / 2),
164
+ dilation=1, w_init_gain='relu'),
165
+ nn.BatchNorm1d(hparams.encoder_embedding_dim))
166
+ convolutions.append(conv_layer)
167
+ self.convolutions = nn.ModuleList(convolutions)
168
+
169
+ self.lstm = nn.LSTM(hparams.encoder_embedding_dim,
170
+ int(hparams.encoder_embedding_dim / 2), 1,
171
+ batch_first=True, bidirectional=True)
172
+
173
+ def forward(self, x, input_lengths):
174
+ for conv in self.convolutions:
175
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
176
+
177
+ x = x.transpose(1, 2)
178
+
179
+ # pytorch tensor are not reversible, hence the conversion
180
+ input_lengths = input_lengths.cpu().numpy()
181
+ x = nn.utils.rnn.pack_padded_sequence(
182
+ x, input_lengths, batch_first=True)
183
+
184
+ self.lstm.flatten_parameters()
185
+ outputs, _ = self.lstm(x)
186
+
187
+ outputs, _ = nn.utils.rnn.pad_packed_sequence(
188
+ outputs, batch_first=True)
189
+
190
+ return outputs
191
+
192
+ def inference(self, x):
193
+ for conv in self.convolutions:
194
+ x = F.dropout(F.relu(conv(x)), 0.5, self.training)
195
+
196
+ x = x.transpose(1, 2)
197
+
198
+ self.lstm.flatten_parameters()
199
+ outputs, _ = self.lstm(x)
200
+
201
+ return outputs
202
+
203
+
204
+ class Decoder(nn.Module):
205
+ def __init__(self, hparams):
206
+ super(Decoder, self).__init__()
207
+ self.n_mel_channels = hparams.n_mel_channels
208
+ self.n_frames_per_step = hparams.n_frames_per_step
209
+ self.encoder_embedding_dim = hparams.encoder_embedding_dim
210
+ self.attention_rnn_dim = hparams.attention_rnn_dim
211
+ self.decoder_rnn_dim = hparams.decoder_rnn_dim
212
+ self.prenet_dim = hparams.prenet_dim
213
+ self.max_decoder_steps = hparams.max_decoder_steps
214
+ self.gate_threshold = hparams.gate_threshold
215
+ self.p_attention_dropout = hparams.p_attention_dropout
216
+ self.p_decoder_dropout = hparams.p_decoder_dropout
217
+
218
+ self.prenet = Prenet(
219
+ hparams.n_mel_channels * hparams.n_frames_per_step,
220
+ [hparams.prenet_dim, hparams.prenet_dim])
221
+
222
+ self.attention_rnn = nn.LSTMCell(
223
+ hparams.prenet_dim + hparams.encoder_embedding_dim,
224
+ hparams.attention_rnn_dim)
225
+
226
+ self.attention_layer = Attention(
227
+ hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
228
+ hparams.attention_dim, hparams.attention_location_n_filters,
229
+ hparams.attention_location_kernel_size)
230
+
231
+ self.decoder_rnn = nn.LSTMCell(
232
+ hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
233
+ hparams.decoder_rnn_dim, 1)
234
+
235
+ self.linear_projection = LinearNorm(
236
+ hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
237
+ hparams.n_mel_channels * hparams.n_frames_per_step)
238
+
239
+ self.gate_layer = LinearNorm(
240
+ hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
241
+ bias=True, w_init_gain='sigmoid')
242
+
243
+ def get_go_frame(self, memory):
244
+ """ Gets all zeros frames to use as first decoder input
245
+ PARAMS
246
+ ------
247
+ memory: decoder outputs
248
+
249
+ RETURNS
250
+ -------
251
+ decoder_input: all zeros frames
252
+ """
253
+ B = memory.size(0)
254
+ decoder_input = Variable(memory.data.new(
255
+ B, self.n_mel_channels * self.n_frames_per_step).zero_())
256
+ return decoder_input
257
+
258
+ def initialize_decoder_states(self, memory, mask):
259
+ """ Initializes attention rnn states, decoder rnn states, attention
260
+ weights, attention cumulative weights, attention context, stores memory
261
+ and stores processed memory
262
+ PARAMS
263
+ ------
264
+ memory: Encoder outputs
265
+ mask: Mask for padded data if training, expects None for inference
266
+ """
267
+ B = memory.size(0)
268
+ MAX_TIME = memory.size(1)
269
+
270
+ self.attention_hidden = Variable(memory.data.new(
271
+ B, self.attention_rnn_dim).zero_())
272
+ self.attention_cell = Variable(memory.data.new(
273
+ B, self.attention_rnn_dim).zero_())
274
+
275
+ self.decoder_hidden = Variable(memory.data.new(
276
+ B, self.decoder_rnn_dim).zero_())
277
+ self.decoder_cell = Variable(memory.data.new(
278
+ B, self.decoder_rnn_dim).zero_())
279
+
280
+ self.attention_weights = Variable(memory.data.new(
281
+ B, MAX_TIME).zero_())
282
+ self.attention_weights_cum = Variable(memory.data.new(
283
+ B, MAX_TIME).zero_())
284
+ self.attention_context = Variable(memory.data.new(
285
+ B, self.encoder_embedding_dim).zero_())
286
+
287
+ self.memory = memory
288
+ self.processed_memory = self.attention_layer.memory_layer(memory)
289
+ self.mask = mask
290
+
291
+ def parse_decoder_inputs(self, decoder_inputs):
292
+ """ Prepares decoder inputs, i.e. mel outputs
293
+ PARAMS
294
+ ------
295
+ decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs
296
+
297
+ RETURNS
298
+ -------
299
+ inputs: processed decoder inputs
300
+
301
+ """
302
+ # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
303
+ decoder_inputs = decoder_inputs.transpose(1, 2)
304
+ decoder_inputs = decoder_inputs.view(
305
+ decoder_inputs.size(0),
306
+ int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
307
+ # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
308
+ decoder_inputs = decoder_inputs.transpose(0, 1)
309
+ return decoder_inputs
310
+
311
+ def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
312
+ """ Prepares decoder outputs for output
313
+ PARAMS
314
+ ------
315
+ mel_outputs:
316
+ gate_outputs: gate output energies
317
+ alignments:
318
+
319
+ RETURNS
320
+ -------
321
+ mel_outputs:
322
+ gate_outpust: gate output energies
323
+ alignments:
324
+ """
325
+ # (T_out, B) -> (B, T_out)
326
+ alignments = torch.stack(alignments).transpose(0, 1)
327
+ # (T_out, B) -> (B, T_out)
328
+ gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
329
+ gate_outputs = gate_outputs.contiguous()
330
+ # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
331
+ mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
332
+ # decouple frames per step
333
+ mel_outputs = mel_outputs.view(
334
+ mel_outputs.size(0), -1, self.n_mel_channels)
335
+ # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
336
+ mel_outputs = mel_outputs.transpose(1, 2)
337
+
338
+ return mel_outputs, gate_outputs, alignments
339
+
340
+ def decode(self, decoder_input):
341
+ """ Decoder step using stored states, attention and memory
342
+ PARAMS
343
+ ------
344
+ decoder_input: previous mel output
345
+
346
+ RETURNS
347
+ -------
348
+ mel_output:
349
+ gate_output: gate output energies
350
+ attention_weights:
351
+ """
352
+ cell_input = torch.cat((decoder_input, self.attention_context), -1)
353
+ self.attention_hidden, self.attention_cell = self.attention_rnn(
354
+ cell_input, (self.attention_hidden, self.attention_cell))
355
+ self.attention_hidden = F.dropout(
356
+ self.attention_hidden, self.p_attention_dropout, self.training)
357
+
358
+ attention_weights_cat = torch.cat(
359
+ (self.attention_weights.unsqueeze(1),
360
+ self.attention_weights_cum.unsqueeze(1)), dim=1)
361
+ self.attention_context, self.attention_weights = self.attention_layer(
362
+ self.attention_hidden, self.memory, self.processed_memory,
363
+ attention_weights_cat, self.mask)
364
+
365
+ self.attention_weights_cum += self.attention_weights
366
+ decoder_input = torch.cat(
367
+ (self.attention_hidden, self.attention_context), -1)
368
+ self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
369
+ decoder_input, (self.decoder_hidden, self.decoder_cell))
370
+ self.decoder_hidden = F.dropout(
371
+ self.decoder_hidden, self.p_decoder_dropout, self.training)
372
+
373
+ decoder_hidden_attention_context = torch.cat(
374
+ (self.decoder_hidden, self.attention_context), dim=1)
375
+ decoder_output = self.linear_projection(
376
+ decoder_hidden_attention_context)
377
+
378
+ gate_prediction = self.gate_layer(decoder_hidden_attention_context)
379
+ return decoder_output, gate_prediction, self.attention_weights
380
+
381
+ def forward(self, memory, decoder_inputs, memory_lengths):
382
+ """ Decoder forward pass for training
383
+ PARAMS
384
+ ------
385
+ memory: Encoder outputs
386
+ decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
387
+ memory_lengths: Encoder output lengths for attention masking.
388
+
389
+ RETURNS
390
+ -------
391
+ mel_outputs: mel outputs from the decoder
392
+ gate_outputs: gate outputs from the decoder
393
+ alignments: sequence of attention weights from the decoder
394
+ """
395
+
396
+ decoder_input = self.get_go_frame(memory).unsqueeze(0)
397
+ decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
398
+ decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
399
+ decoder_inputs = self.prenet(decoder_inputs)
400
+
401
+ self.initialize_decoder_states(
402
+ memory, mask=~get_mask_from_lengths(memory_lengths))
403
+
404
+ mel_outputs, gate_outputs, alignments = [], [], []
405
+ while len(mel_outputs) < decoder_inputs.size(0) - 1:
406
+ decoder_input = decoder_inputs[len(mel_outputs)]
407
+ mel_output, gate_output, attention_weights = self.decode(
408
+ decoder_input)
409
+ mel_outputs += [mel_output.squeeze(1)]
410
+ gate_outputs += [gate_output.squeeze(1)]
411
+ alignments += [attention_weights]
412
+
413
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
414
+ mel_outputs, gate_outputs, alignments)
415
+
416
+ return mel_outputs, gate_outputs, alignments
417
+
418
+ def inference(self, memory):
419
+ """ Decoder inference
420
+ PARAMS
421
+ ------
422
+ memory: Encoder outputs
423
+
424
+ RETURNS
425
+ -------
426
+ mel_outputs: mel outputs from the decoder
427
+ gate_outputs: gate outputs from the decoder
428
+ alignments: sequence of attention weights from the decoder
429
+ """
430
+ decoder_input = self.get_go_frame(memory)
431
+
432
+ self.initialize_decoder_states(memory, mask=None)
433
+
434
+ mel_outputs, gate_outputs, alignments = [], [], []
435
+ while True:
436
+ decoder_input = self.prenet(decoder_input)
437
+ mel_output, gate_output, alignment = self.decode(decoder_input)
438
+
439
+ mel_outputs += [mel_output.squeeze(1)]
440
+ gate_outputs += [gate_output]
441
+ alignments += [alignment]
442
+
443
+ if torch.sigmoid(gate_output.data) > self.gate_threshold:
444
+ break
445
+ elif len(mel_outputs) == self.max_decoder_steps:
446
+ print("Warning! Reached max decoder steps")
447
+ break
448
+
449
+ decoder_input = mel_output
450
+
451
+ mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
452
+ mel_outputs, gate_outputs, alignments)
453
+
454
+ return mel_outputs, gate_outputs, alignments
455
+
456
+
457
+ class Tacotron2(nn.Module):
458
+ def __init__(self, hparams):
459
+ super(Tacotron2, self).__init__()
460
+ self.mask_padding = hparams.mask_padding
461
+ self.fp16_run = hparams.fp16_run
462
+ self.n_mel_channels = hparams.n_mel_channels
463
+ self.n_frames_per_step = hparams.n_frames_per_step
464
+ self.embedding = nn.Embedding(
465
+ hparams.n_symbols, hparams.symbols_embedding_dim)
466
+ std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
467
+ val = sqrt(3.0) * std # uniform bounds for std
468
+ self.embedding.weight.data.uniform_(-val, val)
469
+ self.encoder = Encoder(hparams)
470
+ self.decoder = Decoder(hparams)
471
+ self.postnet = Postnet(hparams)
472
+
473
+ def parse_batch(self, batch):
474
+ text_padded, input_lengths, mel_padded, gate_padded, \
475
+ output_lengths = batch
476
+ text_padded = to_gpu(text_padded).long()
477
+ input_lengths = to_gpu(input_lengths).long()
478
+ max_len = torch.max(input_lengths.data).item()
479
+ mel_padded = to_gpu(mel_padded).float()
480
+ gate_padded = to_gpu(gate_padded).float()
481
+ output_lengths = to_gpu(output_lengths).long()
482
+
483
+ return (
484
+ (text_padded, input_lengths, mel_padded, max_len, output_lengths),
485
+ (mel_padded, gate_padded))
486
+
487
+ def parse_output(self, outputs, output_lengths=None):
488
+ if self.mask_padding and output_lengths is not None:
489
+ mask = ~get_mask_from_lengths(output_lengths)
490
+ mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
491
+ mask = mask.permute(1, 0, 2)
492
+
493
+ outputs[0].data.masked_fill_(mask, 0.0)
494
+ outputs[1].data.masked_fill_(mask, 0.0)
495
+ outputs[2].data.masked_fill_(mask[:, 0, :], 1e3) # gate energies
496
+
497
+ return outputs
498
+
499
+ def forward(self, inputs):
500
+ text_inputs, text_lengths, mels, max_len, output_lengths = inputs
501
+ text_lengths, output_lengths = text_lengths.data, output_lengths.data
502
+
503
+ embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
504
+
505
+ encoder_outputs = self.encoder(embedded_inputs, text_lengths)
506
+
507
+ mel_outputs, gate_outputs, alignments = self.decoder(
508
+ encoder_outputs, mels, memory_lengths=text_lengths)
509
+
510
+ mel_outputs_postnet = self.postnet(mel_outputs)
511
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
512
+
513
+ return self.parse_output(
514
+ [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
515
+ output_lengths)
516
+
517
+ def inference(self, inputs):
518
+ embedded_inputs = self.embedding(inputs).transpose(1, 2)
519
+ encoder_outputs = self.encoder.inference(embedded_inputs)
520
+ mel_outputs, gate_outputs, alignments = self.decoder.inference(
521
+ encoder_outputs)
522
+
523
+ mel_outputs_postnet = self.postnet(mel_outputs)
524
+ mel_outputs_postnet = mel_outputs + mel_outputs_postnet
525
+
526
+ outputs = self.parse_output(
527
+ [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])
528
+
529
+ return outputs
models.py ADDED
@@ -0,0 +1,283 @@
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
5
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
6
+ from hifiutils import init_weights, get_padding
7
+
8
+ LRELU_SLOPE = 0.1
9
+
10
+
11
+ class ResBlock1(torch.nn.Module):
12
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
13
+ super(ResBlock1, self).__init__()
14
+ self.h = h
15
+ self.convs1 = nn.ModuleList([
16
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
17
+ padding=get_padding(kernel_size, dilation[0]))),
18
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
19
+ padding=get_padding(kernel_size, dilation[1]))),
20
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
21
+ padding=get_padding(kernel_size, dilation[2])))
22
+ ])
23
+ self.convs1.apply(init_weights)
24
+
25
+ self.convs2 = nn.ModuleList([
26
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
27
+ padding=get_padding(kernel_size, 1))),
28
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
29
+ padding=get_padding(kernel_size, 1))),
30
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
31
+ padding=get_padding(kernel_size, 1)))
32
+ ])
33
+ self.convs2.apply(init_weights)
34
+
35
+ def forward(self, x):
36
+ for c1, c2 in zip(self.convs1, self.convs2):
37
+ xt = F.leaky_relu(x, LRELU_SLOPE)
38
+ xt = c1(xt)
39
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
40
+ xt = c2(xt)
41
+ x = xt + x
42
+ return x
43
+
44
+ def remove_weight_norm(self):
45
+ for l in self.convs1:
46
+ remove_weight_norm(l)
47
+ for l in self.convs2:
48
+ remove_weight_norm(l)
49
+
50
+
51
+ class ResBlock2(torch.nn.Module):
52
+ def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
53
+ super(ResBlock2, self).__init__()
54
+ self.h = h
55
+ self.convs = nn.ModuleList([
56
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
57
+ padding=get_padding(kernel_size, dilation[0]))),
58
+ weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
59
+ padding=get_padding(kernel_size, dilation[1])))
60
+ ])
61
+ self.convs.apply(init_weights)
62
+
63
+ def forward(self, x):
64
+ for c in self.convs:
65
+ xt = F.leaky_relu(x, LRELU_SLOPE)
66
+ xt = c(xt)
67
+ x = xt + x
68
+ return x
69
+
70
+ def remove_weight_norm(self):
71
+ for l in self.convs:
72
+ remove_weight_norm(l)
73
+
74
+
75
+ class Generator(torch.nn.Module):
76
+ def __init__(self, h):
77
+ super(Generator, self).__init__()
78
+ self.h = h
79
+ self.num_kernels = len(h.resblock_kernel_sizes)
80
+ self.num_upsamples = len(h.upsample_rates)
81
+ self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
82
+ resblock = ResBlock1 if h.resblock == '1' else ResBlock2
83
+
84
+ self.ups = nn.ModuleList()
85
+ for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
86
+ self.ups.append(weight_norm(
87
+ ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
88
+ k, u, padding=(k-u)//2)))
89
+
90
+ self.resblocks = nn.ModuleList()
91
+ for i in range(len(self.ups)):
92
+ ch = h.upsample_initial_channel//(2**(i+1))
93
+ for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
94
+ self.resblocks.append(resblock(h, ch, k, d))
95
+
96
+ self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
97
+ self.ups.apply(init_weights)
98
+ self.conv_post.apply(init_weights)
99
+
100
+ def forward(self, x):
101
+ x = self.conv_pre(x)
102
+ for i in range(self.num_upsamples):
103
+ x = F.leaky_relu(x, LRELU_SLOPE)
104
+ x = self.ups[i](x)
105
+ xs = None
106
+ for j in range(self.num_kernels):
107
+ if xs is None:
108
+ xs = self.resblocks[i*self.num_kernels+j](x)
109
+ else:
110
+ xs += self.resblocks[i*self.num_kernels+j](x)
111
+ x = xs / self.num_kernels
112
+ x = F.leaky_relu(x)
113
+ x = self.conv_post(x)
114
+ x = torch.tanh(x)
115
+
116
+ return x
117
+
118
+ def remove_weight_norm(self):
119
+ print('Removing weight norm...')
120
+ for l in self.ups:
121
+ remove_weight_norm(l)
122
+ for l in self.resblocks:
123
+ l.remove_weight_norm()
124
+ remove_weight_norm(self.conv_pre)
125
+ remove_weight_norm(self.conv_post)
126
+
127
+
128
+ class DiscriminatorP(torch.nn.Module):
129
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
130
+ super(DiscriminatorP, self).__init__()
131
+ self.period = period
132
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
133
+ self.convs = nn.ModuleList([
134
+ norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
135
+ norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
136
+ norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
137
+ norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
138
+ norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
139
+ ])
140
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
141
+
142
+ def forward(self, x):
143
+ fmap = []
144
+
145
+ # 1d to 2d
146
+ b, c, t = x.shape
147
+ if t % self.period != 0: # pad first
148
+ n_pad = self.period - (t % self.period)
149
+ x = F.pad(x, (0, n_pad), "reflect")
150
+ t = t + n_pad
151
+ x = x.view(b, c, t // self.period, self.period)
152
+
153
+ for l in self.convs:
154
+ x = l(x)
155
+ x = F.leaky_relu(x, LRELU_SLOPE)
156
+ fmap.append(x)
157
+ x = self.conv_post(x)
158
+ fmap.append(x)
159
+ x = torch.flatten(x, 1, -1)
160
+
161
+ return x, fmap
162
+
163
+
164
+ class MultiPeriodDiscriminator(torch.nn.Module):
165
+ def __init__(self):
166
+ super(MultiPeriodDiscriminator, self).__init__()
167
+ self.discriminators = nn.ModuleList([
168
+ DiscriminatorP(2),
169
+ DiscriminatorP(3),
170
+ DiscriminatorP(5),
171
+ DiscriminatorP(7),
172
+ DiscriminatorP(11),
173
+ ])
174
+
175
+ def forward(self, y, y_hat):
176
+ y_d_rs = []
177
+ y_d_gs = []
178
+ fmap_rs = []
179
+ fmap_gs = []
180
+ for i, d in enumerate(self.discriminators):
181
+ y_d_r, fmap_r = d(y)
182
+ y_d_g, fmap_g = d(y_hat)
183
+ y_d_rs.append(y_d_r)
184
+ fmap_rs.append(fmap_r)
185
+ y_d_gs.append(y_d_g)
186
+ fmap_gs.append(fmap_g)
187
+
188
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
189
+
190
+
191
+ class DiscriminatorS(torch.nn.Module):
192
+ def __init__(self, use_spectral_norm=False):
193
+ super(DiscriminatorS, self).__init__()
194
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
195
+ self.convs = nn.ModuleList([
196
+ norm_f(Conv1d(1, 128, 15, 1, padding=7)),
197
+ norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
198
+ norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
199
+ norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
200
+ norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
201
+ norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
202
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
203
+ ])
204
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
205
+
206
+ def forward(self, x):
207
+ fmap = []
208
+ for l in self.convs:
209
+ x = l(x)
210
+ x = F.leaky_relu(x, LRELU_SLOPE)
211
+ fmap.append(x)
212
+ x = self.conv_post(x)
213
+ fmap.append(x)
214
+ x = torch.flatten(x, 1, -1)
215
+
216
+ return x, fmap
217
+
218
+
219
+ class MultiScaleDiscriminator(torch.nn.Module):
220
+ def __init__(self):
221
+ super(MultiScaleDiscriminator, self).__init__()
222
+ self.discriminators = nn.ModuleList([
223
+ DiscriminatorS(use_spectral_norm=True),
224
+ DiscriminatorS(),
225
+ DiscriminatorS(),
226
+ ])
227
+ self.meanpools = nn.ModuleList([
228
+ AvgPool1d(4, 2, padding=2),
229
+ AvgPool1d(4, 2, padding=2)
230
+ ])
231
+
232
+ def forward(self, y, y_hat):
233
+ y_d_rs = []
234
+ y_d_gs = []
235
+ fmap_rs = []
236
+ fmap_gs = []
237
+ for i, d in enumerate(self.discriminators):
238
+ if i != 0:
239
+ y = self.meanpools[i-1](y)
240
+ y_hat = self.meanpools[i-1](y_hat)
241
+ y_d_r, fmap_r = d(y)
242
+ y_d_g, fmap_g = d(y_hat)
243
+ y_d_rs.append(y_d_r)
244
+ fmap_rs.append(fmap_r)
245
+ y_d_gs.append(y_d_g)
246
+ fmap_gs.append(fmap_g)
247
+
248
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
249
+
250
+
251
+ def feature_loss(fmap_r, fmap_g):
252
+ loss = 0
253
+ for dr, dg in zip(fmap_r, fmap_g):
254
+ for rl, gl in zip(dr, dg):
255
+ loss += torch.mean(torch.abs(rl - gl))
256
+
257
+ return loss*2
258
+
259
+
260
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
261
+ loss = 0
262
+ r_losses = []
263
+ g_losses = []
264
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
265
+ r_loss = torch.mean((1-dr)**2)
266
+ g_loss = torch.mean(dg**2)
267
+ loss += (r_loss + g_loss)
268
+ r_losses.append(r_loss.item())
269
+ g_losses.append(g_loss.item())
270
+
271
+ return loss, r_losses, g_losses
272
+
273
+
274
+ def generator_loss(disc_outputs):
275
+ loss = 0
276
+ gen_losses = []
277
+ for dg in disc_outputs:
278
+ l = torch.mean((1-dg)**2)
279
+ gen_losses.append(l)
280
+ loss += l
281
+
282
+ return loss, gen_losses
283
+
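Taken together, Generator maps an 80-band mel spectrogram of shape (batch, 80, frames) to a waveform of shape (batch, 1, frames x prod(upsample_rates)), while feature_loss, discriminator_loss and generator_loss are the L1 feature-matching and least-squares GAN terms used against the two discriminators. A minimal forward-pass sketch, with a SimpleNamespace standing in for the usual HiFi-GAN JSON config (the rates and kernel sizes below are typical v1 values, not taken from this repo):

import torch
from types import SimpleNamespace
from models import Generator

h = SimpleNamespace(
    resblock='1',
    upsample_rates=[8, 8, 2, 2],
    upsample_kernel_sizes=[16, 16, 4, 4],
    upsample_initial_channel=512,
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
)
g = Generator(h).eval()
mel = torch.randn(1, 80, 100)      # 100 mel frames
with torch.no_grad():
    audio = g(mel)                 # (1, 1, 100 * 8*8*2*2) = (1, 1, 25600)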
multiproc.py ADDED
@@ -0,0 +1,23 @@
1
+ import time
2
+ import torch
3
+ import sys
4
+ import subprocess
5
+
6
+ argslist = list(sys.argv)[1:]
7
+ num_gpus = torch.cuda.device_count()
8
+ argslist.append('--n_gpus={}'.format(num_gpus))
9
+ workers = []
10
+ job_id = time.strftime("%Y_%m_%d-%H%M%S")
11
+ argslist.append("--group_name=group_{}".format(job_id))
12
+
13
+ for i in range(num_gpus):
14
+ argslist.append('--rank={}'.format(i))
15
+ stdout = None if i == 0 else open("logs/{}_GPU_{}.log".format(job_id, i),
16
+ "w")
17
+ print(argslist)
18
+ p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
19
+ workers.append(p)
20
+ argslist = argslist[:-1]
21
+
22
+ for p in workers:
23
+ p.wait()
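multiproc.py launches one copy of a training script per visible GPU, appending --n_gpus, --group_name and --rank to its arguments; every rank other than 0 has its stdout redirected to logs/<job_id>_GPU_<rank>.log, so a logs/ directory must exist before launching. A hypothetical invocation (the train.py flags are assumptions, not taken from this commit):

python multiproc.py train.py --output_directory=outdir --log_directory=logdir --hparams=distributed_run=True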
plotting_utils.py ADDED
@@ -0,0 +1,61 @@
1
+ import matplotlib
2
+ matplotlib.use("Agg")
3
+ import matplotlib.pylab as plt
4
+ import numpy as np
5
+
6
+
7
+ def save_figure_to_numpy(fig):
8
+ # save it to a numpy array.
9
+ data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
10
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
11
+ return data
12
+
13
+
14
+ def plot_alignment_to_numpy(alignment, info=None):
15
+ fig, ax = plt.subplots(figsize=(6, 4))
16
+ im = ax.imshow(alignment, aspect='auto', origin='lower',
17
+ interpolation='none')
18
+ fig.colorbar(im, ax=ax)
19
+ xlabel = 'Decoder timestep'
20
+ if info is not None:
21
+ xlabel += '\n\n' + info
22
+ plt.xlabel(xlabel)
23
+ plt.ylabel('Encoder timestep')
24
+ plt.tight_layout()
25
+
26
+ fig.canvas.draw()
27
+ data = save_figure_to_numpy(fig)
28
+ plt.close()
29
+ return data
30
+
31
+
32
+ def plot_spectrogram_to_numpy(spectrogram):
33
+ fig, ax = plt.subplots(figsize=(12, 3))
34
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower",
35
+ interpolation='none')
36
+ plt.colorbar(im, ax=ax)
37
+ plt.xlabel("Frames")
38
+ plt.ylabel("Channels")
39
+ plt.tight_layout()
40
+
41
+ fig.canvas.draw()
42
+ data = save_figure_to_numpy(fig)
43
+ plt.close()
44
+ return data
45
+
46
+
47
+ def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
48
+ fig, ax = plt.subplots(figsize=(12, 3))
49
+ ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
50
+ color='green', marker='+', s=1, label='target')
51
+ ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
52
+ color='red', marker='.', s=1, label='predicted')
53
+
54
+ plt.xlabel("Frames (Green target, Red predicted)")
55
+ plt.ylabel("Gate State")
56
+ plt.tight_layout()
57
+
58
+ fig.canvas.draw()
59
+ data = save_figure_to_numpy(fig)
60
+ plt.close()
61
+ return data
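Each helper renders a matplotlib figure and returns it as an HxWx3 uint8 array, which is convenient for image logging. A small sketch with tensorboardX (listed in requirements.txt); the log directory and tag are arbitrary. Note that save_figure_to_numpy relies on np.fromstring and FigureCanvas.tostring_rgb, which newer numpy and matplotlib releases deprecate, so the pinned numpy==1.22.4 matters here.

import numpy as np
from tensorboardX import SummaryWriter
from plotting_utils import plot_alignment_to_numpy

alignment = np.random.rand(60, 120)          # (encoder steps, decoder steps)
image = plot_alignment_to_numpy(alignment)   # HxWx3 uint8 array
writer = SummaryWriter("logs/demo")
writer.add_image("alignment", image, 0, dataformats='HWC')
writer.close()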
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ pillow
2
+ matplotlib
3
+ numpy==1.22.4
4
+ inflect
5
+ librosa
6
+ denoiser
7
+ pysoundfile
8
+ scipy
9
+ Unidecode
11
+ openjtalk>=0.3.0.dev2
12
+ janome
13
+ torch
14
+ tensorboardX
stft.py ADDED
@@ -0,0 +1,141 @@
1
+ """
2
+ BSD 3-Clause License
3
+
4
+ Copyright (c) 2017, Prem Seetharaman
5
+ All rights reserved.
6
+
7
+ * Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted provided that the following conditions are met:
9
+
10
+ * Redistributions of source code must retain the above copyright notice,
11
+ this list of conditions and the following disclaimer.
12
+
13
+ * Redistributions in binary form must reproduce the above copyright notice, this
14
+ list of conditions and the following disclaimer in the
15
+ documentation and/or other materials provided with the distribution.
16
+
17
+ * Neither the name of the copyright holder nor the names of its
18
+ contributors may be used to endorse or promote products derived from this
19
+ software without specific prior written permission.
20
+
21
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
+ """
32
+
33
+ import torch
34
+ import numpy as np
35
+ import torch.nn.functional as F
36
+ from torch.autograd import Variable
37
+ from scipy.signal import get_window
38
+ from librosa.util import pad_center, tiny
39
+ from audio_processing import window_sumsquare
40
+
41
+
42
+ class STFT(torch.nn.Module):
43
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
44
+ def __init__(self, filter_length=800, hop_length=200, win_length=800,
45
+ window='hann'):
46
+ super(STFT, self).__init__()
47
+ self.filter_length = filter_length
48
+ self.hop_length = hop_length
49
+ self.win_length = win_length
50
+ self.window = window
51
+ self.forward_transform = None
52
+ scale = self.filter_length / self.hop_length
53
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
54
+
55
+ cutoff = int((self.filter_length / 2 + 1))
56
+ fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
57
+ np.imag(fourier_basis[:cutoff, :])])
58
+
59
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
60
+ inverse_basis = torch.FloatTensor(
61
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :])
62
+
63
+ if window is not None:
64
+ assert(filter_length >= win_length)
65
+ # get window and zero center pad it to filter_length
66
+ fft_window = get_window(window, win_length, fftbins=True)
67
+ fft_window = pad_center(fft_window, filter_length)
68
+ fft_window = torch.from_numpy(fft_window).float()
69
+
70
+ # window the bases
71
+ forward_basis *= fft_window
72
+ inverse_basis *= fft_window
73
+
74
+ self.register_buffer('forward_basis', forward_basis.float())
75
+ self.register_buffer('inverse_basis', inverse_basis.float())
76
+
77
+ def transform(self, input_data):
78
+ num_batches = input_data.size(0)
79
+ num_samples = input_data.size(1)
80
+
81
+ self.num_samples = num_samples
82
+
83
+ # similar to librosa, reflect-pad the input
84
+ input_data = input_data.view(num_batches, 1, num_samples)
85
+ input_data = F.pad(
86
+ input_data.unsqueeze(1),
87
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
88
+ mode='reflect')
89
+ input_data = input_data.squeeze(1)
90
+
91
+ forward_transform = F.conv1d(
92
+ input_data,
93
+ Variable(self.forward_basis, requires_grad=False),
94
+ stride=self.hop_length,
95
+ padding=0)
96
+
97
+ cutoff = int((self.filter_length / 2) + 1)
98
+ real_part = forward_transform[:, :cutoff, :]
99
+ imag_part = forward_transform[:, cutoff:, :]
100
+
101
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
102
+ phase = torch.autograd.Variable(
103
+ torch.atan2(imag_part.data, real_part.data))
104
+
105
+ return magnitude, phase
106
+
107
+ def inverse(self, magnitude, phase):
108
+ recombine_magnitude_phase = torch.cat(
109
+ [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)
110
+
111
+ inverse_transform = F.conv_transpose1d(
112
+ recombine_magnitude_phase,
113
+ Variable(self.inverse_basis, requires_grad=False),
114
+ stride=self.hop_length,
115
+ padding=0)
116
+
117
+ if self.window is not None:
118
+ window_sum = window_sumsquare(
119
+ self.window, magnitude.size(-1), hop_length=self.hop_length,
120
+ win_length=self.win_length, n_fft=self.filter_length,
121
+ dtype=np.float32)
122
+ # remove modulation effects
123
+ approx_nonzero_indices = torch.from_numpy(
124
+ np.where(window_sum > tiny(window_sum))[0])
125
+ window_sum = torch.autograd.Variable(
126
+ torch.from_numpy(window_sum), requires_grad=False)
127
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
128
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]
129
+
130
+ # scale by hop ratio
131
+ inverse_transform *= float(self.filter_length) / self.hop_length
132
+
133
+ inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
134
+ inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]
135
+
136
+ return inverse_transform
137
+
138
+ def forward(self, input_data):
139
+ self.magnitude, self.phase = self.transform(input_data)
140
+ reconstruction = self.inverse(self.magnitude, self.phase)
141
+ return reconstruction
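STFT.transform returns separate magnitude and phase tensors of shape (batch, filter_length/2 + 1, frames), and forward simply chains transform and inverse for a reconstruction round trip. A minimal CPU sketch with the default 800/200/800 Hann configuration (the 440 Hz test tone is arbitrary):

import math
import torch
from stft import STFT

stft = STFT(filter_length=800, hop_length=200, win_length=800, window='hann')
t = torch.arange(16000, dtype=torch.float32) / 22050.0
audio = torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)   # (1, num_samples)

magnitude, phase = stft.transform(audio)                  # (1, 401, frames) each
reconstruction = stft(audio)                              # transform followed by inverse
print(magnitude.shape, reconstruction.shape)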
tensorboard.png ADDED
text/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2017 Keith Ito
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,74 @@
1
+ """ from https://github.com/keithito/tacotron """
2
+ import re
3
+ from text import cleaners
4
+ from text.symbols import symbols
5
+
6
+
7
+ # Mappings from symbol to numeric ID and vice versa:
8
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
9
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
10
+
11
+ # Regular expression matching text enclosed in curly braces:
12
+ _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
13
+
14
+
15
+ def text_to_sequence(text, cleaner_names):
16
+ '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
17
+
18
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
19
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
20
+
21
+ Args:
22
+ text: string to convert to a sequence
23
+ cleaner_names: names of the cleaner functions to run the text through
24
+
25
+ Returns:
26
+ List of integers corresponding to the symbols in the text
27
+ '''
28
+ sequence = []
29
+
30
+ # Check for curly braces and treat their contents as ARPAbet:
31
+ while len(text):
32
+ m = _curly_re.match(text)
33
+ if not m:
34
+ sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
35
+ break
36
+ sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
37
+ sequence += _arpabet_to_sequence(m.group(2))
38
+ text = m.group(3)
39
+
40
+ return sequence
41
+
42
+
43
+ def sequence_to_text(sequence):
44
+ '''Converts a sequence of IDs back to a string'''
45
+ result = ''
46
+ for symbol_id in sequence:
47
+ if symbol_id in _id_to_symbol:
48
+ s = _id_to_symbol[symbol_id]
49
+ # Enclose ARPAbet back in curly braces:
50
+ if len(s) > 1 and s[0] == '@':
51
+ s = '{%s}' % s[1:]
52
+ result += s
53
+ return result.replace('}{', ' ')
54
+
55
+
56
+ def _clean_text(text, cleaner_names):
57
+ for name in cleaner_names:
58
+ cleaner = getattr(cleaners, name)
59
+ if not cleaner:
60
+ raise Exception('Unknown cleaner: %s' % name)
61
+ text = cleaner(text)
62
+ return text
63
+
64
+
65
+ def _symbols_to_sequence(symbols):
66
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
67
+
68
+
69
+ def _arpabet_to_sequence(text):
70
+ return _symbols_to_sequence(['@' + s for s in text.split()])
71
+
72
+
73
+ def _should_keep_symbol(s):
74
+ return s in _symbol_to_id and s != '_' and s != '~'
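A short round-trip sketch for the two public helpers; the "english_cleaners" name assumes a Keith Ito-style cleaner exists in text/cleaners.py and may differ in this repo:

from text import text_to_sequence, sequence_to_text

ids = text_to_sequence("Turn left on {HH AW1 S S T AH0 N} Street.", ["english_cleaners"])
print(ids)                    # list of symbol IDs
print(sequence_to_text(ids))  # the ARPAbet span comes back wrapped in braces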
text/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (2.69 kB).
text/__pycache__/cleaners.cpython-310.pyc ADDED
Binary file (5.15 kB).
text/__pycache__/cmudict.cpython-310.pyc ADDED
Binary file (2.35 kB).
text/__pycache__/numbers.cpython-310.pyc ADDED
Binary file (2.19 kB).
text/__pycache__/symbols.cpython-310.pyc ADDED
Binary file (578 Bytes).