Upload 72 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +64 -36
- .gitignore +374 -0
- .gitmodules +4 -0
- Dockerfile +10 -0
- LICENSE +29 -0
- README.md +42 -2
- Yue_gradio.py +243 -0
- Yui_TrapGenesis +3 -0
- __pycache__/audio_processing.cpython-310.pyc +0 -0
- __pycache__/env.cpython-310.pyc +0 -0
- __pycache__/hifiutils.cpython-310.pyc +0 -0
- __pycache__/hparams.cpython-310.pyc +0 -0
- __pycache__/layers.cpython-310.pyc +0 -0
- __pycache__/meldataset.cpython-310.pyc +0 -0
- __pycache__/model.cpython-310.pyc +0 -0
- __pycache__/models.cpython-310.pyc +0 -0
- __pycache__/stft.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- audio_processing.py +93 -0
- colab-train-zh-cn.ipynb +0 -0
- colab.ipynb +0 -0
- data_utils.py +111 -0
- demo.wav +0 -0
- distributed.py +173 -0
- env.py +15 -0
- filelists/transcript_train.txt +0 -0
- filelists/transcript_val.txt +426 -0
- hifiutils.py +58 -0
- hparams (1).py +94 -0
- hparams.py +94 -0
- inference.ipynb +0 -0
- layers.py +80 -0
- logger.py +48 -0
- loss_function.py +19 -0
- loss_scaler.py +131 -0
- meldataset.py +168 -0
- model.py +529 -0
- models.py +283 -0
- multiproc.py +23 -0
- plotting_utils.py +61 -0
- requirements.txt +14 -0
- stft.py +141 -0
- tensorboard.png +0 -0
- text/LICENSE +19 -0
- text/__init__.py +74 -0
- text/__pycache__/__init__.cpython-310.pyc +0 -0
- text/__pycache__/cleaners.cpython-310.pyc +0 -0
- text/__pycache__/cmudict.cpython-310.pyc +0 -0
- text/__pycache__/numbers.cpython-310.pyc +0 -0
- text/__pycache__/symbols.cpython-310.pyc +0 -0
.gitattributes
CHANGED
@@ -1,36 +1,64 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
###############################################################################
|
2 |
+
# Set default behavior to automatically normalize line endings.
|
3 |
+
###############################################################################
|
4 |
+
* text=auto
|
5 |
+
|
6 |
+
###############################################################################
|
7 |
+
# Set default behavior for command prompt diff.
|
8 |
+
#
|
9 |
+
# This is needed for earlier builds of msysgit that do not have it on by
|
10 |
+
# default for csharp files.
|
11 |
+
# Note: This is only used by command line
|
12 |
+
###############################################################################
|
13 |
+
#*.cs diff=csharp
|
14 |
+
|
15 |
+
###############################################################################
|
16 |
+
# Set the merge driver for project and solution files
|
17 |
+
#
|
18 |
+
# Merging from the command prompt will add diff markers to the files if there
|
19 |
+
# are conflicts (Merging from VS is not affected by the settings below, in VS
|
20 |
+
# the diff markers are never inserted). Diff markers may cause the following
|
21 |
+
# file extensions to fail to load in VS. An alternative would be to treat
|
22 |
+
# these files as binary and thus will always conflict and require user
|
23 |
+
# intervention with every merge. To do so, just uncomment the entries below
|
24 |
+
###############################################################################
|
25 |
+
#*.sln merge=binary
|
26 |
+
#*.csproj merge=binary
|
27 |
+
#*.vbproj merge=binary
|
28 |
+
#*.vcxproj merge=binary
|
29 |
+
#*.vcproj merge=binary
|
30 |
+
#*.dbproj merge=binary
|
31 |
+
#*.fsproj merge=binary
|
32 |
+
#*.lsproj merge=binary
|
33 |
+
#*.wixproj merge=binary
|
34 |
+
#*.modelproj merge=binary
|
35 |
+
#*.sqlproj merge=binary
|
36 |
+
#*.wwaproj merge=binary
|
37 |
+
|
38 |
+
###############################################################################
|
39 |
+
# behavior for image files
|
40 |
+
#
|
41 |
+
# image files are treated as binary by default.
|
42 |
+
###############################################################################
|
43 |
+
#*.jpg binary
|
44 |
+
#*.png binary
|
45 |
+
#*.gif binary
|
46 |
+
|
47 |
+
###############################################################################
|
48 |
+
# diff behavior for common document formats
|
49 |
+
#
|
50 |
+
# Convert binary document formats to text before diffing them. This feature
|
51 |
+
# is only available from the command line. Turn it on by uncommenting the
|
52 |
+
# entries below.
|
53 |
+
###############################################################################
|
54 |
+
#*.doc diff=astextplain
|
55 |
+
#*.DOC diff=astextplain
|
56 |
+
#*.docx diff=astextplain
|
57 |
+
#*.DOCX diff=astextplain
|
58 |
+
#*.dot diff=astextplain
|
59 |
+
#*.DOT diff=astextplain
|
60 |
+
#*.pdf diff=astextplain
|
61 |
+
#*.PDF diff=astextplain
|
62 |
+
#*.rtf diff=astextplain
|
63 |
+
#*.RTF diff=astextplain
|
64 |
+
Yui_TrapGenesis filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Ignore Visual Studio temporary files, build results, and
|
2 |
+
## files generated by popular Visual Studio add-ons.
|
3 |
+
##
|
4 |
+
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
|
5 |
+
|
6 |
+
# User-specific files
|
7 |
+
*.rsuser
|
8 |
+
*.suo
|
9 |
+
*.user
|
10 |
+
*.userosscache
|
11 |
+
*.sln.docstates
|
12 |
+
|
13 |
+
# User-specific files (MonoDevelop/Xamarin Studio)
|
14 |
+
*.userprefs
|
15 |
+
|
16 |
+
# Mono auto generated files
|
17 |
+
mono_crash.*
|
18 |
+
|
19 |
+
# Build results
|
20 |
+
[Dd]ebug/
|
21 |
+
[Dd]ebugPublic/
|
22 |
+
[Rr]elease/
|
23 |
+
[Rr]eleases/
|
24 |
+
x64/
|
25 |
+
x86/
|
26 |
+
[Ww][Ii][Nn]32/
|
27 |
+
[Aa][Rr][Mm]/
|
28 |
+
[Aa][Rr][Mm]64/
|
29 |
+
bld/
|
30 |
+
[Bb]in/
|
31 |
+
[Oo]bj/
|
32 |
+
[Oo]ut/
|
33 |
+
[Ll]og/
|
34 |
+
[Ll]ogs/
|
35 |
+
|
36 |
+
# Visual Studio 2015/2017 cache/options directory
|
37 |
+
.vs/
|
38 |
+
# Uncomment if you have tasks that create the project's static files in wwwroot
|
39 |
+
#wwwroot/
|
40 |
+
|
41 |
+
# Visual Studio 2017 auto generated files
|
42 |
+
Generated\ Files/
|
43 |
+
|
44 |
+
# MSTest test Results
|
45 |
+
[Tt]est[Rr]esult*/
|
46 |
+
[Bb]uild[Ll]og.*
|
47 |
+
|
48 |
+
# NUnit
|
49 |
+
*.VisualState.xml
|
50 |
+
TestResult.xml
|
51 |
+
nunit-*.xml
|
52 |
+
|
53 |
+
# Build Results of an ATL Project
|
54 |
+
[Dd]ebugPS/
|
55 |
+
[Rr]eleasePS/
|
56 |
+
dlldata.c
|
57 |
+
|
58 |
+
# Benchmark Results
|
59 |
+
BenchmarkDotNet.Artifacts/
|
60 |
+
|
61 |
+
# .NET Core
|
62 |
+
project.lock.json
|
63 |
+
project.fragment.lock.json
|
64 |
+
artifacts/
|
65 |
+
|
66 |
+
# ASP.NET Scaffolding
|
67 |
+
ScaffoldingReadMe.txt
|
68 |
+
|
69 |
+
# StyleCop
|
70 |
+
StyleCopReport.xml
|
71 |
+
|
72 |
+
# Files built by Visual Studio
|
73 |
+
*_i.c
|
74 |
+
*_p.c
|
75 |
+
*_h.h
|
76 |
+
*.ilk
|
77 |
+
*.meta
|
78 |
+
*.obj
|
79 |
+
*.iobj
|
80 |
+
*.pch
|
81 |
+
*.pdb
|
82 |
+
*.ipdb
|
83 |
+
*.pgc
|
84 |
+
*.pgd
|
85 |
+
*.rsp
|
86 |
+
*.sbr
|
87 |
+
*.tlb
|
88 |
+
*.tli
|
89 |
+
*.tlh
|
90 |
+
*.tmp
|
91 |
+
*.tmp_proj
|
92 |
+
*_wpftmp.csproj
|
93 |
+
*.log
|
94 |
+
*.vspscc
|
95 |
+
*.vssscc
|
96 |
+
.builds
|
97 |
+
*.pidb
|
98 |
+
*.svclog
|
99 |
+
*.scc
|
100 |
+
|
101 |
+
# Chutzpah Test files
|
102 |
+
_Chutzpah*
|
103 |
+
|
104 |
+
# Visual C++ cache files
|
105 |
+
ipch/
|
106 |
+
*.aps
|
107 |
+
*.ncb
|
108 |
+
*.opendb
|
109 |
+
*.opensdf
|
110 |
+
*.sdf
|
111 |
+
*.cachefile
|
112 |
+
*.VC.db
|
113 |
+
*.VC.VC.opendb
|
114 |
+
|
115 |
+
# Visual Studio profiler
|
116 |
+
*.psess
|
117 |
+
*.vsp
|
118 |
+
*.vspx
|
119 |
+
*.sap
|
120 |
+
|
121 |
+
# Visual Studio Trace Files
|
122 |
+
*.e2e
|
123 |
+
|
124 |
+
# TFS 2012 Local Workspace
|
125 |
+
$tf/
|
126 |
+
|
127 |
+
# Guidance Automation Toolkit
|
128 |
+
*.gpState
|
129 |
+
|
130 |
+
# ReSharper is a .NET coding add-in
|
131 |
+
_ReSharper*/
|
132 |
+
*.[Rr]e[Ss]harper
|
133 |
+
*.DotSettings.user
|
134 |
+
|
135 |
+
# TeamCity is a build add-in
|
136 |
+
_TeamCity*
|
137 |
+
|
138 |
+
# DotCover is a Code Coverage Tool
|
139 |
+
*.dotCover
|
140 |
+
|
141 |
+
# AxoCover is a Code Coverage Tool
|
142 |
+
.axoCover/*
|
143 |
+
!.axoCover/settings.json
|
144 |
+
|
145 |
+
# Coverlet is a free, cross platform Code Coverage Tool
|
146 |
+
coverage*.json
|
147 |
+
coverage*.xml
|
148 |
+
coverage*.info
|
149 |
+
|
150 |
+
# Visual Studio code coverage results
|
151 |
+
*.coverage
|
152 |
+
*.coveragexml
|
153 |
+
|
154 |
+
# NCrunch
|
155 |
+
_NCrunch_*
|
156 |
+
.*crunch*.local.xml
|
157 |
+
nCrunchTemp_*
|
158 |
+
|
159 |
+
# MightyMoose
|
160 |
+
*.mm.*
|
161 |
+
AutoTest.Net/
|
162 |
+
|
163 |
+
# Web workbench (sass)
|
164 |
+
.sass-cache/
|
165 |
+
|
166 |
+
# Installshield output folder
|
167 |
+
[Ee]xpress/
|
168 |
+
|
169 |
+
# DocProject is a documentation generator add-in
|
170 |
+
DocProject/buildhelp/
|
171 |
+
DocProject/Help/*.HxT
|
172 |
+
DocProject/Help/*.HxC
|
173 |
+
DocProject/Help/*.hhc
|
174 |
+
DocProject/Help/*.hhk
|
175 |
+
DocProject/Help/*.hhp
|
176 |
+
DocProject/Help/Html2
|
177 |
+
DocProject/Help/html
|
178 |
+
|
179 |
+
# Click-Once directory
|
180 |
+
publish/
|
181 |
+
|
182 |
+
# Publish Web Output
|
183 |
+
*.[Pp]ublish.xml
|
184 |
+
*.azurePubxml
|
185 |
+
# Note: Comment the next line if you want to checkin your web deploy settings,
|
186 |
+
# but database connection strings (with potential passwords) will be unencrypted
|
187 |
+
*.pubxml
|
188 |
+
*.publishproj
|
189 |
+
|
190 |
+
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
191 |
+
# checkin your Azure Web App publish settings, but sensitive information contained
|
192 |
+
# in these scripts will be unencrypted
|
193 |
+
PublishScripts/
|
194 |
+
|
195 |
+
# NuGet Packages
|
196 |
+
*.nupkg
|
197 |
+
# NuGet Symbol Packages
|
198 |
+
*.snupkg
|
199 |
+
# The packages folder can be ignored because of Package Restore
|
200 |
+
**/[Pp]ackages/*
|
201 |
+
# except build/, which is used as an MSBuild target.
|
202 |
+
!**/[Pp]ackages/build/
|
203 |
+
# Uncomment if necessary however generally it will be regenerated when needed
|
204 |
+
#!**/[Pp]ackages/repositories.config
|
205 |
+
# NuGet v3's project.json files produces more ignorable files
|
206 |
+
*.nuget.props
|
207 |
+
*.nuget.targets
|
208 |
+
|
209 |
+
# Microsoft Azure Build Output
|
210 |
+
csx/
|
211 |
+
*.build.csdef
|
212 |
+
|
213 |
+
# Microsoft Azure Emulator
|
214 |
+
ecf/
|
215 |
+
rcf/
|
216 |
+
|
217 |
+
# Windows Store app package directories and files
|
218 |
+
AppPackages/
|
219 |
+
BundleArtifacts/
|
220 |
+
Package.StoreAssociation.xml
|
221 |
+
_pkginfo.txt
|
222 |
+
*.appx
|
223 |
+
*.appxbundle
|
224 |
+
*.appxupload
|
225 |
+
|
226 |
+
# Visual Studio cache files
|
227 |
+
# files ending in .cache can be ignored
|
228 |
+
*.[Cc]ache
|
229 |
+
# but keep track of directories ending in .cache
|
230 |
+
!?*.[Cc]ache/
|
231 |
+
|
232 |
+
# Others
|
233 |
+
ClientBin/
|
234 |
+
~$*
|
235 |
+
*~
|
236 |
+
*.dbmdl
|
237 |
+
*.dbproj.schemaview
|
238 |
+
*.jfm
|
239 |
+
*.pfx
|
240 |
+
*.publishsettings
|
241 |
+
orleans.codegen.cs
|
242 |
+
|
243 |
+
# Including strong name files can present a security risk
|
244 |
+
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
245 |
+
#*.snk
|
246 |
+
|
247 |
+
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
248 |
+
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
249 |
+
#bower_components/
|
250 |
+
|
251 |
+
# RIA/Silverlight projects
|
252 |
+
Generated_Code/
|
253 |
+
|
254 |
+
# Backup & report files from converting an old project file
|
255 |
+
# to a newer Visual Studio version. Backup files are not needed,
|
256 |
+
# because we have git ;-)
|
257 |
+
_UpgradeReport_Files/
|
258 |
+
Backup*/
|
259 |
+
UpgradeLog*.XML
|
260 |
+
UpgradeLog*.htm
|
261 |
+
ServiceFabricBackup/
|
262 |
+
*.rptproj.bak
|
263 |
+
|
264 |
+
# SQL Server files
|
265 |
+
*.mdf
|
266 |
+
*.ldf
|
267 |
+
*.ndf
|
268 |
+
|
269 |
+
# Business Intelligence projects
|
270 |
+
*.rdl.data
|
271 |
+
*.bim.layout
|
272 |
+
*.bim_*.settings
|
273 |
+
*.rptproj.rsuser
|
274 |
+
*- [Bb]ackup.rdl
|
275 |
+
*- [Bb]ackup ([0-9]).rdl
|
276 |
+
*- [Bb]ackup ([0-9][0-9]).rdl
|
277 |
+
|
278 |
+
# Microsoft Fakes
|
279 |
+
FakesAssemblies/
|
280 |
+
|
281 |
+
# GhostDoc plugin setting file
|
282 |
+
*.GhostDoc.xml
|
283 |
+
|
284 |
+
# Node.js Tools for Visual Studio
|
285 |
+
.ntvs_analysis.dat
|
286 |
+
node_modules/
|
287 |
+
|
288 |
+
# Visual Studio 6 build log
|
289 |
+
*.plg
|
290 |
+
|
291 |
+
# Visual Studio 6 workspace options file
|
292 |
+
*.opt
|
293 |
+
|
294 |
+
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
295 |
+
*.vbw
|
296 |
+
|
297 |
+
# Visual Studio LightSwitch build output
|
298 |
+
**/*.HTMLClient/GeneratedArtifacts
|
299 |
+
**/*.DesktopClient/GeneratedArtifacts
|
300 |
+
**/*.DesktopClient/ModelManifest.xml
|
301 |
+
**/*.Server/GeneratedArtifacts
|
302 |
+
**/*.Server/ModelManifest.xml
|
303 |
+
_Pvt_Extensions
|
304 |
+
|
305 |
+
# Paket dependency manager
|
306 |
+
.paket/paket.exe
|
307 |
+
paket-files/
|
308 |
+
|
309 |
+
# FAKE - F# Make
|
310 |
+
.fake/
|
311 |
+
|
312 |
+
# CodeRush personal settings
|
313 |
+
.cr/personal
|
314 |
+
|
315 |
+
# Python Tools for Visual Studio (PTVS)
|
316 |
+
__pycache__/
|
317 |
+
*.pyc
|
318 |
+
|
319 |
+
# Cake - Uncomment if you are using it
|
320 |
+
# tools/**
|
321 |
+
# !tools/packages.config
|
322 |
+
|
323 |
+
# Tabs Studio
|
324 |
+
*.tss
|
325 |
+
|
326 |
+
# Telerik's JustMock configuration file
|
327 |
+
*.jmconfig
|
328 |
+
|
329 |
+
# BizTalk build output
|
330 |
+
*.btp.cs
|
331 |
+
*.btm.cs
|
332 |
+
*.odx.cs
|
333 |
+
*.xsd.cs
|
334 |
+
|
335 |
+
# OpenCover UI analysis results
|
336 |
+
OpenCover/
|
337 |
+
|
338 |
+
# Azure Stream Analytics local run output
|
339 |
+
ASALocalRun/
|
340 |
+
|
341 |
+
# MSBuild Binary and Structured Log
|
342 |
+
*.binlog
|
343 |
+
|
344 |
+
# NVidia Nsight GPU debugger configuration file
|
345 |
+
*.nvuser
|
346 |
+
|
347 |
+
# MFractors (Xamarin productivity tool) working folder
|
348 |
+
.mfractor/
|
349 |
+
|
350 |
+
# Local History for Visual Studio
|
351 |
+
.localhistory/
|
352 |
+
|
353 |
+
# BeatPulse healthcheck temp database
|
354 |
+
healthchecksdb
|
355 |
+
|
356 |
+
# Backup folder for Package Reference Convert tool in Visual Studio 2017
|
357 |
+
MigrationBackup/
|
358 |
+
|
359 |
+
# Ionide (cross platform F# VS Code tools) working folder
|
360 |
+
.ionide/
|
361 |
+
|
362 |
+
# Fody - auto-generated XML schema
|
363 |
+
FodyWeavers.xsd
|
364 |
+
|
365 |
+
# models
|
366 |
+
/ayachi_*
|
367 |
+
/inaba_*
|
368 |
+
/tomotake_*
|
369 |
+
/murasame_*
|
370 |
+
/arihara_*
|
371 |
+
/waveglow_*
|
372 |
+
|
373 |
+
# jupyter cache
|
374 |
+
/.ipynb_checkpoints
|
.gitmodules
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[submodule "waveglow"]
|
2 |
+
path = waveglow
|
3 |
+
url = https://github.com/NVIDIA/waveglow
|
4 |
+
branch = master
|
Dockerfile
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM pytorch/pytorch:nightly-devel-cuda10.0-cudnn7
|
2 |
+
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:${PATH}
|
3 |
+
|
4 |
+
RUN apt-get update -y
|
5 |
+
|
6 |
+
RUN pip install numpy scipy matplotlib librosa==0.6.0 tensorflow tensorboardX inflect==0.2.5 Unidecode==1.0.22 pillow jupyter
|
7 |
+
|
8 |
+
ADD apex /apex/
|
9 |
+
WORKDIR /apex/
|
10 |
+
RUN pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
|
LICENSE
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
BSD 3-Clause License
|
2 |
+
|
3 |
+
Copyright (c) 2018, NVIDIA Corporation
|
4 |
+
All rights reserved.
|
5 |
+
|
6 |
+
Redistribution and use in source and binary forms, with or without
|
7 |
+
modification, are permitted provided that the following conditions are met:
|
8 |
+
|
9 |
+
* Redistributions of source code must retain the above copyright notice, this
|
10 |
+
list of conditions and the following disclaimer.
|
11 |
+
|
12 |
+
* Redistributions in binary form must reproduce the above copyright notice,
|
13 |
+
this list of conditions and the following disclaimer in the documentation
|
14 |
+
and/or other materials provided with the distribution.
|
15 |
+
|
16 |
+
* Neither the name of the copyright holder nor the names of its
|
17 |
+
contributors may be used to endorse or promote products derived from
|
18 |
+
this software without specific prior written permission.
|
19 |
+
|
20 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
21 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
22 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
23 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
24 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
25 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
26 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
27 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
28 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
29 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
README.md
CHANGED
@@ -1,3 +1,43 @@
|
|
1 |
---
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: TTS_Yui
|
3 |
+
app_file: Yue_gradio_cpu.py
|
4 |
+
sdk: gradio
|
5 |
+
sdk_version: 3.36.1
|
6 |
+
---
|
7 |
+
# Tacotron2-Japanese
|
8 |
+
- A Tacotron2 implementation for Japanese
|
9 |
+
## Links
|
10 |
+
* Reference: [NVIDIA/tacotron2](https://github.com/NVIDIA/tacotron2)
|
11 |
+
* [Pre-training tacotron2 models](https://github.com/CjangCjengh/TTSModels)
|
12 |
+
* [The latest changes can be viewed in this repository](https://github.com/StarxSky/tacotron2-JP)
|
13 |
+
|
14 |
+
## How to use
|
15 |
+
1. Put raw Japanese texts in ./filelists
|
16 |
+
2. Put WAV files in ./wav
|
17 |
+
3. (Optional) Download NVIDIA's [pretrained model](https://drive.google.com/file/d/1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA/view?usp=sharing)
|
18 |
+
4. Open ./train.ipynb to install requirements and start training
|
19 |
+
5. Download NVIDIA's [WaveGlow model](https://drive.google.com/open?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF)
|
20 |
+
6. Open ./inference.ipynb to generate voice
|
21 |
+
|
22 |
+
## Cleaners
|
23 |
+
File ./hparams.py line 30
|
24 |
+
### 1. 'japanese_cleaners'
|
25 |
+
#### Before
|
26 |
+
何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
|
27 |
+
#### After
|
28 |
+
nanikaacltaraitsudemohanashItekudasai.gakuiNnokotojanaku,shijinikaNsurukotodemonanidemo.
|
29 |
+
### 2. 'japanese_tokenization_cleaners'
|
30 |
+
#### Before
|
31 |
+
何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
|
32 |
+
#### After
|
33 |
+
nani ka acl tara itsu demo hanashi te kudasai. gakuiN no koto ja naku, shiji nikaNsuru koto de mo naNdemo.
|
34 |
+
### 3. 'japanese_accent_cleaners'
|
35 |
+
#### Before
|
36 |
+
何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
|
37 |
+
#### After
|
38 |
+
:na)nika a)cltara i)tsudemo ha(na)shIte ku(dasa)i.:ga(kuiNno ko(to)janaku,:shi)jini ka(Nsu)ru ko(to)demo na)nidemo.
|
39 |
+
### 4. 'japanese_phrase_cleaners'
|
40 |
+
#### Before
|
41 |
+
何かあったらいつでも話して下さい。学院のことじゃなく、私事に関することでも何でも
|
42 |
+
#### After
|
43 |
+
nanika acltara itsudemo hanashIte kudasai. gakuiNno kotojanaku, shijini kaNsuru kotodemo nanidemo.
|
Yue_gradio.py
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#好用的
|
2 |
+
|
3 |
+
import os
|
4 |
+
os.system('pip install -U tensorflow')
|
5 |
+
os.system('pip install -q unidecode tensorboardX')
|
6 |
+
os.system('pip install librosa==0.8.0')
|
7 |
+
os.system('pip install pysoundfile==0.9.0.post1')
|
8 |
+
os.system('pip install unidecode==1.3.4')
|
9 |
+
os.system('pip install pyopenjtalk --no-build-isolation')
|
10 |
+
os.system('pip install inflect==5.6.2')
|
11 |
+
os.system('pip install janome==0.4.2')
|
12 |
+
os.system('pip install tqdm -q')
|
13 |
+
os.system('pip install gdown')
|
14 |
+
os.system('pip install -q librosa unidecode')
|
15 |
+
|
16 |
+
os.system('pip install ipython')
|
17 |
+
os.system('pip install --upgrade jupyter ipywidgets')
|
18 |
+
os.system('jupyter nbextension enable --py widgetsnbextension')
|
19 |
+
os.system('pip uninstall tqdm')
|
20 |
+
os.system('pip install tqdm')
|
21 |
+
|
22 |
+
import time
|
23 |
+
import pyopenjtalk
|
24 |
+
import soundfile as sf
|
25 |
+
import gradio as gr
|
26 |
+
import torch
|
27 |
+
import IPython.display as ipd
|
28 |
+
import numpy as np
|
29 |
+
import torch
|
30 |
+
import json
|
31 |
+
from hparams import create_hparams
|
32 |
+
from model import Tacotron2
|
33 |
+
from layers import TacotronSTFT
|
34 |
+
from audio_processing import griffin_lim
|
35 |
+
from text import text_to_sequence
|
36 |
+
from env import AttrDict
|
37 |
+
from meldataset import MAX_WAV_VALUE
|
38 |
+
from models import Generator
|
39 |
+
|
40 |
+
#@title Configure and run
|
41 |
+
|
42 |
+
#国际 HiFi-GAN 模型(有点机器音): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
|
43 |
+
#@markdown 你训练好的tacotron2模型的路径填在`Tacotron2_Model`这里
|
44 |
+
Tacotron2_Model = '/content/Yui_TrapGenesis'#@param {type:"string"}
|
45 |
+
TACOTRON2_ID = Tacotron2_Model
|
46 |
+
HIFIGAN_ID = "1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW"
|
47 |
+
#@markdown 选择预处理文本的cleaner
|
48 |
+
text_cleaner = 'japanese_phrase_cleaners'#@param {type:"string"}
|
49 |
+
import pyopenjtalk
|
50 |
+
import soundfile as sf
|
51 |
+
import gradio as gr
|
52 |
+
|
53 |
+
# 全局变量声明
|
54 |
+
model = None
|
55 |
+
hparams = None
|
56 |
+
hifigan = None
|
57 |
+
thisdict = None
|
58 |
+
pronounciation_dictionary = False
|
59 |
+
show_graphs = False # 添加show_graphs变量,并赋予默认值
|
60 |
+
|
61 |
+
# 初始化函数
|
62 |
+
def initialize():
|
63 |
+
global model, hparams, hifigan, thisdict, pronounciation_dictionary
|
64 |
+
|
65 |
+
# 检查是否已初始化
|
66 |
+
try:
|
67 |
+
initialized
|
68 |
+
except NameError:
|
69 |
+
print("Setting up, please wait.\n")
|
70 |
+
|
71 |
+
from tqdm.notebook import tqdm
|
72 |
+
with tqdm(total=5, leave=False) as pbar:
|
73 |
+
import os
|
74 |
+
from os.path import exists, join, basename, splitext
|
75 |
+
git_repo_url = 'https://github.com/CjangCjengh/tacotron2-japanese.git'
|
76 |
+
project_name = splitext(basename(git_repo_url))[0]
|
77 |
+
if not exists(project_name):
    # First run only: fetch the Tacotron2 and HiFi-GAN sources.
    # The URL has to be interpolated with an f-string; as a plain literal the
    # text "{git_repo_url}" was handed to git verbatim and the clone failed.
    os.system(f'git clone -q --recursive {git_repo_url}')
    os.system('git clone -q --recursive https://github.com/SortAnon/hifi-gan')
|
81 |
+
|
82 |
+
pbar.update(1) # downloaded TT2 and HiFi-GAN
|
83 |
+
import sys
|
84 |
+
sys.path.append('hifi-gan')
|
85 |
+
sys.path.append(project_name)
|
86 |
+
import time
|
87 |
+
import matplotlib
|
88 |
+
import matplotlib.pylab as plt
|
89 |
+
import gdown
|
90 |
+
d = 'https://drive.google.com/uc?id='
|
91 |
+
|
92 |
+
# %matplotlib inline
|
93 |
+
import IPython.display as ipd
|
94 |
+
import numpy as np
|
95 |
+
import torch
|
96 |
+
import json
|
97 |
+
from hparams import create_hparams
|
98 |
+
from model import Tacotron2
|
99 |
+
from layers import TacotronSTFT
|
100 |
+
from audio_processing import griffin_lim
|
101 |
+
from text import text_to_sequence
|
102 |
+
from env import AttrDict
|
103 |
+
from meldataset import MAX_WAV_VALUE
|
104 |
+
from models import Generator
|
105 |
+
|
106 |
+
pbar.update(1) # initialized Dependancies
|
107 |
+
|
108 |
+
graph_width = 900
|
109 |
+
graph_height = 360
|
110 |
+
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
    """Draw every image in ``data`` side by side in a single row and show it.

    Args:
        data: Sequence of array-like images accepted by ``Axes.imshow``.
        figsize: Figure size in inches as ``(width, height)``; the default is
            derived from ``graph_width`` and ``graph_height`` divided by 100.
    """
    # squeeze=False keeps ``axes`` a 2-D array even when there is only one
    # panel; without it a single-element ``data`` yields a bare Axes object
    # and indexing it raises TypeError.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for axis, image in zip(axes[0], data):
        axis.imshow(image, aspect='auto', origin='upper',
                    interpolation='none', cmap='inferno')
    fig.canvas.draw()
    plt.show()
|
118 |
+
|
119 |
+
# Set up the pronunciation dictionary
|
120 |
+
os.system('wget https://github.com/wind4000/tacotron2/releases/download/v0.2/merged.dict.txt')
|
121 |
+
thisdict = {}
|
122 |
+
for line in reversed((open('merged.dict.txt', "r").read()).splitlines()):
|
123 |
+
thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip()
|
124 |
+
|
125 |
+
pbar.update(1) # Downloaded and Set up Pronounciation Dictionary
|
126 |
+
|
127 |
+
def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
    """Replace known words in ``text`` with their dictionary pronunciation.

    The text is split on single spaces. Trailing punctuation is peeled off
    each word, the upper-cased remainder is looked up in the module-level
    ``thisdict`` mapping, and a hit is emitted as ``{pronunciation}`` with the
    punctuation re-attached. Words without an entry pass through unchanged.

    Args:
        text: Input line to convert.
        punctuation: Characters treated as trailing punctuation.
        EOS_Token: When true, make sure the result ends with ``";"``.

    Returns:
        The converted string. Empty or space-only input yields ``";"`` when
        ``EOS_Token`` is true (previously this raised IndexError) and ``""``
        otherwise.
    """
    out = ''
    for raw_word in text.split(" "):
        word = raw_word
        end_chars = ''
        # Strip punctuation from the right, but always keep at least one
        # character so a lone punctuation mark is still emitted as a word.
        while len(word) > 1 and word[-1] in punctuation:
            end_chars = word[-1] + end_chars
            word = word[:-1]
        try:
            word_arpa = thisdict[word.upper()]
            word = "{" + str(word_arpa) + "}"
        except KeyError:
            # No dictionary entry: leave the word as written.
            pass
        out = (out + " " + word + end_chars).strip()
    # endswith() is safe on an empty string, unlike out[-1].
    if EOS_Token and not out.endswith(";"):
        out += ";"
    return out
|
141 |
+
|
142 |
+
def get_hifigan(MODEL_ID):
    """Download a HiFi-GAN checkpoint and load its generator onto CUDA.

    Args:
        MODEL_ID: Identifier appended to the download URL prefix ``d``.

    Returns:
        Tuple ``(hifigan, h)``: the generator in eval mode with weight norm
        removed, and the ``AttrDict`` built from ``hifi-gan/config_v1.json``.

    Raises:
        Exception: If the checkpoint file does not exist after the download.
    """
    # Fetch the checkpoint into the working directory.
    checkpoint_path = 'hifimodel'
    gdown.download(d + MODEL_ID, checkpoint_path, quiet=False)
    if not exists(checkpoint_path):
        raise Exception("HiFI-GAN model failed to download!")

    # Read the generator configuration shipped with the hifi-gan checkout.
    config_path = os.path.join("hifi-gan", "config_v1.json")
    with open(config_path) as config_file:
        h = AttrDict(json.load(config_file))
    torch.manual_seed(h.seed)

    # Build the generator and restore its weights.
    device = torch.device("cuda")
    hifigan = Generator(h).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    hifigan.load_state_dict(checkpoint["generator"])
    hifigan.eval()
    hifigan.remove_weight_norm()
    return hifigan, h
|
161 |
+
|
162 |
+
hifigan, h = get_hifigan(HIFIGAN_ID)
|
163 |
+
pbar.update(1) # Downloaded and Set up HiFi-GAN
|
164 |
+
|
165 |
+
def has_MMI(STATE_DICT):
    """Return True if any key of ``STATE_DICT`` contains the substring "mi."."""
    return any("mi." in key for key in STATE_DICT)
|
167 |
+
|
168 |
+
def get_Tactron2(MODEL_ID):
    """Load a Tacotron2 checkpoint from a local path and prepare it for inference.

    Args:
        MODEL_ID: Filesystem path of the checkpoint. The original body ignored
            this argument and read the global ``TACOTRON2_ID`` instead; the
            only visible caller passes that same global, so results match.

    Returns:
        Tuple ``(model, hparams)``: the model moved to CUDA, in eval mode and
        converted to half precision, plus the hyper-parameters used to build it.

    Raises:
        Exception: If the checkpoint path does not exist, or if the state dict
            contains MMI keys (see ``has_MMI``).
    """
    tacotron2_pretrained_model = MODEL_ID
    if not exists(tacotron2_pretrained_model):
        # Nothing is downloaded here, so report the missing path instead of
        # the old "failed to download" text.
        raise Exception(
            f"Tacotron2 model not found at {tacotron2_pretrained_model!r}")

    # Load Tacotron2 and its configuration.
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.max_decoder_steps = 2000  # Max duration, in decoder steps
    # NOTE(review): presumably the stop-gate probability needed before
    # generation ends; the old comment said 25%, which did not match 0.80.
    hparams.gate_threshold = 0.80
    model = Tacotron2(hparams)
    state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
    if has_MMI(state_dict):
        raise Exception("ERROR: This notebook does not currently support MMI models.")
    model.load_state_dict(state_dict)
    model.cuda().eval().half()
    return model, hparams
|
185 |
+
|
186 |
+
model, hparams = get_Tactron2(TACOTRON2_ID)
|
187 |
+
previous_tt2_id = TACOTRON2_ID
|
188 |
+
|
189 |
+
pbar.update(1) # Downloaded and Set up Tacotron2
|
190 |
+
|
191 |
+
# 初始化
|
192 |
+
initialize()
|
193 |
+
|
194 |
+
import soundfile as sf
|
195 |
+
|
196 |
+
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    """Synthesize speech for every non-empty line of ``text``.

    Each line is converted to a symbol sequence, passed through the global
    Tacotron2 ``model`` and the global ``hifigan`` vocoder, written to a
    timestamped WAV file, and displayed through IPython.

    Args:
        text: Input text; lines are separated by a newline character.
        pronounciation_dictionary: When true, lines are first rewritten with
            ``ARPA``; otherwise a trailing ``";"`` is appended if missing.
        show_graphs: When true, plot the post-net mel output and alignments.

    Returns:
        A 1-D tensor holding the audio of all lines in order, scaled by
        ``MAX_WAV_VALUE``, or ``None`` when ``text`` has no non-empty line.
        Previously only the last line's audio was returned, so multi-line
        input silently lost everything before it.
    """
    clips = []  # one waveform tensor per synthesized line
    for line in (x for x in text.split("\n") if len(x)):
        if not pronounciation_dictionary:
            if line[-1] != ";":
                line = line + ";"
        else:
            line = ARPA(line)
        with torch.no_grad():
            sequence = np.array(text_to_sequence(line, [text_cleaner]))[None, :]
            # torch.autograd.Variable is a deprecated no-op wrapper, so the
            # tensor is used directly.
            sequence = torch.from_numpy(sequence).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
            sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
            print(f"音频已保存为 {output_filename}")
            print("")
            ipd.display(ipd.Audio(audio.cpu().numpy().astype("int16"), rate=hparams.sampling_rate))
        clips.append(audio)

    if not clips:
        return None
    if len(clips) == 1:
        # Single-line input: return exactly what the original returned.
        return clips[0]
    # reshape(-1) guards against a 0-d tensor left by squeeze() on a
    # one-sample clip, which torch.cat cannot concatenate.
    return torch.cat([clip.reshape(-1) for clip in clips])
|
220 |
+
|
221 |
+
# Text-to-speech entry point used by the Gradio interface
def text_to_speech(text, max_decoder_steps=2000, gate_threshold=0.5):
    """Synthesize ``text`` and return the path of the written WAV file.

    Args:
        text: Text to synthesize; forwarded to ``end_to_end_infer``.
        max_decoder_steps: Upper bound on decoder steps. Cast to ``int``
            because a UI slider may deliver a float.
        gate_threshold: Stop-gate threshold.

    Returns:
        The output file name, or ``None`` when no audio was produced.
    """
    global model, hparams, hifigan, thisdict, pronounciation_dictionary, show_graphs

    max_decoder_steps = int(max_decoder_steps)
    hparams.max_decoder_steps = max_decoder_steps
    hparams.gate_threshold = gate_threshold
    # NOTE(review): the model is built from hparams before this function runs,
    # so values written to hparams afterwards presumably never reach the
    # decoder. Mirror them onto model.decoder when it exists -- confirm the
    # attribute names against model.py.
    decoder = getattr(model, "decoder", None)
    if decoder is not None:
        decoder.max_decoder_steps = max_decoder_steps
        decoder.gate_threshold = gate_threshold

    output_filename = f"output_{time.strftime('%Y%m%d%H%M%S')}.wav"
    audio = end_to_end_infer(text, pronounciation_dictionary, show_graphs)
    if audio is None:
        return None
    sf.write(output_filename, audio.cpu().numpy().astype('int16'), hparams.sampling_rate)
    return output_filename
|
234 |
+
|
235 |
+
# Gradio interface: a text box plus two sliders feed text_to_speech, and the
# generated WAV is offered as a downloadable file.
inputs = [
    gr.inputs.Textbox(
        lines=3,
        label="输入文本",
    ),
    gr.inputs.Slider(
        minimum=100,
        maximum=5000,
        default=2000,
        step=100,
        label="最大解码步数",
    ),
    gr.inputs.Slider(
        minimum=0.0,
        maximum=1.0,
        default=0.5,
        step=0.05,
        label="门控阈值",
    ),
]
outputs = gr.outputs.File(label="下载生成的音频")

gr.Interface(
    fn=text_to_speech,
    inputs=inputs,
    outputs=outputs,
).launch(debug=True, share=True)
|
Yui_TrapGenesis
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c902e404953f4e52dae8cfc9e63bc673ef7654a4779fc3a461f290f3acaa43c
|
3 |
+
size 338428823
|
__pycache__/audio_processing.cpython-310.pyc
ADDED
Binary file (2.77 kB). View file
|
|
__pycache__/env.cpython-310.pyc
ADDED
Binary file (797 Bytes). View file
|
|
__pycache__/hifiutils.cpython-310.pyc
ADDED
Binary file (2.01 kB). View file
|
|
__pycache__/hparams.cpython-310.pyc
ADDED
Binary file (1.9 kB). View file
|
|
__pycache__/layers.cpython-310.pyc
ADDED
Binary file (3.37 kB). View file
|
|
__pycache__/meldataset.cpython-310.pyc
ADDED
Binary file (5.34 kB). View file
|
|
__pycache__/model.cpython-310.pyc
ADDED
Binary file (14.9 kB). View file
|
|
__pycache__/models.cpython-310.pyc
ADDED
Binary file (8.64 kB). View file
|
|
__pycache__/stft.cpython-310.pyc
ADDED
Binary file (4.77 kB). View file
|
|
__pycache__/utils.cpython-310.pyc
ADDED
Binary file (1.48 kB). View file
|
|
audio_processing.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from scipy.signal import get_window
|
4 |
+
import librosa.util as librosa_util
|
5 |
+
|
6 |
+
|
7 |
+
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Forwarded to `librosa.util.normalize` before squaring the window.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    # `size` is passed by keyword: newer librosa releases make it
    # keyword-only, and older releases accept the keyword as well.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope: overlap-add the squared window at every hop,
    # truncating the last frames at the end of the buffer.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
    return x
|
57 |
+
|
58 |
+
|
59 |
+
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Estimate a signal from spectrogram magnitudes by iterating between
    ``stft_fn.inverse`` and ``stft_fn.transform`` from a random initial phase.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    n_iters: number of transform/inverse refinement rounds

    RETURNS
    -------
    signal: result of the last ``stft_fn.inverse`` call with dim 1 squeezed
    """
    # Random initial phase, wrapped by np.angle
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    # Keep the phase on the same device as the magnitudes; a CPU phase tensor
    # cannot be combined with GPU magnitudes.
    angles = torch.from_numpy(angles).to(magnitudes.device)
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for _ in range(n_iters):
        # Keep the target magnitudes, take only the re-estimated phase.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal
|
76 |
+
|
77 |
+
|
78 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after flooring it at ``clip_val``.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor
    clip_val: lower bound applied to ``x`` before scaling and taking the log
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)
|
85 |
+
|
86 |
+
|
87 |
+
def dynamic_range_decompression(x, C=1):
    """
    Exponentiate ``x`` and divide by the compression factor.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
|
colab-train-zh-cn.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
colab.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data_utils.py
ADDED
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.utils.data
|
5 |
+
|
6 |
+
import layers
|
7 |
+
from utils import load_wav_to_torch, load_filepaths_and_text
|
8 |
+
from text import text_to_sequence
|
9 |
+
|
10 |
+
|
11 |
+
class TextMelLoader(torch.utils.data.Dataset):
    """
        1) loads audio,text pairs
        2) normalizes text and converts them to sequences of symbol ids
        3) computes mel-spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams):
        """Read the file list and configure the STFT from ``hparams``.

        The list is shuffled once, after seeding the global ``random`` module
        with ``hparams.seed``.
        """
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = hparams.load_mel_from_disk
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        """Return ``(text, mel)`` for one ``[audiopath, text, ...]`` entry."""
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel)

    def get_mel(self, filename):
        """Return the mel-spectrogram for ``filename``.

        When ``load_mel_from_disk`` is false the file is loaded as audio and
        the mel is computed; otherwise ``filename`` is loaded with ``np.load``.

        Raises:
            ValueError: the audio sampling rate differs from the STFT's.
            AssertionError: a mel loaded from disk has the wrong channel count.
        """
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                # The message has three placeholders; the file name was
                # missing, which turned this into an IndexError.
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    filename, sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec

    def get_text(self, text):
        """Convert ``text`` to an ``IntTensor`` of symbol ids."""
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
|
65 |
+
|
66 |
+
|
67 |
+
class TextMelCollate():
    """ Zero-pads model inputs and targets based on number of frames per step
    """
    def __init__(self, n_frames_per_step):
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Collate a training batch from normalized text and mel-spectrograms
        PARAMS
        ------
        batch: [text_normalized, mel_normalized]

        RETURNS
        -------
        text_padded, input_lengths, mel_padded, gate_padded, output_lengths,
        with rows ordered by decreasing text length
        """
        batch_size = len(batch)
        step = self.n_frames_per_step

        # Order the samples by text length, longest first
        text_lengths = torch.LongTensor([len(pair[0]) for pair in batch])
        input_lengths, ids_sorted_decreasing = torch.sort(
            text_lengths, dim=0, descending=True)
        max_input_len = input_lengths[0]
        text_padded = torch.LongTensor(batch_size, max_input_len).zero_()

        # Longest mel, rounded up to a multiple of the frames-per-step
        num_mels = batch[0][1].size(0)
        max_target_len = max(pair[1].size(1) for pair in batch)
        remainder = max_target_len % step
        if remainder != 0:
            max_target_len += step - remainder
            assert max_target_len % step == 0

        mel_padded = torch.FloatTensor(batch_size, num_mels, max_target_len).zero_()
        gate_padded = torch.FloatTensor(batch_size, max_target_len).zero_()
        output_lengths = torch.LongTensor(batch_size)

        for row, source in enumerate(ids_sorted_decreasing):
            text, mel = batch[source][0], batch[source][1]
            n_frames = mel.size(1)
            text_padded[row, :text.size(0)] = text
            mel_padded[row, :, :n_frames] = mel
            # gate is 1 from the last real frame through the padding
            gate_padded[row, n_frames - 1:] = 1
            output_lengths[row] = n_frames

        return text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths
|
demo.wav
ADDED
Binary file (148 kB). View file
|
|
distributed.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.distributed as dist
|
3 |
+
from torch.nn.modules import Module
|
4 |
+
from torch.autograd import Variable
|
5 |
+
|
6 |
+
def _flatten_dense_tensors(tensors):
|
7 |
+
"""Flatten dense tensors into a contiguous 1D buffer. Assume tensors are of
|
8 |
+
same dense type.
|
9 |
+
Since inputs are dense, the resulting tensor will be a concatenated 1D
|
10 |
+
buffer. Element-wise operation on this buffer will be equivalent to
|
11 |
+
operating individually.
|
12 |
+
Arguments:
|
13 |
+
tensors (Iterable[Tensor]): dense tensors to flatten.
|
14 |
+
Returns:
|
15 |
+
A contiguous 1D buffer containing input tensors.
|
16 |
+
"""
|
17 |
+
if len(tensors) == 1:
|
18 |
+
return tensors[0].contiguous().view(-1)
|
19 |
+
flat = torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)
|
20 |
+
return flat
|
21 |
+
|
22 |
+
def _unflatten_dense_tensors(flat, tensors):
|
23 |
+
"""View a flat buffer using the sizes of tensors. Assume that tensors are of
|
24 |
+
same dense type, and that flat is given by _flatten_dense_tensors.
|
25 |
+
Arguments:
|
26 |
+
flat (Tensor): flattened dense tensors to unflatten.
|
27 |
+
tensors (Iterable[Tensor]): dense tensors whose sizes will be used to
|
28 |
+
unflatten flat.
|
29 |
+
Returns:
|
30 |
+
Unflattened dense tensors with sizes same as tensors and values from
|
31 |
+
flat.
|
32 |
+
"""
|
33 |
+
outputs = []
|
34 |
+
offset = 0
|
35 |
+
for tensor in tensors:
|
36 |
+
numel = tensor.numel()
|
37 |
+
outputs.append(flat.narrow(0, offset, numel).view_as(tensor))
|
38 |
+
offset += numel
|
39 |
+
return tuple(outputs)
|
40 |
+
|
41 |
+
|
42 |
+
'''
|
43 |
+
This version of DistributedDataParallel is designed to be used in conjunction with the multiproc.py
|
44 |
+
launcher included with this example. It assumes that your run is using multiprocess with 1
|
45 |
+
GPU/process, that the model is on the correct device, and that torch.set_device has been
|
46 |
+
used to set the device.
|
47 |
+
|
48 |
+
Parameters are broadcasted to the other processes on initialization of DistributedDataParallel,
|
49 |
+
and will be allreduced at the finish of the backward pass.
|
50 |
+
'''
|
51 |
+
class DistributedDataParallel(Module):
    """Wrap ``module`` so gradients are averaged across processes.

    On construction every tensor in the wrapped module's ``state_dict`` is
    broadcast from rank 0. A hook registered on each trainable parameter
    queues ``allreduce_params`` on the autograd engine; it all-reduces the
    gradients and divides them by the world size, once per forward pass.
    """

    def __init__(self, module):
        super(DistributedDataParallel, self).__init__()
        # fallback for PyTorch 0.3
        if not hasattr(dist, '_backend'):
            self.warn_on_half = True
        else:
            self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

        self.module = module
        # Defined up front so a backward pass that runs before any forward
        # pass is a no-op instead of an AttributeError in the callback.
        self.needs_reduction = False

        for p in self.module.state_dict().values():
            if not torch.is_tensor(p):
                continue
            dist.broadcast(p, 0)

        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                buckets = {}
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        # Tensor.type() gives the legacy type string, e.g.
                        # 'torch.cuda.HalfTensor'. type(param.data) is plain
                        # torch.Tensor on current PyTorch and would put every
                        # dtype into one bucket.
                        tp = param.data.type()
                        if tp not in buckets:
                            buckets[tp] = []
                        buckets[tp].append(param)
                if self.warn_on_half:
                    if 'torch.cuda.HalfTensor' in buckets:
                        print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                              " It is recommended to use the NCCL backend in this case. This currently requires" +
                              " PyTorch built from top of tree master.")
                        self.warn_on_half = False

                for tp in buckets:
                    bucket = buckets[tp]
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)

        def allreduce_hook(*unused):
            # The engine is reached through Variable, as in
            # apply_gradient_allreduce; parameters no longer expose
            # `_execution_engine` themselves.
            Variable._execution_engine.queue_callback(allreduce_params)

        for param in list(self.module.parameters()):
            if param.requires_grad:
                param.register_hook(allreduce_hook)

    def forward(self, *inputs, **kwargs):
        """Run the wrapped module and mark gradients as needing reduction."""
        self.needs_reduction = True
        return self.module(*inputs, **kwargs)

    '''
    def _sync_buffers(self):
        buffers = list(self.module._all_buffers())
        if len(buffers) > 0:
            # cross-node buffer sync
            flat_buffers = _flatten_dense_tensors(buffers)
            dist.broadcast(flat_buffers, 0)
            for buf, synced in zip(buffers, _unflatten_dense_tensors(flat_buffers, buffers)):
                buf.copy_(synced)
    def train(self, mode=True):
        # Clear NCCL communicator and CUDA event cache of the default group ID,
        # These cache will be recreated at the later call. This is currently a
        # work-around for a potential NCCL deadlock.
        if dist._backend == dist.dist_backend.NCCL:
            dist._clear_group_cache()
        super(DistributedDataParallel, self).train(mode)
        self.module.train(mode)
    '''
|
122 |
+
'''
|
123 |
+
Modifies existing model to do gradient allreduce, but doesn't change class
|
124 |
+
so you don't need "module"
|
125 |
+
'''
|
126 |
+
def apply_gradient_allreduce(module):
    """Attach gradient all-reduce to ``module`` in place and return it.

    Every tensor in the module's ``state_dict`` is broadcast from rank 0.
    A hook on each trainable parameter queues ``allreduce_params`` on the
    autograd engine; it averages the gradients across processes once per
    forward pass (a forward hook sets ``module.needs_reduction``).

    Arguments:
        module: the model to modify; its class is not changed.
    Returns:
        The same ``module`` object.
    """
    if not hasattr(dist, '_backend'):
        module.warn_on_half = True
    else:
        module.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False

    # Defined up front so a backward pass that runs before any forward pass
    # is a no-op instead of an AttributeError in the callback.
    module.needs_reduction = False

    for p in module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        dist.broadcast(p, 0)

    def allreduce_params():
        if module.needs_reduction:
            module.needs_reduction = False
            buckets = {}
            for param in module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = param.data.dtype
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if module.warn_on_half:
                # Buckets are keyed by dtype, so the half check must use a
                # dtype too; the old `torch.cuda.HalfTensor in buckets` test
                # could never match.
                if torch.float16 in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case. This currently requires" +
                          " PyTorch built from top of tree master.")
                    module.warn_on_half = False

            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    def allreduce_hook(*unused):
        Variable._execution_engine.queue_callback(allreduce_params)

    for param in list(module.parameters()):
        if param.requires_grad:
            param.register_hook(allreduce_hook)

    def set_needs_reduction(self, input, output):
        self.needs_reduction = True

    module.register_forward_hook(set_needs_reduction)
    return module
|
env.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
|
4 |
+
|
5 |
+
class AttrDict(dict):
    """A dict whose keys are also readable and writable as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the instance namespace so attribute
        # access and item access share one storage.
        self.__dict__ = self
|
9 |
+
|
10 |
+
|
11 |
+
def build_env(config, config_name, path):
    """Copy the file ``config`` to ``path/config_name``, creating ``path``.

    Nothing is done when ``config`` already equals the target path string.
    """
    target = os.path.join(path, config_name)
    if config == target:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, os.path.join(path, config_name))
|
filelists/transcript_train.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
filelists/transcript_val.txt
ADDED
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
wav/nen001_001.wav|はい?呼びました?
|
2 |
+
wav/nen001_012.wav|ほしな君も
|
3 |
+
wav/nen001_024.wav|さすがに白蛇占いはできませんよ
|
4 |
+
wav/nen001_035.wav|わかりました。ありがとう……ございます
|
5 |
+
wav/nen001_049.wav|んっ、んっ、んくっ……ひっ、あっ、ぁっ、ぁっ、んんーーッ……
|
6 |
+
wav/nen001_060.wav|あああぁぁ……今は、止められなくて……じゅる……はぁ、はぁぁ……あぁぁぁあぁ……
|
7 |
+
wav/nen001_072.wav|ほしな君。珍しいところで会いますね。図書室で何か調べ物ですか?こんな時間まで大変ですね
|
8 |
+
wav/nen002_004.wav|そうですか
|
9 |
+
wav/nen002_018.wav|そうですか……
|
10 |
+
wav/nen002_036.wav|あっ、あれは、違うんです。別に悩みとか、ストレスじゃなくて……じ……事情が……ありまして
|
11 |
+
wav/nen002_051.wav|そんな風に光るなんて、私も初めて見ました一体何をしたんですか?
|
12 |
+
wav/nen002_062.wav|ど、どうして……?一体どこに……欠片が……今まで集めた欠片が……やっぱりさっきの羽根は……
|
13 |
+
wav/nen002_074.wav|それで、あの……気分はどうですか?
|
14 |
+
wav/nen002_089.wav|占ったんです、その高安先輩の交際相手である女の子の恋愛運を
|
15 |
+
wav/nen002_100.wav|そうですね。そういう人も含まれると思います
|
16 |
+
wav/nen002_113.wav|はい。私に、“心の欠片”のことを教えてくれた人に
|
17 |
+
wav/nen003_007.wav|はい、私の知り合いが営んでる喫茶店です
|
18 |
+
wav/nen003_019.wav|諦め?受け入れる?
|
19 |
+
wav/nen003_031.wav|あの……一つ、思ったことがあるんですが……
|
20 |
+
wav/nen003_047.wav|ななおは人間じゃないんです。私が契約を結んだ、アルプなんです
|
21 |
+
wav/nen003_059.wav|楽しそうにしてる時でも、どこか楽しみきれていないと言いますか。そういう気持ちは、私にもありますから
|
22 |
+
wav/nen004_011.wav|は、はい?
|
23 |
+
wav/nen005_008.wav|いえ、まだです。おそらく何もないとは思うんですが、万が一ということもあります
|
24 |
+
wav/nen005_022.wav|おこです。激おこです
|
25 |
+
wav/nen005_035.wav|いえ、困っていることがあって、私に力になれることでしたらお手伝いさせてもらいます
|
26 |
+
wav/nen005_049.wav|そう言われても事実なので
|
27 |
+
wav/nen005_060.wav|そうですね……私では解決できないような依頼も、いくつかありましたね
|
28 |
+
wav/nen005_071.wav|まあ、気は進まないんですけどね……はぁ……
|
29 |
+
wav/nen005_089.wav|ごめんなさい。でも、これが欠片の回収方法なんです
|
30 |
+
wav/nen005_100.wav|ななおに訊いても無駄ですよ。ななおは猫なんですから
|
31 |
+
wav/nen006_005.wav|私も同じです。一般的な意見なら、多少は集まりましたが……
|
32 |
+
wav/nen006_018.wav|わかりました~
|
33 |
+
wav/nen006_030.wav|じゃあ、続けますね
|
34 |
+
wav/nen006_043.wav|ありがとう、ございます……んっ、んんんっ、ひっ、ひっ、ふーーーぅ……ひっ、ひっ、ふーーーーぅ……
|
35 |
+
wav/nen006_056.wav|諦めるのはまだ早いと思います。ここにはまだテクニックが記されていますから
|
36 |
+
wav/nen006_068.wav|え?突然どうしたんですか?ほしな君に謝罪されるようなこと、ありましたか?
|
37 |
+
wav/nen007_006.wav|一部ということは……そうじゃない人には、受け入れてもらえた、ということですか?
|
38 |
+
wav/nen008_009.wav|折を見て、自分の分を買いに行こうと思ってます
|
39 |
+
wav/nen008_021.wav|そ、そうなんですか……?
|
40 |
+
wav/nen008_032.wav|もう1時間ぐらいしてますから
|
41 |
+
wav/nen008_044.wav|いなばさん……ありがとうございます。それでは、お言葉に甘えさせてもらってもいいですか?
|
42 |
+
wav/nen008_056.wav|私たちでリハーサル?
|
43 |
+
wav/nen009_004.wav|はい。川上君の悩みは、本当にデートのことでいいんでしょうか?
|
44 |
+
wav/nen009_016.wav|はい、大丈夫ですよ、時間はまだ10分ほど余裕がありますから
|
45 |
+
wav/nen009_027.wav|そうですね……ほしな君、川上君の予定では映画の後はどうなっていますか?
|
46 |
+
wav/nen009_039.wav|このことは、川上君にも伝えておいた方がいいですね
|
47 |
+
wav/nen009_052.wav|本当ですか?丁度いい機会ですから、いっそ買ってしまうのもいいですね
|
48 |
+
wav/nen009_063.wav|川上君はしっかりプランを組んだりしているんですから、むしろ川上君が嫌がるかもしれませんね
|
49 |
+
wav/nen009_074.wav|私は何でもいいですよ。嫌いな物も特にありませんから
|
50 |
+
wav/nen009_085.wav|あの、これってもう取っていいんですか?
|
51 |
+
wav/nen009_100.wav|あ、甘エビ~♪
|
52 |
+
wav/nen010_007.wav|確かにそうですね。お礼の言葉を言ってもらえたりするのも、とても嬉しいものですからね
|
53 |
+
wav/nen010_021.wav|は、はぁ……はぁ……あり、がとう、ございますぅ……ほしなくんんんっ……
|
54 |
+
wav/nen010_032.wav|はい、あと少し……もう少し……んっ、んひっ、あっ、あっ、あっ……はあぁぁぁー……
|
55 |
+
wav/nen010_045.wav|ふーーー……ふーーー……
|
56 |
+
wav/nen010_057.wav|いえ、そうじゃなくてですね、その………………スースー、しますから
|
57 |
+
wav/nen010_074.wav|はい、どうぞ
|
58 |
+
wav/nen010_086.wav|私たちのオカルト研究部も、元々は黒魔術だったみたいですよ
|
59 |
+
wav/nen010_100.wav|は、はい、大事になる前に誤解をときましょう
|
60 |
+
wav/nen011_008.wav|私が勧めたんです。更衣室で着替えるのを恥ずかしそうにしていたので
|
61 |
+
wav/nen011_019.wav|あ、ダ、ダメですよ、変なところ触っちゃくすぐったいですから
|
62 |
+
wav/nen011_033.wav|どうしたんですか?なにか連絡事項が?
|
63 |
+
wav/nen011_044.wav|ではとがくし先輩の相談は、越路さんを説得すること、でいいんですか?
|
64 |
+
wav/nen012_003.wav|それで、どうでしたか?
|
65 |
+
wav/nen012_014.wav|あの、ほしな君
|
66 |
+
wav/nen012_026.wav|あれだけ反応が弱い欠片ですと、特に
|
67 |
+
wav/nen013_001.wav|もし本当に私の他に魔女がいるとしたら……困ったことになりますね
|
68 |
+
wav/nen013_015.wav|はい、問題ありません
|
69 |
+
wav/nen013_028.wav|ロ、ローター……です………………ローターですよぅ……
|
70 |
+
wav/nen014_001.wav|そうなんですか?どうかしたんですか?
|
71 |
+
wav/nen015_071.wav|それじゃあ今後とも、よろしくお願いします
|
72 |
+
wav/nen015_004.wav|はぁ、それはわかりました。でも、一つだけ答えてくれませんか?気になる事があるんです
|
73 |
+
wav/nen015_016.wav|え?それって、どういうことですか?
|
74 |
+
wav/nen015_031.wav|そんな普通に可愛い服だなんて卑怯ですっ。私なんてこんな恥ずかしい恰好なのにぃ理不尽です~!
|
75 |
+
wav/nen015_043.wav|魔女の契約の代償……と言うことですか
|
76 |
+
wav/nen015_056.wav|しいばさんはああ言ってくれましたが、私は別にこの学院を自分の領土だなんて言うつもりはありません
|
77 |
+
wav/nen016_003.wav|はい。また何か困ったことがあれば、いつでもどうぞ
|
78 |
+
wav/nen016_014.wav|つまり、私たちはこの部室から出ていかなければいけない、ということですか?
|
79 |
+
wav/nen016_027.wav|とにかく運営のすべきことは、ほしな君が言ったことと、先生方との折衝もでしょうか?
|
80 |
+
wav/nen016_039.wav|そういうことでしたら……お願いできますか?
|
81 |
+
wav/nen016_050.wav|なにか問題がありましたか?
|
82 |
+
wav/nen017_002.wav|全員揃っていますね。それじゃ行きましょうか
|
83 |
+
wav/nen017_015.wav|それじゃあ……ここからここまでを、まず完璧に覚えましょう。ここの基礎を覚えてしまえば、次も覚えやすいですから
|
84 |
+
wav/nen017_028.wav|え?なんですか?
|
85 |
+
wav/nen017_041.wav|だ、大丈夫……大丈夫なはず……ええ、絶対大丈夫です……おそらく、きっと、多分
|
86 |
+
wav/nen017_052.wav|確かにそれぐらいの余裕はありますが……
|
87 |
+
wav/nen017_064.wav|はい、お疲れ様でした
|
88 |
+
wav/nen018_012.wav|それじゃあ、一体どうしてですか?
|
89 |
+
wav/nen018_023.wav|大きな欠伸ですね
|
90 |
+
wav/nen018_036.wav|ちょっと皮がむけちゃって、真っ赤になっちゃってますよ
|
91 |
+
wav/nen018_050.wav|ほしな君のことを、応援していますし……それが、応援になるというのでしたら……もう一度
|
92 |
+
wav/nen019_002.wav|はい。よろしくお願いします
|
93 |
+
wav/nen019_013.wav|そうですか、ありがとう……ございます
|
94 |
+
wav/nen019_026.wav|ありがとうございます、しいばさん……言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった言わなきゃよかった
|
95 |
+
wav/nen019_037.wav|予想よりも多くの人に集まってもらえて、準備してきた者としては嬉しい限りです
|
96 |
+
wav/nen020_102.wav|ぷぁ、はぁぁ………………疲れました
|
97 |
+
wav/nen020_114.wav|それに私もほしな君と同じで、あくまで部活の一環ですからね
|
98 |
+
wav/nen020_127.wav|もぅ、どうしてそういうことを言わせるんですか!
|
99 |
+
wav/nen020_139.wav|こちらのことは気にしないでいいんですよ?……こうして欠片が戻ったということは、ほしな君も嫌に思ってるわけじゃないんですよね?
|
100 |
+
wav/nen020_151.wav|はい。ほしな君は気付いていないかもしれませんが、笑顔が以前とは比べ物にならないぐらい自然ですから
|
101 |
+
wav/nen020_162.wav|かもしれません。でも、そういう部活も楽しくていいものですよ
|
102 |
+
wav/nen020_404.wav|ありがとうございます
|
103 |
+
wav/nen101_010.wav|はぁ……
|
104 |
+
wav/nen101_024.wav|いえ、買い物ではないんです。今日は色々疲れてしまったので……
|
105 |
+
wav/nen101_036.wav|いいんですか?
|
106 |
+
wav/nen101_048.wav|ほしな君は、このお店に入ったことがあるんですか?
|
107 |
+
wav/nen101_059.wav|ですが……こうして呪文を唱えなきゃいけないんですよね?とりあえず、初心者はこう頼むべし、って書いてありましたけど
|
108 |
+
wav/nen101_071.wav|えっ……あの、それって……
|
109 |
+
wav/nen101_086.wav|あ、美味しいですね。これがラーメン
|
110 |
+
wav/nen101_099.wav|あのほしな君、早く行きましょう
|
111 |
+
wav/nen101_111.wav|え?いえそんな、お礼を言われるような、大層なことは出来ていませんから
|
112 |
+
wav/nen101_126.wav|はい、さようなら
|
113 |
+
wav/nen102_005.wav|あの……それで、どうしたんですか?突然電話だなんて
|
114 |
+
wav/nen102_018.wav|それにですね、今朝に比べると大分マシにはなっています。ですから、このまま大人しくしていれば平気ですよ
|
115 |
+
wav/nen102_033.wav|どうぞ
|
116 |
+
wav/nen102_046.wav|私が嘘を吐いていないのは、ほしな君ならわかりますよね?
|
117 |
+
wav/nen102_057.wav|ですから、む……夢精をしちゃうような……いやらしい夢を見たんじゃないかなっと
|
118 |
+
wav/nen102_072.wav|私は一人暮らしですから。そういう思い出とは縁遠い生活ですね
|
119 |
+
wav/nen102_087.wav|今度は、ほしな君がおまじないをかける側になって下さい。そしたらきっと、私の恥ずかしさがわかってもらえるはずです
|
120 |
+
wav/nen102_099.wav|ひっ、んっ、んん……ふぅ、ふぅ……んんっ、んふぅ……んん……
|
121 |
+
wav/nen102_111.wav|はい。約束です
|
122 |
+
wav/nen102_124.wav|でも……気分が少しマシになったかもしれない。あのおまじないは効くのかな?
|
123 |
+
wav/nen103_010.wav|だから熱く語らないで下さい、思い出しちゃダメー、手をニギニギさせるのもダメですってばっ
|
124 |
+
wav/nen103_025.wav|私に、む、夢精……とか言わせたくせに、教えてくれないなんてズルいですよぅ!
|
125 |
+
wav/nen103_042.wav|ところで話は変わりますが、何かあったんですか?みんな、普段と様子が違うみたいですが
|
126 |
+
wav/nen103_053.wav|はい。先生が男の人と一緒に歩いているところを見かけましたよ
|
127 |
+
wav/nen103_070.wav|本命の質問だけでなく、無関係なダミーの質問も織り交ぜれば、怪しさも薄くなりませんか?
|
128 |
+
wav/nen103_082.wav|わかりました
|
129 |
+
wav/nen103_095.wav|そうなんですが……見られていないとわかっていても、恥ずかしいんですよぅ、この恰好
|
130 |
+
wav/nen103_106.wav|というよりも……一緒に行っていいですか?実は私もまだ書いていなくて……
|
131 |
+
wav/nen103_123.wav|あ、いえ、その……
|
132 |
+
wav/nen103_141.wav|は、はい?
|
133 |
+
wav/nen103_161.wav|私は、怒られたくないです……
|
134 |
+
wav/nen103_175.wav|あの……正直に言います。最近の私は変なんです
|
135 |
+
wav/nen103_189.wav|い、いえ、そんな風には思っていませんから、平気ですっ
|
136 |
+
wav/nen103_200.wav|それに……こんな私のことを知りたいって言ってくれたこと……嬉しかったです
|
137 |
+
wav/nen103_212.wav|こ、子供っぽいですよね?
|
138 |
+
wav/nen103_227.wav|なぅぅぅぅぅ……ほしな君のことを思うと心が落ち着かない……
|
139 |
+
wav/nen103_240.wav|あっ、うあっ、あぁぁぁぁぁぁぁぁぁぁぁっ
|
140 |
+
wav/nen103_251.wav|はあ、はあ、はぁああぁぁ……なにこれ、こんなにすごいの、しらない……いつもと、全然違う……んっ、ふーっ……ふーっ……
|
141 |
+
wav/nen103_262.wav|はぁ、はぁ、はぁ、はぁはぁはぁぁぁぁぁんっ、ぅぅぅぅぅぅうっ!
|
142 |
+
wav/nen103_273.wav|ひゃんっ……あ、あ、あぁぁぁ……ヤダぁ、止まらない、止まりませんよぉ……あ、あ、はぁぁぁぁ……っ
|
143 |
+
wav/nen104_007.wav|はい?なにが……ですか?
|
144 |
+
wav/nen104_020.wav|だってほしな君が言わせたんじゃないですかぁ
|
145 |
+
wav/nen104_031.wav|そうです。その通りです。い、今でももうおかしくなっているのに、これ以上は……
|
146 |
+
wav/nen104_043.wav|本当にごめんなさい
|
147 |
+
wav/nen104_054.wav|別に大変と言うほどのことは
|
148 |
+
wav/nen104_066.wav|そうなんですか?どうしてこんなにすぐに……いつも通り過ごしていたはずなのに
|
149 |
+
wav/nen104_078.wav|それじゃあ、考えておきます
|
150 |
+
wav/nen104_092.wav|もしよければ、その相手の怪しい行動についても、教えてもらえますか?
|
151 |
+
wav/nen104_106.wav|あっ……ぅっ……
|
152 |
+
wav/nen104_121.wav|な、なんでもないですよぅ。眠れなかったというだけですから
|
153 |
+
wav/nen104_136.wav|普通は引きますよね。一晩中オナニーしちゃうような女の子なんて……
|
154 |
+
wav/nen104_148.wav|そ、それじゃあ皆さん……あっ、んんっ……私は、お先に、失礼させてもらいます、ね……んんっ
|
155 |
+
wav/nen104_161.wav|でも、ダメでした。ちょっと……無理そうです。答えは出そうにありません
|
156 |
+
wav/nen104_173.wav|ほしな君は、私のことを好きって言ってくれてますが私には、ほしな君にも言ってないことが……あるんです
|
157 |
+
wav/nen104_190.wav|濡れて……ます……発情が止まらなくて……ぅぅ……そ、そんな、ヘンタイな私でも好きって言ってくれますか?
|
158 |
+
wav/nen104_203.wav|もう無理です。我慢できません。自分が抑えられなくて………………だから先に謝っておきますね。ごめんなさいっ
|
159 |
+
wav/nen104_214.wav|んふーッ……じゅる、ちゅるるる……れる、れろれろれる……ちゅ、ちゅ……んちゅ
|
160 |
+
wav/nen104_226.wav|んっ、んんんーーーー……ぷぁ、はぁぁぁーー……はぁーっ……はぁーっ……
|
161 |
+
wav/nen104_239.wav|はい。わかりました――んぅっ、あ……あっ、あっ……んんぁ
|
162 |
+
wav/nen104_252.wav|あっ、はぁ、はぁ、はぁ……んんっ、んんんっ……ふーっ、ふーっ……んっ、んんーーっ
|
163 |
+
wav/nen104_264.wav|それは……はぁ、はぁ……んんっ、好きな人にされる方が、気持ちよくて……好きです……
|
164 |
+
wav/nen104_275.wav|んっ、ひぃぃぁぁぁぁぁあああっ
|
165 |
+
wav/nen104_287.wav|あ、あ、あの……そんなに、じっくり見ないで下さい……恥ずかしいんですから……
|
166 |
+
wav/nen104_299.wav|えっ?それは、やっぱり私のそこ、変ってことですか?色々自分で弄っちゃってるから、変なんですか?
|
167 |
+
wav/nen104_310.wav|ひぃぁあっ!そっ、そこっ、は……んっ、んんんっ、あ、あ、あ、あ、ああああああっ
|
168 |
+
wav/nen104_322.wav|ほ、ほしな君は、どうですか?
|
169 |
+
wav/nen104_333.wav|はっ、はっ、ああぁァァああんっ、びりびり、するぅ……はぁ、はぁ、はぁ……奥まで、きてますぅ
|
170 |
+
wav/nen104_344.wav|あああっ、頭、くらくらします……はぁはぁはぁ、ん、んんぅぅーーーーーーッ、もっと呼んでぇ、もっと名前を呼んで下さいぃ
|
171 |
+
wav/nen104_355.wav|きゃ、ぅぁっ……はぁ、はぁ、すごい、出てます、ヌルヌルのが、いっぱいっ
|
172 |
+
wav/nen104_370.wav|あ、あの、それはまた、後日にお願いします
|
173 |
+
wav/nen105_010.wav|い、いいですいいです、そんな仰々しいことっ
|
174 |
+
wav/nen105_026.wav|はい。お疲れ様でした
|
175 |
+
wav/nen105_041.wav|一人暮らし用の冷蔵庫だと小さいですから。野菜室があるタイプに買い換えようかとずっと悩んでいるんですが……
|
176 |
+
wav/nen105_058.wav|それになによりも、好きな人と一緒にいられる時間は私も好きですから
|
177 |
+
wav/nen105_071.wav|はい。頑張って作りますね
|
178 |
+
wav/nen105_087.wav|ん、れろ……れる、えるれろれろ……れるん………んっ、ちゅぅぱ、はぁ、はぁ、はぁっ、あぁんっ
|
179 |
+
wav/nen105_100.wav|あの……別に、そういう行為が嫌というわけじゃないんです。さっき、キスの前に言ったのは本当のことですから
|
180 |
+
wav/nen105_112.wav|ウソツキ……私のしたいこと、ワガママを言ってもいいって……そう言ってくれたじゃないですか
|
181 |
+
wav/nen105_127.wav|あ……あの、もう一度触っていいですか?今度はちゃんと優しく、丁寧に触りますから
|
182 |
+
wav/nen105_139.wav|もし痛かったら言って下さいね。ちゅ、ちゅ……ん……ちゅぅ……んっ、んんっ
|
183 |
+
wav/nen105_150.wav|んちゅ、じゅる……ちゅ、ちゅ……んんんー、舐めても舐めても、全然綺麗になりませんね。むしろ、ますますベトベトになってるような……
|
184 |
+
wav/nen105_162.wav|んぶ……ンッ、ちゅばちゅば……ちゅぶっ、ちゅぶぶ……んんーーっ、じゅるっ……じゅるるるるっ
|
185 |
+
wav/nen105_173.wav|んーー……じゃあ、見えなくしちゃいます……ん、じゅる、じゅるるる……ちゅ、ちゅぅぅぅぅーーー……ッ
|
186 |
+
wav/nen105_184.wav|ん……ッッ!?んっ、ぅぅぅっ……ん、んんーーー……んふぅー……ふぅー……ん、んむぅ……んんっ
|
187 |
+
wav/nen105_195.wav|はぁ……はぁ……んっ、はぁぁぁ……気持ち、よかったですか?
|
188 |
+
wav/nen106_002.wav|ちょっと待って下さいね。私も、最近は確認をしていなかったので
|
189 |
+
wav/nen106_014.wav|でも、予定は大丈夫なんですか?
|
190 |
+
wav/nen106_026.wav|は、はい。そうですね
|
191 |
+
wav/nen106_038.wav|んふぅ、んっじゅるっ、ぬちゅくちゅ……んぁ、はぁ、はぁ、はい。もう少し……はぁぁ、あむぅ……れろれるん、れちょれちょ
|
192 |
+
wav/nen106_050.wav|さようなら。また明日
|
193 |
+
wav/nen106_065.wav|いつも歩いている道ですから。それに、なるべく明るくて人気のあるところを通ります。大丈夫ですよ
|
194 |
+
wav/nen106_078.wav|すみません、気を遣わせてしまいまして。でも、本当にそれだけなので、心配は必要ありませんよ
|
195 |
+
wav/nen106_090.wav|あの、ちょっと待って下さい
|
196 |
+
wav/nen106_101.wav|そう言ってもらえて嬉しいです
|
197 |
+
wav/nen106_116.wav|は、はい。もちろんです……私も、大好きな人とキスしたい、です……
|
198 |
+
wav/nen106_129.wav|え?
|
199 |
+
wav/nen106_150.wav|それじゃあ、今日は失礼しますね
|
200 |
+
wav/nen107_003.wav|それじゃあ、お疲れ様でした。さようなら
|
201 |
+
wav/nen107_019.wav|そう、ですよね……今みたいな状態を続けても……仕方ないですよね
|
202 |
+
wav/nen107_035.wav|でも、でも……
|
203 |
+
wav/nen107_051.wav|それよりも、結局どうなんですか?私の気持ち、ちゃんと感じてもらえてますか?
|
204 |
+
wav/nen107_063.wav|それは……うっ……ぅぅぅ~~~……恥ずかしい、ですけど……今は、この温もりに包まれていたいです。そっちの方が重要です
|
205 |
+
wav/nen108_011.wav|でも急に泊まってもらうことになって……親御さんにもご迷惑を……
|
206 |
+
wav/nen108_024.wav|そうなんですか?えっと……気付いていませんでした。むしろ、私の方が甘えちゃっていますから……
|
207 |
+
wav/nen108_044.wav|んんっ、ふーっ……ふーっ……
|
208 |
+
wav/nen108_056.wav|授業に身が入らなくて……ず、ずっと、考えてたら……はぁ……はぁ……だ、だから……はぁ、はぁ、はぁ、はぁ
|
209 |
+
wav/nen108_069.wav|それは、だから……下のお口、ですとか……他にもありませんか?
|
210 |
+
wav/nen108_080.wav|ひあぁぁああぁぁああっ、それ、しび、れる……からだ、痺れちゃうっ、あ、あ、あああああ、そこ、吸うの、あっ、あああっ
|
211 |
+
wav/nen108_091.wav|ひゃあああぁっ、そんな、おま●こ全部を吸われたらぁ……あ、あ、あ、あ、我慢できませんっ、もう熱いですぅ、身体が熱くて仕方ないんです
|
212 |
+
wav/nen108_103.wav|ぁぁ……はぁー、はぁー……あ、これぇ、奥まで感じます……んぁぁ、はぁー……はぁー……
|
213 |
+
wav/nen108_114.wav|ひっ、ひああぁぁぁああぁ、それ、それ凄いですぅ……はぁはぁはぁ、あああぁぁあああぁっ
|
214 |
+
wav/nen108_125.wav|あっ、ああっ……やだぁ、エッチな音、してます……私の、おま●こから、エッチな音が……あっあっあっ、でも、我慢できなくてっ
|
215 |
+
wav/nen108_136.wav|はぁ、はぁ……はぁぁぁ……もう、ドロドロですよ……
|
216 |
+
wav/nen108_152.wav|それは……はい。確かにそういう気持ちはあります……
|
217 |
+
wav/nen108_168.wav|浮かない表情をしていました……
|
218 |
+
wav/nen108_181.wav|それは……どういう意味ですか?
|
219 |
+
wav/nen109_011.wav|そうですね……カラオケに、ボウリング、プリクラも……
|
220 |
+
wav/nen109_025.wav|はい……それじゃあ、えっと、えっと……
|
221 |
+
wav/nen109_040.wav|ここがいいでしょうか……それともこっち?
|
222 |
+
wav/nen109_052.wav|まだ色々やりたいことはあります、それは尽きませんけど………………でも本当に、後悔はしてませんよ
|
223 |
+
wav/nen109_069.wav|はい。私、幸せになります。それで、しゅうじ君のことも幸せにしてみせます
|
224 |
+
wav/kne110_008.wav|メッセージ……
|
225 |
+
wav/kne110_026.wav|こんな……形だけにこだわった物じゃないんです……でも、それはもう……無理なんですよね
|
226 |
+
wav/kne110_044.wav|はい。優しそうな人ですから
|
227 |
+
wav/nen110_013.wav|ぅっ……ぁぁ……ダメ……泣いたり、しない
|
228 |
+
wav/nen111_006.wav|言いたいこと……ですか?
|
229 |
+
wav/nen111_019.wav|ギターが欲しいんですよね?
|
230 |
+
wav/nen111_033.wav|やっぱり、未来が変わっちゃってるんですよね……
|
231 |
+
wav/nen111_047.wav|それは、えっと………………
|
232 |
+
wav/nen111_062.wav|で、ですから……わ、私の……オナニー………………オナニーですっ
|
233 |
+
wav/nen111_079.wav|は、はい。大丈夫です。すみません、驚かせてしまいまして
|
234 |
+
wav/nen111_092.wav|ひぁっ、ぅぅぅ~~~
|
235 |
+
wav/nen111_105.wav|保健室に行きますか?
|
236 |
+
wav/nen111_124.wav|好き……好きです、大好きです……私は貴方のことが大好きです。愛しています。もう離れたりしません
|
237 |
+
wav/nen112_011.wav|ほしな君はちゃんと以前から、力になってくれていましたよ
|
238 |
+
wav/nen112_029.wav|私だって嬉しいです。ほしな君が一緒にいてくれて……その、単純に近い場所にいてくれるってことじゃなくてですね
|
239 |
+
wav/nen112_043.wav|それで、いなばさんは……相談でいいんですよね?
|
240 |
+
wav/nen112_054.wav|少し考える時間をもらえますか?
|
241 |
+
wav/nen112_068.wav|いえ、そんなことはありません。私も嬉しいですよ
|
242 |
+
wav/nen112_080.wav|あ、あの、なんだか凄い騒ぎになってるみたいですけど……
|
243 |
+
wav/nen112_094.wav|時と場所さえ考えてもらえれ��……私も、や……やぶさかではありませんが……え?え?も、もしかして今日って、そういうことなんですか?
|
244 |
+
wav/nen112_108.wav|そ、そうですね。見つかったらデートできなくなってしまいますよね
|
245 |
+
wav/nen112_121.wav|もぅっ!そんなに連続して呼ばれたら、嬉しすぎておかしくなっちゃいますよぅ
|
246 |
+
wav/nen112_133.wav|だって美味しいじゃないですか。それにほら、見た目も可愛いです
|
247 |
+
wav/nen112_145.wav|はぁ……美味しかったです
|
248 |
+
wav/nen112_159.wav|自分の身体なんですから、当たってることぐらい気付いてます……わかってはいますが……抱きついていたいんです
|
249 |
+
wav/nen112_173.wav|前は私のしたいことするデートでしたが……今回はしゅうじ君が私のために計画してくれたデートで、どこに行くのかドキドキして
|
250 |
+
wav/nen112_188.wav|私はしゅうじ君のこと、嫌いになったりなんてしないのに
|
251 |
+
wav/nen112_204.wav|お、お邪魔します
|
252 |
+
wav/nen112_217.wav|それにしても、しゅうじ君はお父さんとあんな風に喋るんですね。ちょっと、意外でした
|
253 |
+
wav/nen112_229.wav|いえ、平気です
|
254 |
+
wav/nen112_243.wav|お、女の子だって興奮とか、期待とか、もにょもにょしちゃうものなんですよぅ……
|
255 |
+
wav/nen112_259.wav|わ、わかりました……
|
256 |
+
wav/nen112_270.wav|は、はい……ぅぅぅぁッ……はっ、はぁー、はぁー……お願いします、続けて下さい……もっと、触って
|
257 |
+
wav/nen112_281.wav|あ、あ、あ、また……やっ、そんなに強く捻っちゃ……ひぁっ、んぃぃ……ッッ
|
258 |
+
wav/nen112_293.wav|あぁぁ、んぁああぁぁ……ッッ、2回、2回です……んっ、んんぅぅぅぁぁぁあッ、あっ、あっ、あああッッ
|
259 |
+
wav/nen112_304.wav|だ、だって……5回だなんて……恥ずかしいです。凄くエッチですから……
|
260 |
+
wav/nen112_315.wav|ぅぅ……また、そうやって全部言わせて……本当にイジワルですよぅ……
|
261 |
+
wav/nen112_327.wav|熱くて……硬くて……はぁ、はぁ、ぁぁぁあっ……前より太くて、おっきい気が、しますぅっ
|
262 |
+
wav/nen112_339.wav|ちゅっ、んん、ふぅぅ……んっ、んっ、んんぅぅぅぅ……ぅぅーーッ
|
263 |
+
wav/nen112_350.wav|だって、だって……んっ、ぅぅぅっ……こ、こんなに、グリグリされたら、こんな声も出ちゃいますよぅ……あっ、はぁはぁはぁはぁ
|
264 |
+
wav/nen112_361.wav|あ、はぁぁぁむ、んちゅ……ちゅ、ちゅ、ちゅ、じゅるる……んちゅ、ちゅぅぅーー……ん、んむぅ、んっ、じゅるる
|
265 |
+
wav/nen112_372.wav|私も……こんなにイってしまったのは、初めてです……やっぱりオナニーとは、全然違いますね……はっ、はぁぁ……
|
266 |
+
wav/nen112_386.wav|いえ、もう起きます
|
267 |
+
wav/nen113_171.wav|わ……わかりました……それなら……私、命令通りに、オナニーします
|
268 |
+
wav/nen113_182.wav|ひぁぁ!は、はい、はいっ……んんっ、んんぅぅ……ぅぅあっ、あっ、あっ、あっ
|
269 |
+
wav/nen113_193.wav|ちがっ、違うんです……お漏らしじゃなくて……ああ、もう……どうしてこんなにビショビショなの?まだ、乳首を刺激してるだけなのに
|
270 |
+
wav/nen113_205.wav|はぁ、はぁ、こ、ここら辺ですか?もう当たりますか?
|
271 |
+
wav/nen113_216.wav|ぁっ、ぁっ、ぁっ、ぅぅあっ、なにこれ……ダメっ、ダメっ……あっ、あっ、あっ、ぁぁぁああぁぁあ、イく……イっちゃう
|
272 |
+
wav/nen113_227.wav|はぁーっ……はぁーっ……気持ちいい、です。クリトリス、気持ちいい……
|
273 |
+
wav/nen113_238.wav|んんんっ!んぁっ、んぁっ、ダメ……手が、震えて、あっ、あっ!ローター……当てていられない……あっ、あっ
|
274 |
+
wav/nen113_249.wav|はっ、はひっ、あっ、あっ、あっ!イ、イく……もう、わらひ、我慢できませんよ……ああっ、あっ、あーーーーッ!
|
275 |
+
wav/nen113_260.wav|え?あ、ちょっと待って下さい……あっ……
|
276 |
+
wav/nen113_272.wav|わかりました。それじゃあ遠慮せず、沢山イきますね……はぁ、はぁ……
|
277 |
+
wav/nen113_284.wav|それに動きたいんですよね?気持ちよくなりたいんですよね?さっきから、わたしの中でおち●ちんがビクビク、してますよ
|
278 |
+
wav/nen113_295.wav|はぁ……はぁ……はぁ……はぁ……あっ……あっ……あっ、ああああぁぁぁぁぁぁぁぁぁああああああーーーーー!!
|
279 |
+
wav/nen113_306.wav|んふぅ……んっんっんんぅぅぅぁああ!はぁ!はぁ!あああっ、んんんーーーーー……んんぁぁああっ!
|
280 |
+
wav/nen113_317.wav|んひっ、あっ!あっ!はぁぁ……まだ、出てる……あっ、あっ、あっ、はぁぁ……ん、んんっ���
|
281 |
+
wav/nen113_328.wav|ちょっと?
|
282 |
+
wav/nen113_006.wav|そんなことありませんよ。さあ、遠慮せずに中に入って下さい
|
283 |
+
wav/nen113_017.wav|あのー……
|
284 |
+
wav/nen113_031.wav|さ、参考……ですか?川上君が考えたデートプランを実際に試してみる、とかじゃなく?
|
285 |
+
wav/nen113_044.wav|私はゲームセンターも好きですよ。普段は全然入ったこともありませんから、むしろ楽しみなぐらいです
|
286 |
+
wav/nen113_056.wav|あっ、しゅうじ君。あっちにもほら、クマのぬいぐるみがありますよ
|
287 |
+
wav/nen113_067.wav|私のことを考えてくれたからこそ、思い出の方を優先してくれたんですよね?
|
288 |
+
wav/nen113_081.wav|そうですね。特別やレアって言われてしまうと、試しに頼んでみたくなりますね
|
289 |
+
wav/nen113_096.wav|なにか違うこと考えてます
|
290 |
+
wav/nen113_107.wav|いえ、もうジュースが無くなっちゃいましたから……
|
291 |
+
wav/nen113_118.wav|でも……いつもよりは、疲れましたよね?
|
292 |
+
wav/nen113_132.wav|た、確かに……そうですね
|
293 |
+
wav/nen113_145.wav|それならいいんですが……
|
294 |
+
wav/nen113_162.wav|どっ、どうやってって
|
295 |
+
wav/nen114_017.wav|いえ。むしろ、こちらこそすみません。不透明な活動ばかりで……もっと結果が残るような物があればご迷惑もおかけしなかったんですが……
|
296 |
+
wav/nen114_028.wav|それに、パーティーで演奏しないとかりやさんはギターを披露できず、モヤモヤしたままになりませんか?
|
297 |
+
wav/nen114_042.wav|そっ、その言い方は……卑怯ですよぅ
|
298 |
+
wav/nen114_056.wav|そこも気になる部分ではあるんですが……
|
299 |
+
wav/nen114_072.wav|しゅうじ君を待っていたんです。最近、一緒にいられる時間が少ない気がして……なんとかしたいなと思って、終わるのを待ってたんです
|
300 |
+
wav/nen114_083.wav|女の子同士でもですか?
|
301 |
+
wav/nen114_097.wav|はい、できました
|
302 |
+
wav/nen114_111.wav|物じゃなくてですね、あの……ですから……しゅうじ君の願い事を、なんでも叶えます、私が
|
303 |
+
wav/nen114_124.wav|ダメです
|
304 |
+
wav/nen114_135.wav|んっ、んんーーーッ……んふぅ、ふぅー……ふぅー……んっ、んんっ、んむぅ……んぅ……も、もっと……しゅうじ君、もっと……
|
305 |
+
wav/nen114_146.wav|んぷぁぁっ、はっ、はぁ……はぁ……はひっ、んぁぁあ……はぁぁぁ……
|
306 |
+
wav/nen115_007.wav|でもその前に、私たちの演奏を聞いて下さい。一生懸命練習してきましたから
|
307 |
+
wav/nen115_021.wav|しゅうじ君は……誰に投票したんですか?
|
308 |
+
wav/nen115_037.wav|はぁ、ぁぁむ……ん、んんっ、ちゅちゅ……じゅる、ちゅぱちゅる、んっ、んんんんんーーーーーー
|
309 |
+
wav/nen115_049.wav|ひっ!?あっ、あっ、あああぁぁーーっ!
|
310 |
+
wav/nen115_061.wav|ずっとオナニー我慢してて……ぁぁぁぁあっ!しゅうじくん、しゅうじくん……っ、はぁ、はぁ、はぁぁあぁっ
|
311 |
+
wav/nen115_072.wav|ふぇぇ……?はっ、はぁ、はぁ、はぁ……ど、どうかしたんですか……?
|
312 |
+
wav/nen115_083.wav|好き、あっ、あっ、あっ、ひゅきでひゅ……おち●ちんにグリグリされるの……あっ、あっ、ああぁぁああっ!
|
313 |
+
wav/nen115_094.wav|あーー……あはーーー……はひ、はひっ……んへぁぁ……私、こんな下品な声を出してイっちゃった……はぁーっ……はぁーっ……
|
314 |
+
wav/nen115_106.wav|んっ、んっ、んんーーーーっ!はひっ、はひっ、んっ、んんんーーーーッ!
|
315 |
+
wav/nen115_118.wav|んひっ、あっ、あっ、んんっ、んんぁぁあっ、はぁーっ……はぁーっ……あっ、あっ、はぁぁぁぁぁ……
|
316 |
+
wav/nen115_129.wav|それに……こんなの、まるでおち●ちんが、私から生えたみたいです。しかも硬いままで……
|
317 |
+
wav/nen115_144.wav|んー……こんなものでしょうか
|
318 |
+
wav/nen115_156.wav|お願い?
|
319 |
+
wav/nen115_168.wav|んっ、ぅうぅ……はぁ、はぁ……んんっ、んんん……
|
320 |
+
wav/nen115_180.wav|はっ、んっ、んんぁっ、んぁっ……ぁぁあぁああぁ……引っかかるの、気持ちいい、です……んんー……ッッ
|
321 |
+
wav/nen115_191.wav|はぁー……はぁー……はぁー……ぁぁぁ、んんんっ……
|
322 |
+
wav/nen115_202.wav|あっ、あっ、あーーーっ……中、中が切なくて……はぁ、はぁ、はぁ、あの、もうオナニーじゃなくなってもいいですか?
|
323 |
+
wav/nen115_214.wav|だって……んぁぁ、ずっと待ってたんです。欲しくて、我慢してたんです……だから、仕方ないんですよ、ぁぁぁ……
|
324 |
+
wav/nen115_225.wav|違う、違うのぉ……身体が勝手に……ん、ん、ん、んぁぁあーーーぁぁぁぁ……こひゅれてる、気持ちいいの、こひゅれてるぅ
|
325 |
+
wav/nen115_236.wav|おま●こですっ、おま●こに欲しい……んっ、んぁ……精液、こっちで飲みたいんです、んぁ、んぁ、んぁーーっ!
|
326 |
+
wav/nen115_249.wav|え、えぇぇ……ま、まだ足りないんですか?こんなにドロドロにしたのに……
|
327 |
+
wav/nen115_262.wav|そう言ってもらえると……ありがとうございます
|
328 |
+
wav/nen116_001.wav|はい
|
329 |
+
wav/nen116_012.wav|私は……別に流されてもいいのに……
|
330 |
+
wav/nen116_026.wav|私にできることがあるなら、何でもします。だから、1人で苦しまないで下さい
|
331 |
+
wav/nen116_039.wav|はい、大丈夫です
|
332 |
+
wav/nen117_007.wav|どうしてそういうことを言うんですか!私の好きな人なのに!
|
333 |
+
wav/nen117_020.wav|あとですね、せっかくですからお泊まり用具の他にも色々用意してきたんです
|
334 |
+
wav/nen117_034.wav|あの、お風呂頂きました。お……お待たせ……しました、しゅうじ君
|
335 |
+
wav/nen117_047.wav|せっかく気合いを入れて身体も綺麗にしたのに……先に寝ちゃうなんてひどいです
|
336 |
+
wav/nen117_058.wav|よかった、安心しました
|
337 |
+
wav/nen117_069.wav|んぅぅ……ちゅ、ちゅ、んんんんーーッ……嫌じゃないですよ?むしろ……私は濃い方が好きかもしれません……ん、じゅる、じゅるりっ
|
338 |
+
wav/nen117_080.wav|じゃあ、続けますね。ん、ちゅ、ちゅぶぶ……んっ、じゅるっ、じゅぽじゅぽ、ちゅ、ちゅるるっ
|
339 |
+
wav/nen117_091.wav|んふぅ……ほら、こうして正直に教えてくれます、気持ちいいって
|
340 |
+
wav/nen117_103.wav|はぁ、はぁ、はぁ……すごい、トロトロと匂いが、さっきから止まりません……ああ、全然綺麗にできない
|
341 |
+
wav/nen117_114.wav|んぐっ……んぶ、んぶ……ッ……んんんんーーーーーッ!ん、んんーーーー……コク……コク……ん、んんんむぅ
|
342 |
+
wav/nen117_125.wav|ひゃっ、たくさん……あつい精液、びゅーって飛んで……あ、きゃっ、ひゃっ
|
343 |
+
wav/nen117_136.wav|ん、ちゅば、ちゅば……んんっ、れろれろ……ンンッ……はぁ、はぁ……れりょれりょ
|
344 |
+
wav/nen117_147.wav|んっ、んんっ、あむあむ……ぢゅぷ、ぢゅるるる……んぽくぽ、じゅるるるっ
|
345 |
+
wav/nen117_158.wav|あっ!ダメですよ、これは罰なのに、あ、きゃぁぁぁッ
|
346 |
+
wav/nen117_171.wav|んっ、んんんぁぁぁぁーーーーーーーー……ッッ!
|
347 |
+
wav/nen117_183.wav|あ、あ、ああーーーーっ……はぁ、はぁ……あ、あ、あ、それ、すごい……すごいぃぃ……んんぁあッッ
|
348 |
+
wav/nen117_194.wav|あっ、ひっ、んひぃぃッ……あーっ、あーっ……もうらめぇ…あ、あ、あ、イく、いっっ……くぅぅぅぅーーーーーぅぅぅぅううううッッ!!
|
349 |
+
wav/nen117_206.wav|んっ、あっ、あっ、あっ、あっ……そうなんですか?わたし、もうちゃんと、覚えてるんですか?
|
350 |
+
wav/nen117_217.wav|イっちゃうっ、わたひまたイっちゃうぅぅ……ッ
|
351 |
+
wav/nen117_228.wav|かひっ、かっ、はぁ、はぁ……んんんっ……はぁ、はぁ、んんっ、んぁ……ぁぁぁぁ……
|
352 |
+
wav/nen117_239.wav|それは、ちがっ、えっと、あががががががががががが――
|
353 |
+
wav/nen203_010.wav|はい、それは残念ながら
|
354 |
+
wav/nen203_025.wav|ご協力ありがとうございます。それは思い至ってませんでした、助かりました
|
355 |
+
wav/nen203_040.wav|心を許しあえるような相手が出来れば、おそらくは
|
356 |
+
wav/nen203_053.wav|すみません……明日もこうでないといいんですが……
|
357 |
+
wav/nen203_065.wav|あの、どうかしたんですか?いなばさん
|
358 |
+
wav/nen203_080.wav|それはたぶん、昨日話をした、胸の痛みに関わることなんですよね
|
359 |
+
wav/nen203_095.wav|占いなんて、あくまでも切っ掛けみたいなものですから
|
360 |
+
wav/nen203_111.wav|あ、あの、優しくしてください……それと、電気を消して……お願いです……
|
361 |
+
wav/nen203_127.wav|せっかくですし、一緒に入りませんか?
|
362 |
+
wav/nen204_006.wav|では、今日はこの辺りで解散にしましょうか
|
363 |
+
wav/nen205_018.wav|それでですね、ほしな君
|
364 |
+
wav/nen206_007.wav|ええ、ちょっと
|
365 |
+
wav/nen206_022.wav|そうですね。少なくとも、自分のせいっていうのはいなばさんの誤解かも知れませんし
|
366 |
+
wav/nen206_033.wav|やりとりをオープンにした方が、互いに痛くもない腹を探り合わないで済むと思います
|
367 |
+
wav/nen206_048.wav|もし、木月さんの行方が知れなくなったのが、魔法や契約と絡むことなら――
|
368 |
+
wav/nen206_063.wav|だから学院にも、なにも……
|
369 |
+
wav/nen207_016.wav|座りましたっ
|
370 |
+
wav/nen207_031.wav|え?そ、それはもちろんですけど
|
371 |
+
wav/nen209_001.wav|こんにちは
|
372 |
+
wav/nen210_009.wav|とりあえず……ほしな君にそ��、想定外に下着まで見せてしまったんですよね?
|
373 |
+
wav/nen210_023.wav|放課後、ななおのところまで付き合ってもらえませんか?
|
374 |
+
wav/nen210_039.wav|お待たせしました
|
375 |
+
wav/nen211_004.wav|はい。ですからほしな君の中には今、魔女2人のものである欠片がそれぞれにあります
|
376 |
+
wav/nen211_015.wav|そして、こうなってしまったものは仕方がありませんし、回収不可能なわけでもないんですから
|
377 |
+
wav/nen212_001.wav|う、上手くいったんですか?
|
378 |
+
wav/nen212_015.wav|はい、おかげさまで
|
379 |
+
wav/nen213_011.wav|生まれつき備えてしまっていた、あの能力のせいで
|
380 |
+
wav/nen213_025.wav|はあ……せ、交尾ですか
|
381 |
+
wav/nen214_010.wav|い、いえっなんでもっ
|
382 |
+
wav/nen215_012.wav|それもわかりますけど
|
383 |
+
wav/nen217_006.wav|とがくし先輩、その――
|
384 |
+
wav/nen218_009.wav|そこはまた、ご協力いただければ助かります
|
385 |
+
wav/nen219_005.wav|ハッピーハロウィンですね、いなばさん
|
386 |
+
wav/nen301_006.wav|ええ。私の方は、あともう少しで溜まりますから
|
387 |
+
wav/nen301_017.wav|はい、頑張ります
|
388 |
+
wav/nen302_010.wav|知っている方なんですか、2人とも?
|
389 |
+
wav/nen303_003.wav|なるほど。だったら、しいばさんはあまり近づき過ぎない方がいいかもしれません
|
390 |
+
wav/nen303_014.wav|はい、どうやらほしな君の心の穴が広がってしまった可能性がありそうです
|
391 |
+
wav/nen303_030.wav|いいんです、ほしな君が吸収してしまった分なら、ほとんど回収した後ですし
|
392 |
+
wav/nen303_045.wav|ほしな君の心の穴を埋めるのも、しいばさんにお任せした方が効率的かもしれません
|
393 |
+
wav/nen305_004.wav|こ、交尾をされたわけではないですよね?
|
394 |
+
wav/nen307_005.wav|もっとも、ほしな君が誘ったのはしいばさんです。しいばさん次第だと思いますが
|
395 |
+
wav/nen308_007.wav|ですがしばらくの間、話し相手になることにしました
|
396 |
+
wav/nen310_006.wav|いいんじゃないでしょうか?
|
397 |
+
wav/nen312_003.wav|どうかしましたか?ほしな君もまだ来てないようですし、気になっていたんですが
|
398 |
+
wav/nen312_014.wav|いえ、私も何も聞いていませんが
|
399 |
+
wav/nen314_002.wav|ありがとうございます
|
400 |
+
wav/nen314_016.wav|魔女を常に見張る者が多いそうです、心当たりはありませんか?
|
401 |
+
wav/nen314_027.wav|すると心を強引に削り取った痕がみつかったんです!
|
402 |
+
wav/nen315_002.wav|え、ええ
|
403 |
+
wav/nen315_013.wav|はい、ですがこの場合、欠片は犯人から奪い返せばいいんです
|
404 |
+
wav/nen315_024.wav|見つけ出すだけでも、なかなか骨が折れそうですが
|
405 |
+
wav/nen316_003.wav|別のアルプがいるなら、匂いでわかるというのですが
|
406 |
+
wav/nen317_008.wav|いえ、厚真さんが預かっていた子犬も、行方がわからなくなっているのを思い出したんですが
|
407 |
+
wav/nen319_005.wav|人間に見えても、ぼんやりしないでしっかり警戒を
|
408 |
+
wav/nen401_006.wav|ふー……ふー………………はぁ、美味しい
|
409 |
+
wav/nen402_007.wav|はい
|
410 |
+
wav/nen402_020.wav|ちょっと思いつきませんね
|
411 |
+
wav/nen404_003.wav|もし何かあるなら休んでくれてもいいんですよ?
|
412 |
+
wav/nen404_014.wav|私に対する罪悪感といいますか、義務感と言いますか……それはきっと同情に近い感情ですから……
|
413 |
+
wav/nen405_002.wav|ほしな君。ああいうのは、どうかと思います
|
414 |
+
wav/nen405_013.wav|はい、何ですか?
|
415 |
+
wav/nen405_024.wav|いえ、今日は仕方ありませんよ。相談だけじゃなく、占いを希望する人も来ませんでしたからね
|
416 |
+
wav/nen406_010.wav|欠片が戻ってきたのは、ほしな君がとがくし先輩とお付き合いをするようになったからだと思うんです
|
417 |
+
wav/nen406_021.wav|それに……これはあくまで、責めるつもりではなく、色んな人の相談を受けて思った個人的な意見なんですが
|
418 |
+
wav/nen409_003.wav|あ、ほしな君
|
419 |
+
wav/nen409_014.wav|魔力の塊をぶつけることで、多少のショックを与えるかもしれないそうですが、先輩の心にひどい影響を与えるものじゃないそうです
|
420 |
+
wav/nen409_025.wav|私は、この弾丸を撃てばいいわけですね
|
421 |
+
wav/nen409_038.wav|それでは
|
422 |
+
wav/nen410_010.wav|それは、ほしな君がオカ研で頑張ってくれた分で相殺です。実際、今のこの欠片の量は、私がほしな君と出会う前より、ほんの少し少ないだけですから
|
423 |
+
wav/nen410_022.wav|学院内ではあれほどダメだって言ってるじゃないですか
|
424 |
+
wav/nen504_001.wav|ほしな君、調子はどうですか?
|
425 |
+
wav/nen505_008.wav|えっと……こ、ここは、励まし会とか開いた方がいいんでしょうか?
|
426 |
+
wav/nen507_009.wav|なのに、部活を続けたりしたら、擦れ違いですとか、そういうこと���心配になって
|
hifiutils.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import glob
|
2 |
+
import os
|
3 |
+
import matplotlib
|
4 |
+
import torch
|
5 |
+
from torch.nn.utils import weight_norm
|
6 |
+
matplotlib.use("Agg")
|
7 |
+
import matplotlib.pylab as plt
|
8 |
+
|
9 |
+
|
10 |
+
def plot_spectrogram(spectrogram):
    """Render a 2-D spectrogram array as an image and return the figure.

    The array is drawn with ``imshow`` (origin at the bottom, no
    interpolation) next to a colorbar. The canvas is drawn and the current
    pyplot figure is closed before the figure object is handed back.
    """
    figure, axis = plt.subplots(figsize=(10, 2))
    image = axis.imshow(
        spectrogram,
        aspect="auto",
        origin="lower",
        interpolation='none',
    )
    plt.colorbar(image, ax=axis)

    # Force a render so the returned figure carries pixel data, then release
    # it from pyplot's figure registry.
    figure.canvas.draw()
    plt.close()

    return figure
|
20 |
+
|
21 |
+
|
22 |
+
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weights of convolution-like modules in place.

    A module is treated as a convolution when its class name contains the
    substring ``"Conv"``; its ``weight`` is then filled with samples from a
    normal distribution N(mean, std). Other modules are left untouched.
    """
    if "Conv" not in type(m).__name__:
        return
    m.weight.data.normal_(mean, std)
|
26 |
+
|
27 |
+
|
28 |
+
def apply_weight_norm(m):
    """Wrap convolution-like modules with weight normalisation.

    Modules whose class name contains ``"Conv"`` are passed to
    ``torch.nn.utils.weight_norm``; everything else is ignored.
    """
    is_conv = "Conv" in type(m).__name__
    if is_conv:
        weight_norm(m)
|
32 |
+
|
33 |
+
|
34 |
+
def get_padding(kernel_size, dilation=1):
    """Return half of the dilated kernel span, truncated to an int.

    Computes ``int(dilation * (kernel_size - 1) / 2)``, the per-side padding
    typically used to keep a stride-1 convolution's output length unchanged.
    """
    dilated_span = dilation * (kernel_size - 1)
    return int(dilated_span / 2)
|
36 |
+
|
37 |
+
|
38 |
+
def load_checkpoint(filepath, device):
    """Load a checkpoint file with ``torch.load`` and return its contents.

    Args:
        filepath: Path of the checkpoint file to read.
        device: Passed to ``torch.load`` as ``map_location``.

    Returns:
        Whatever object was stored in the checkpoint.

    Raises:
        FileNotFoundError: If ``filepath`` is not an existing regular file.
    """
    # Explicit check instead of ``assert``: asserts vanish under ``python -O``.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(
            "Checkpoint file not found: '{}'".format(filepath))
    print("Loading '{}'".format(filepath))
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict
|
44 |
+
|
45 |
+
|
46 |
+
def save_checkpoint(filepath, obj):
    """Serialise ``obj`` to ``filepath`` with ``torch.save``.

    Progress messages are printed before and after the write.
    """
    print(f"Saving checkpoint to {filepath}")
    torch.save(obj, filepath)
    print("Complete.")
|
50 |
+
|
51 |
+
|
52 |
+
def scan_checkpoint(cp_dir, prefix):
    """Find the lexicographically last checkpoint in ``cp_dir``.

    Candidates are paths matching ``prefix`` followed by exactly eight
    characters. Returns the greatest matching path in string order, or
    ``None`` when nothing matches.
    """
    candidates = glob.glob(os.path.join(cp_dir, prefix + '????????'))
    if not candidates:
        return None
    return max(candidates)
|
58 |
+
|
hparams (1).py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from text import symbols
|
3 |
+
|
4 |
+
|
5 |
+
class create_hparams():
    """Default hyperparameters, exposed as plain class attributes.

    Every value below is a class-level constant evaluated once when the
    class body runs; no overrides are parsed here.
    """

    # ------------------------------------------------------------------
    # CUDA
    # ------------------------------------------------------------------
    cuda_enabled = torch.cuda.is_available()

    # ------------------------------------------------------------------
    # Experiment
    # ------------------------------------------------------------------
    epochs = 100
    iters_per_checkpoint = 500
    seed = 1234
    dynamic_loss_scaling = True
    fp16_run = False
    distributed_run = False
    dist_backend = "nccl"
    dist_url = "tcp://localhost:54321"
    cudnn_enabled = True
    cudnn_benchmark = False
    ignore_layers = ['embedding.weight']

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    load_mel_from_disk = False
    training_files = 'filelists/transcript_train.txt'
    validation_files = 'filelists/transcript_val.txt'
    text_cleaners = ['japanese_cleaners']

    # ------------------------------------------------------------------
    # Audio
    # ------------------------------------------------------------------
    max_wav_value = 32768.0
    sampling_rate = 22050
    filter_length = 1024
    hop_length = 256
    win_length = 1024
    n_mel_channels = 80
    mel_fmin = 0.0
    mel_fmax = 8000.0

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    n_symbols = len(symbols)
    symbols_embedding_dim = 512

    # Encoder
    encoder_kernel_size = 5
    encoder_n_convolutions = 3
    encoder_embedding_dim = 512

    # Decoder
    n_frames_per_step = 1  # currently only 1 is supported
    decoder_rnn_dim = 1024
    prenet_dim = 256
    max_decoder_steps = 1000
    gate_threshold = 0.5
    p_attention_dropout = 0.1
    p_decoder_dropout = 0.1

    # Attention
    attention_rnn_dim = 1024
    attention_dim = 128

    # Location layer
    attention_location_n_filters = 32
    attention_location_kernel_size = 31

    # Mel post-processing network
    postnet_embedding_dim = 512
    postnet_kernel_size = 5
    postnet_n_convolutions = 5

    # ------------------------------------------------------------------
    # Optimization
    # ------------------------------------------------------------------
    use_saved_learning_rate = False
    learning_rate = 1e-3
    weight_decay = 1e-6
    grad_clip_thresh = 1.0
    batch_size = 64
    mask_padding = True  # set model's padded outputs to padded values
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|
hparams.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from text import symbols
|
3 |
+
|
4 |
+
|
5 |
+
class create_hparams():
    """Default hyperparameters, exposed as plain class attributes.

    Every value below is a class-level constant evaluated once when the
    class body runs; no overrides are parsed here.
    """

    # ------------------------------------------------------------------
    # CUDA
    # ------------------------------------------------------------------
    cuda_enabled = torch.cuda.is_available()

    # ------------------------------------------------------------------
    # Experiment
    # ------------------------------------------------------------------
    epochs = 100
    iters_per_checkpoint = 500
    seed = 1234
    dynamic_loss_scaling = True
    fp16_run = False
    distributed_run = False
    dist_backend = "nccl"
    dist_url = "tcp://localhost:54321"
    cudnn_enabled = True
    cudnn_benchmark = False
    ignore_layers = ['embedding.weight']

    # ------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------
    load_mel_from_disk = False
    training_files = 'filelists/transcript_train.txt'
    validation_files = 'filelists/transcript_val.txt'
    text_cleaners = ['japanese_cleaners']

    # ------------------------------------------------------------------
    # Audio
    # ------------------------------------------------------------------
    max_wav_value = 32768.0
    sampling_rate = 22050
    filter_length = 1024
    hop_length = 256
    win_length = 1024
    n_mel_channels = 80
    mel_fmin = 0.0
    mel_fmax = 8000.0

    # ------------------------------------------------------------------
    # Model
    # ------------------------------------------------------------------
    n_symbols = len(symbols)
    symbols_embedding_dim = 512

    # Encoder
    encoder_kernel_size = 5
    encoder_n_convolutions = 3
    encoder_embedding_dim = 512

    # Decoder
    n_frames_per_step = 1  # currently only 1 is supported
    decoder_rnn_dim = 1024
    prenet_dim = 256
    max_decoder_steps = 1000
    gate_threshold = 0.5
    p_attention_dropout = 0.1
    p_decoder_dropout = 0.1

    # Attention
    attention_rnn_dim = 1024
    attention_dim = 128

    # Location layer
    attention_location_n_filters = 32
    attention_location_kernel_size = 31

    # Mel post-processing network
    postnet_embedding_dim = 512
    postnet_kernel_size = 5
    postnet_n_convolutions = 5

    # ------------------------------------------------------------------
    # Optimization
    # ------------------------------------------------------------------
    use_saved_learning_rate = False
    learning_rate = 1e-3
    weight_decay = 1e-6
    grad_clip_thresh = 1.0
    batch_size = 64
    mask_padding = True  # set model's padded outputs to padded values
|
91 |
+
|
92 |
+
|
93 |
+
|
94 |
+
|
inference.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
layers.py
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
from librosa.filters import mel as librosa_mel_fn
|
3 |
+
from audio_processing import dynamic_range_compression
|
4 |
+
from audio_processing import dynamic_range_decompression
|
5 |
+
from stft import STFT
|
6 |
+
|
7 |
+
|
8 |
+
class LinearNorm(torch.nn.Module):
    """Fully connected layer whose weight is Xavier-uniform initialised.

    The gain of the initialisation is looked up from ``w_init_gain`` via
    ``torch.nn.init.calculate_gain``.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x``."""
        projected = self.linear_layer(x)
        return projected
|
19 |
+
|
20 |
+
|
21 |
+
class ConvNorm(torch.nn.Module):
    """1-D convolution whose weight is Xavier-uniform initialised.

    When ``padding`` is omitted it is derived from the kernel size and
    dilation as ``int(dilation * (kernel_size - 1) / 2)``; in that case the
    kernel size must be odd.
    """

    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super().__init__()
        if padding is None:
            # Symmetric padding is only well defined for odd kernels.
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )

        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.conv.weight, gain=gain)

    def forward(self, signal):
        """Run ``signal`` through the wrapped ``Conv1d``."""
        return self.conv(signal)
|
40 |
+
|
41 |
+
|
42 |
+
class TacotronSTFT(torch.nn.Module):
    """Compute log-compressed mel-spectrograms from waveforms.

    Wraps the project's ``STFT`` module and a mel filterbank built with
    librosa; the filterbank is stored as the non-trainable buffer
    ``mel_basis``.
    """

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments are required by librosa >= 0.10 (positional use
        # raises TypeError there) and are accepted by older releases too.
        mel_basis = librosa_mel_fn(
            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels,
            fmin=mel_fmin, fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        """Apply ``dynamic_range_compression`` to ``magnitudes``."""
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        """Apply ``dynamic_range_decompression`` to ``magnitudes``."""
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        # Input must already be normalised to [-1, 1].
        assert(torch.min(y.data) >= -1)
        assert(torch.max(y.data) <= 1)

        # Phases are returned by the STFT but not used for the mel output.
        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
|
logger.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import torch
|
3 |
+
from torch.utils.tensorboard import SummaryWriter
|
4 |
+
from plotting_utils import plot_alignment_to_numpy, plot_spectrogram_to_numpy
|
5 |
+
from plotting_utils import plot_gate_outputs_to_numpy
|
6 |
+
|
7 |
+
|
8 |
+
class Tacotron2Logger(SummaryWriter):
    """TensorBoard writer with helpers for training and validation logs."""

    def __init__(self, logdir):
        super().__init__(logdir)

    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
                     iteration):
        """Record the per-iteration training scalars."""
        scalars = (
            ("training.loss", reduced_loss),
            ("grad.norm", grad_norm),
            ("learning.rate", learning_rate),
            ("duration", duration),
        )
        for tag, value in scalars:
            self.add_scalar(tag, value, iteration)

    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
        """Record validation loss, parameter histograms and diagnostic plots.

        One random item of the batch is picked and plotted: its alignment,
        target mel, predicted mel and gate target vs. sigmoid(gate output).
        """
        self.add_scalar("validation.loss", reduced_loss, iteration)
        # The first element of y_pred is not used here.
        _, mel_outputs, gate_outputs, alignments = y_pred
        mel_targets, gate_targets = y

        def as_numpy(tensor):
            return tensor.data.cpu().numpy()

        # Histogram of every model parameter; dots become slashes in the tag.
        for name, param in model.named_parameters():
            self.add_histogram(name.replace('.', '/'), as_numpy(param),
                               iteration)

        # Pick one random sample of the batch to visualise.
        idx = random.randint(0, alignments.size(0) - 1)

        alignment_img = plot_alignment_to_numpy(as_numpy(alignments[idx]).T)
        self.add_image("alignment", alignment_img, iteration,
                       dataformats='HWC')

        target_img = plot_spectrogram_to_numpy(as_numpy(mel_targets[idx]))
        self.add_image("mel_target", target_img, iteration,
                       dataformats='HWC')

        predicted_img = plot_spectrogram_to_numpy(as_numpy(mel_outputs[idx]))
        self.add_image("mel_predicted", predicted_img, iteration,
                       dataformats='HWC')

        gate_img = plot_gate_outputs_to_numpy(
            as_numpy(gate_targets[idx]),
            as_numpy(torch.sigmoid(gate_outputs[idx])))
        self.add_image("gate", gate_img, iteration, dataformats='HWC')
|
loss_function.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from torch import nn
|
2 |
+
|
3 |
+
|
4 |
+
class Tacotron2Loss(nn.Module):
    """Sum of mel reconstruction losses and the stop-gate loss.

    The loss is ``MSE(mel_out, mel_target) + MSE(mel_out_postnet, mel_target)
    + BCEWithLogits(gate_out, gate_target)``, each with mean reduction.
    """

    def __init__(self):
        super(Tacotron2Loss, self).__init__()

    def forward(self, model_output, targets):
        """Compute the combined loss.

        Args:
            model_output: 4-tuple ``(mel_out, mel_out_postnet, gate_out, _)``;
                the last element is ignored.
            targets: Sequence whose first two items are the mel target and
                the gate target.

        Returns:
            Scalar tensor: mel loss plus gate loss.
        """
        # Detach instead of flipping ``requires_grad`` in place: this leaves
        # the caller's tensors untouched and also works for non-leaf targets.
        mel_target = targets[0].detach()
        # ``reshape`` (unlike ``view``) also accepts non-contiguous tensors.
        gate_target = targets[1].detach().reshape(-1, 1)

        mel_out, mel_out_postnet, gate_out, _ = model_output
        gate_out = gate_out.reshape(-1, 1)
        mel_loss = nn.functional.mse_loss(mel_out, mel_target) + \
            nn.functional.mse_loss(mel_out_postnet, mel_target)
        gate_loss = nn.functional.binary_cross_entropy_with_logits(
            gate_out, gate_target)
        return mel_loss + gate_loss
|
loss_scaler.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
|
3 |
+
class LossScaler:
    """Static loss scaler: multiplies the loss by a fixed factor.

    Overflow detection and scale updates are no-ops in this class, so the
    scale passed to the constructor never changes.
    """

    def __init__(self, scale=1):
        self.cur_scale = scale

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        """Always report that no overflow happened."""
        return False

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        """Always ``False``; static so it is callable on class and instance."""
        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        """Do nothing; the scale is fixed."""
        pass

    @property
    def loss_scale(self):
        """Current loss scale factor."""
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        """Return the entries of ``grad_in`` multiplied by the loss scale."""
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        """Scale ``loss`` and run ``backward()`` on the result."""
        scaled_loss = loss*self.loss_scale
        scaled_loss.backward()
|
30 |
+
|
31 |
+
class DynamicLossScaler:
    """Loss scaler that adapts its scale to gradient overflows.

    On overflow the scale is divided by ``scale_factor`` (never going below
    1). On a non-overflow step the scale is multiplied by ``scale_factor``
    whenever the number of iterations since the last overflow is a multiple
    of ``scale_window``.
    """

    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window

    # `params` is a list / generator of torch.Variable
    def has_overflow(self, params):
        """Return True if any parameter's gradient sums to inf or NaN."""
        return any(
            p.grad is not None
            and DynamicLossScaler._has_inf_or_nan(p.grad.data)
            for p in params
        )

    # `x` is a torch.Tensor
    @staticmethod
    def _has_inf_or_nan(x):
        """Return True if the float sum of ``x`` is +/-inf or NaN.

        Static so that it can be called on the class and on instances alike.
        """
        cpu_sum = float(x.float().sum())
        # ``cpu_sum != cpu_sum`` is only true for NaN.
        if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
            return True
        return False

    # `overflow` is boolean indicating whether we overflowed in gradient
    def update_scale(self, overflow):
        """Adjust the scale after one iteration and advance the counter."""
        if overflow:
            # Back off, but never below a scale of 1.
            self.cur_scale = max(self.cur_scale/self.scale_factor, 1)
            self.last_overflow_iter = self.cur_iter
        else:
            if (self.cur_iter - self.last_overflow_iter) % self.scale_window == 0:
                self.cur_scale *= self.scale_factor
        self.cur_iter += 1

    @property
    def loss_scale(self):
        """Current loss scale factor."""
        return self.cur_scale

    def scale_gradient(self, module, grad_in, grad_out):
        """Return the entries of ``grad_in`` multiplied by the loss scale."""
        return tuple(self.loss_scale * g for g in grad_in)

    def backward(self, loss):
        """Scale ``loss`` and run ``backward()`` on the result."""
        scaled_loss = loss*self.loss_scale
        scaled_loss.backward()
|
81 |
+
|
82 |
+
##############################################################
|
83 |
+
# Example usage below here -- assuming it's in a separate file
|
84 |
+
##############################################################
|
85 |
+
if __name__ == "__main__":
    import torch
    from torch.autograd import Variable
    # DynamicLossScaler is defined earlier in this module, so it is used
    # directly; the former import from `dynamic_loss_scaler` referenced a
    # module that does not exist and made the example fail on start-up.

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs, and wrap them in Variables.
    x = Variable(torch.randn(N, D_in), requires_grad=False)
    y = Variable(torch.randn(N, D_out), requires_grad=False)

    w1 = Variable(torch.randn(D_in, H), requires_grad=True)
    w2 = Variable(torch.randn(H, D_out), requires_grad=True)
    parameters = [w1, w2]

    learning_rate = 1e-6
    optimizer = torch.optim.SGD(parameters, lr=learning_rate)
    loss_scaler = DynamicLossScaler()

    for t in range(500):
        y_pred = x.mm(w1).clamp(min=0).mm(w2)
        loss = (y_pred - y).pow(2).sum() * loss_scaler.loss_scale
        # The loss is a 0-dim tensor: read it with .item(); indexing it with
        # [0] raises an IndexError on current PyTorch.
        print('Iter {} loss scale: {}'.format(t, loss_scaler.loss_scale))
        print('Iter {} scaled loss: {}'.format(t, loss.item()))
        print('Iter {} unscaled loss: {}'.format(t, loss.item() / loss_scaler.loss_scale))

        # Run backprop
        optimizer.zero_grad()
        loss.backward()

        # Check for overflow (has_overflow is an instance method, so it must
        # be called on the scaler object, not on the class).
        has_overflow = loss_scaler.has_overflow(parameters)

        # If no overflow, unscale grad and update as usual
        if not has_overflow:
            for param in parameters:
                param.grad.data.mul_(1. / loss_scaler.loss_scale)
            optimizer.step()
        # Otherwise, don't do anything -- ie, skip iteration
        else:
            print('OVERFLOW!')

        # Update loss scale for next iteration
        loss_scaler.update_scale(has_overflow)
|
131 |
+
|
meldataset.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import torch
|
5 |
+
import torch.utils.data
|
6 |
+
import numpy as np
|
7 |
+
from librosa.util import normalize
|
8 |
+
from scipy.io.wavfile import read
|
9 |
+
from librosa.filters import mel as librosa_mel_fn
|
10 |
+
|
11 |
+
MAX_WAV_VALUE = 32768.0
|
12 |
+
|
13 |
+
|
14 |
+
def load_wav(full_path):
    """Read a WAV file and return ``(samples, sampling_rate)``.

    ``read`` yields ``(rate, data)``; this wrapper returns the two values
    in the opposite order.
    """
    rate_and_samples = read(full_path)
    return rate_and_samples[1], rate_and_samples[0]
|
17 |
+
|
18 |
+
|
19 |
+
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Log-compress a NumPy input.

    Values below ``clip_val`` are raised to ``clip_val``, the result is
    multiplied by ``C`` and the natural logarithm is returned.
    """
    floored = np.clip(x, a_min=clip_val, a_max=None)
    scaled = floored * C
    return np.log(scaled)
|
21 |
+
|
22 |
+
|
23 |
+
def dynamic_range_decompression(x, C=1):
    """Invert the log compression for NumPy input: ``exp(x) / C``."""
    expanded = np.exp(x)
    return expanded / C
|
25 |
+
|
26 |
+
|
27 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress a torch tensor.

    Values below ``clip_val`` are clamped up to ``clip_val``, the result is
    multiplied by ``C`` and the natural logarithm is returned.
    """
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(clamped * C)
|
29 |
+
|
30 |
+
|
31 |
+
def dynamic_range_decompression_torch(x, C=1):
    """Invert the log compression for a torch tensor: ``exp(x) / C``."""
    expanded = torch.exp(x)
    return expanded / C
|
33 |
+
|
34 |
+
|
35 |
+
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
    return dynamic_range_compression_torch(magnitudes)
|
38 |
+
|
39 |
+
|
40 |
+
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
    return dynamic_range_decompression_torch(magnitudes)
|
43 |
+
|
44 |
+
|
45 |
+
# Module-level caches used by mel_spectrogram. Entries are keyed by every
# parameter that influences them (plus the device), so that a cached mel
# filterbank / Hann window is only reused when it is actually valid.
mel_basis = {}
hann_window = {}


def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of ``y``.

    Args:
        y: waveform tensor; a warning is printed if values fall outside
            [-1, 1]. Presumably shaped (batch, samples) -- the code inserts
            and removes a dimension at index 1 around the reflect padding.
        n_fft: FFT size.
        num_mels: number of mel bands.
        sampling_rate: sampling rate used to build the mel filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length.
        fmin: lowest mel filterbank frequency.
        fmax: highest mel filterbank frequency (may be None).
        center: forwarded to ``torch.stft``.

    Returns:
        The mel filterbank applied to the STFT magnitudes, passed through
        ``spectral_normalize_torch``.
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    # The original check (`fmax not in mel_basis`) never matched the keys
    # that were stored, so both tensors were rebuilt on every call.
    device_key = str(y.device)
    basis_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    window_key = (win_size, device_key)
    if basis_key not in mel_basis:
        # Keyword arguments: recent librosa releases reject positional ones.
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # return_complex must be given explicitly for real input on current
    # PyTorch; view_as_real restores the trailing (real, imag) axis that the
    # magnitude computation below relies on.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[window_key],
                      center=center, pad_mode='reflect', normalized=False, onesided=True,
                      return_complex=True)
    spec = torch.view_as_real(spec)

    # Magnitude with a small epsilon to keep the square root stable at 0.
    spec = torch.sqrt(spec.pow(2).sum(-1)+(1e-9))

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
73 |
+
|
74 |
+
|
75 |
+
def get_dataset_filelist(a):
    """Build the training and validation file lists.

    Reads ``a.input_training_file`` and ``a.input_validation_file`` (UTF-8,
    one entry per line, fields separated by ``|``). For every non-empty
    line, the first field is joined onto ``a.input_wavs_dir``.

    Returns:
        ``(training_files, validation_files)`` as two lists of paths.
    """
    def wav_paths(list_file):
        # Collect one joined path per non-empty line of `list_file`.
        with open(list_file, 'r', encoding='utf-8') as handle:
            entries = handle.read().split('\n')
        paths = []
        for entry in entries:
            if len(entry) > 0:
                paths.append(os.path.join(a.input_wavs_dir, entry.split('|')[0]))
        return paths

    training_files = wav_paths(a.input_training_file)
    validation_files = wav_paths(a.input_validation_file)
    return training_files, validation_files
|
84 |
+
|
85 |
+
|
86 |
+
class MelDataset(torch.utils.data.Dataset):
    """Dataset yielding (mel, audio, filename, mel_loss) tuples from WAV files.

    In normal mode the mel spectrogram is computed from the (optionally
    cropped / padded) waveform. In fine-tuning mode the mel is loaded from a
    ``.npy`` file under ``base_mels_path`` and the waveform is cropped to
    match the selected mel frames.
    """

    def __init__(self, training_files, segment_size, n_fft, num_mels,
                 hop_size, win_size, sampling_rate, fmin, fmax, split=True, shuffle=True, n_cache_reuse=1,
                 device=None, fmax_loss=None, fine_tuning=False, base_mels_path=None):
        """Store the configuration.

        Args:
            training_files: list of WAV paths; shuffled IN PLACE when
                ``shuffle`` is true.
            segment_size: number of audio samples per training segment.
            n_fft, num_mels, hop_size, win_size, sampling_rate, fmin, fmax:
                forwarded to ``mel_spectrogram`` for the returned mel.
            split: crop / pad every item to ``segment_size`` samples.
            shuffle: shuffle the file list (the global ``random`` module is
                seeded with 1234 first, whether or not shuffling happens).
            n_cache_reuse: number of following ``__getitem__`` calls that
                reuse the last loaded waveform instead of loading a file.
            device: stored; not used by the code in this class.
            fmax_loss: ``fmax`` used for the additional ``mel_loss`` output.
            fine_tuning: load precomputed mels instead of computing them.
            base_mels_path: directory of the precomputed mels.
        """
        self.audio_files = training_files
        random.seed(1234)
        if shuffle:
            random.shuffle(self.audio_files)
        self.segment_size = segment_size
        self.sampling_rate = sampling_rate
        self.split = split
        self.n_fft = n_fft
        self.num_mels = num_mels
        self.hop_size = hop_size
        self.win_size = win_size
        self.fmin = fmin
        self.fmax = fmax
        self.fmax_loss = fmax_loss
        self.cached_wav = None
        self.n_cache_reuse = n_cache_reuse
        self._cache_ref_count = 0
        self.device = device
        self.fine_tuning = fine_tuning
        self.base_mels_path = base_mels_path

    def __getitem__(self, index):
        """Return ``(mel, audio, filename, mel_loss)`` for item ``index``.

        Raises:
            ValueError: if the file's sampling rate differs from the
                configured ``sampling_rate``.
        """
        filename = self.audio_files[index]
        if self._cache_ref_count == 0:
            audio, sampling_rate = load_wav(filename)
            audio = audio / MAX_WAV_VALUE
            if not self.fine_tuning:
                audio = normalize(audio) * 0.95
            self.cached_wav = audio
            if sampling_rate != self.sampling_rate:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, self.sampling_rate))
            self._cache_ref_count = self.n_cache_reuse
        else:
            # NOTE: the cached waveform is reused regardless of `index`.
            audio = self.cached_wav
            self._cache_ref_count -= 1

        audio = torch.FloatTensor(audio)
        audio = audio.unsqueeze(0)

        if not self.fine_tuning:
            if self.split:
                if audio.size(1) >= self.segment_size:
                    # Random crop of exactly segment_size samples.
                    max_audio_start = audio.size(1) - self.segment_size
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[:, audio_start:audio_start+self.segment_size]
                else:
                    # Too short: zero-pad on the right.
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

            mel = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                  self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax,
                                  center=False)
        else:
            # NOTE(review): `filename` keeps its directory part here, so an
            # absolute wav path would override base_mels_path in
            # os.path.join -- confirm this is intended.
            mel = np.load(
                os.path.join(self.base_mels_path, os.path.splitext(filename)[0] + '.npy'))
            mel = torch.from_numpy(mel)

            if len(mel.shape) < 3:
                mel = mel.unsqueeze(0)

            if self.split:
                frames_per_seg = math.ceil(self.segment_size / self.hop_size)

                if audio.size(1) >= self.segment_size:
                    # Clamp at 0: when the mel has exactly frames_per_seg
                    # frames the upper bound would be -1 and
                    # random.randint(0, -1) raises ValueError.
                    max_mel_start = max(0, mel.size(2) - frames_per_seg - 1)
                    mel_start = random.randint(0, max_mel_start)
                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
                    audio = audio[:, mel_start * self.hop_size:(mel_start + frames_per_seg) * self.hop_size]
                else:
                    mel = torch.nn.functional.pad(mel, (0, frames_per_seg - mel.size(2)), 'constant')
                    audio = torch.nn.functional.pad(audio, (0, self.segment_size - audio.size(1)), 'constant')

        # Second mel computed with fmax_loss instead of fmax.
        mel_loss = mel_spectrogram(audio, self.n_fft, self.num_mels,
                                   self.sampling_rate, self.hop_size, self.win_size, self.fmin, self.fmax_loss,
                                   center=False)

        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())

    def __len__(self):
        """Number of audio files in the dataset."""
        return len(self.audio_files)
|
model.py
ADDED
@@ -0,0 +1,529 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from math import sqrt
|
2 |
+
import torch
|
3 |
+
from torch.autograd import Variable
|
4 |
+
from torch import nn
|
5 |
+
from torch.nn import functional as F
|
6 |
+
from layers import ConvNorm, LinearNorm
|
7 |
+
from utils import to_gpu, get_mask_from_lengths
|
8 |
+
|
9 |
+
|
10 |
+
class LocationLayer(nn.Module):
    """Convolution followed by a linear projection of attention weights.

    A bias-free ``ConvNorm`` with 2 input channels feeds a bias-free
    ``LinearNorm`` that maps ``attention_n_filters`` to ``attention_dim``.
    """

    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim):
        super().__init__()
        half_kernel = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(
            2, attention_n_filters,
            kernel_size=attention_kernel_size,
            padding=half_kernel,
            bias=False,
            stride=1,
            dilation=1)
        self.location_dense = LinearNorm(
            attention_n_filters, attention_dim,
            bias=False, w_init_gain='tanh')

    def forward(self, attention_weights_cat):
        """Convolve, swap dimensions 1 and 2, then apply the dense layer."""
        conv_features = self.location_conv(attention_weights_cat)
        return self.location_dense(conv_features.transpose(1, 2))
|
27 |
+
|
28 |
+
|
29 |
+
class Attention(nn.Module):
    """Additive attention combining query, memory and location terms."""

    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super().__init__()
        self.query_layer = LinearNorm(
            attention_rnn_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(
            embedding_dim, attention_dim, bias=False, w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        self.location_layer = LocationLayer(
            attention_location_n_filters,
            attention_location_kernel_size,
            attention_dim)
        # Value written into masked energies before the softmax.
        self.score_mask_value = -float("inf")

    def get_alignment_energies(self, query, processed_memory,
                               attention_weights_cat):
        """
        PARAMS
        ------
        query: decoder output (batch, n_mel_channels * n_frames_per_step)
        processed_memory: processed encoder outputs (B, T_in, attention_dim)
        attention_weights_cat: cumulative and prev. att weights (B, 2, max_time)

        RETURNS
        -------
        alignment (batch, max_time)
        """
        query_term = self.query_layer(query.unsqueeze(1))
        location_term = self.location_layer(attention_weights_cat)
        combined = query_term + location_term + processed_memory
        scores = self.v(torch.tanh(combined))
        return scores.squeeze(-1)

    def forward(self, attention_hidden_state, memory, processed_memory,
                attention_weights_cat, mask):
        """
        PARAMS
        ------
        attention_hidden_state: attention rnn last output
        memory: encoder outputs
        processed_memory: processed encoder outputs
        attention_weights_cat: previous and cumulative attention weights
        mask: binary mask for padded data (None disables masking)

        RETURNS
        -------
        attention_context: attention-weighted combination of memory
        attention_weights: softmax over dimension 1 of the energies
        """
        energies = self.get_alignment_energies(
            attention_hidden_state, processed_memory, attention_weights_cat)

        if mask is not None:
            # In-place fill on .data so masked positions get zero weight.
            energies.data.masked_fill_(mask, self.score_mask_value)

        weights = F.softmax(energies, dim=1)
        context = torch.bmm(weights.unsqueeze(1), memory).squeeze(1)
        return context, weights
|
87 |
+
|
88 |
+
|
89 |
+
class Prenet(nn.Module):
    """Stack of bias-free linear layers, each followed by ReLU and dropout."""

    def __init__(self, in_dim, sizes):
        super().__init__()
        input_dims = [in_dim] + sizes[:-1]
        stack = []
        for n_in, n_out in zip(input_dims, sizes):
            stack.append(LinearNorm(n_in, n_out, bias=False))
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Run ``x`` through every layer.

        Dropout (p=0.5) is applied with ``training=True``, i.e. it stays
        active regardless of the module's train / eval mode.
        """
        for layer in self.layers:
            activated = F.relu(layer(x))
            x = F.dropout(activated, p=0.5, training=True)
        return x
|
101 |
+
|
102 |
+
|
103 |
+
class Postnet(nn.Module):
    """Postnet
        - Five 1-d convolution with 512 channels and kernel size 5
          (the actual count, width and kernel size come from hparams)
    """

    def __init__(self, hparams):
        super().__init__()
        kernel = hparams.postnet_kernel_size
        mel_dim = hparams.n_mel_channels
        hidden_dim = hparams.postnet_embedding_dim

        def conv_block(n_in, n_out, gain):
            # One ConvNorm followed by batch normalisation.
            return nn.Sequential(
                ConvNorm(n_in, n_out,
                         kernel_size=kernel, stride=1,
                         padding=int((kernel - 1) / 2),
                         dilation=1, w_init_gain=gain),
                nn.BatchNorm1d(n_out))

        self.convolutions = nn.ModuleList()
        # First layer: mel channels -> embedding width.
        self.convolutions.append(conv_block(mel_dim, hidden_dim, 'tanh'))
        # Middle layers keep the embedding width.
        for _ in range(1, hparams.postnet_n_convolutions - 1):
            self.convolutions.append(conv_block(hidden_dim, hidden_dim, 'tanh'))
        # Last layer: embedding width -> mel channels, linear gain.
        self.convolutions.append(conv_block(hidden_dim, mel_dim, 'linear'))

    def forward(self, x):
        """Apply all blocks; tanh on every block except the last.

        Dropout (p=0.5) follows every block and honours ``self.training``.
        """
        last_index = len(self.convolutions) - 1
        for index, block in enumerate(self.convolutions):
            x = block(x)
            if index != last_index:
                x = torch.tanh(x)
            x = F.dropout(x, 0.5, self.training)

        return x
|
147 |
+
|
148 |
+
|
149 |
+
class Encoder(nn.Module):
    """Encoder module:
        - Three 1-d convolution banks (count taken from hparams)
        - Bidirectional LSTM
    """
    def __init__(self, hparams):
        super().__init__()

        dim = hparams.encoder_embedding_dim
        self.convolutions = nn.ModuleList([
            nn.Sequential(
                ConvNorm(dim, dim,
                         kernel_size=hparams.encoder_kernel_size, stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(dim))
            for _ in range(hparams.encoder_n_convolutions)])

        # Each direction gets half the embedding width.
        self.lstm = nn.LSTM(dim, int(dim / 2), 1,
                            batch_first=True, bidirectional=True)

    def _run_convolutions(self, x):
        # Conv -> ReLU -> dropout for every block, then swap dims 1 and 2.
        for block in self.convolutions:
            x = F.dropout(F.relu(block(x)), 0.5, self.training)
        return x.transpose(1, 2)

    def forward(self, x, input_lengths):
        """Encode a padded batch, packing it by ``input_lengths`` for the LSTM."""
        features = self._run_convolutions(x)

        # pytorch tensor are not reversible, hence the conversion
        lengths = input_lengths.cpu().numpy()
        packed = nn.utils.rnn.pack_padded_sequence(
            features, lengths, batch_first=True)

        self.lstm.flatten_parameters()
        packed_outputs, _ = self.lstm(packed)

        outputs, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outputs, batch_first=True)

        return outputs

    def inference(self, x):
        """Encode ``x`` without packing (no length information is used)."""
        features = self._run_convolutions(x)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(features)

        return outputs
|
202 |
+
|
203 |
+
|
204 |
+
class Decoder(nn.Module):
    """Recurrent decoder: prenet, attention LSTM cell, attention layer,
    decoder LSTM cell, and linear projections to mel frames and a stop gate.

    The per-utterance recurrent state (hidden / cell states, attention
    weights and context, memory, mask) is kept on the instance and is
    re-initialised by ``initialize_decoder_states``.
    """
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        # Two prenet layers of width prenet_dim applied to the previous frame(s).
        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        # Input: prenet output concatenated with the attention context.
        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        # Input: attention hidden state concatenated with the attention context.
        # The third positional argument of nn.LSTMCell is `bias`.
        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, 1)

        # Both projections read decoder hidden state + attention context.
        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim, 1,
            bias=True, w_init_gain='sigmoid')

    def get_go_frame(self, memory):
        """ Gets all zeros frames to use as first decoder input
        PARAMS
        ------
        memory: encoder outputs; only used for the batch size and as the
            template tensor for the new zero tensor

        RETURNS
        -------
        decoder_input: all zeros frames,
            (B, n_mel_channels * n_frames_per_step)
        """
        B = memory.size(0)
        decoder_input = Variable(memory.data.new(
            B, self.n_mel_channels * self.n_frames_per_step).zero_())
        return decoder_input

    def initialize_decoder_states(self, memory, mask):
        """ Initializes attention rnn states, decoder rnn states, attention
        weights, attention cumulative weights, attention context, stores memory
        and stores processed memory
        PARAMS
        ------
        memory: Encoder outputs
        mask: Mask for padded data if training, expects None for inference
        """
        B = memory.size(0)
        MAX_TIME = memory.size(1)

        # All recurrent states start as zeros created from `memory`.
        self.attention_hidden = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())
        self.attention_cell = Variable(memory.data.new(
            B, self.attention_rnn_dim).zero_())

        self.decoder_hidden = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())
        self.decoder_cell = Variable(memory.data.new(
            B, self.decoder_rnn_dim).zero_())

        self.attention_weights = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_weights_cum = Variable(memory.data.new(
            B, MAX_TIME).zero_())
        self.attention_context = Variable(memory.data.new(
            B, self.encoder_embedding_dim).zero_())

        self.memory = memory
        # Projected once here and reused at every decoder step.
        self.processed_memory = self.attention_layer.memory_layer(memory)
        self.mask = mask

    def parse_decoder_inputs(self, decoder_inputs):
        """ Prepares decoder inputs, i.e. mel outputs
        PARAMS
        ------
        decoder_inputs: inputs used for teacher-forced training, i.e. mel-specs

        RETURNS
        -------
        inputs: processed decoder inputs, time-major with
            n_frames_per_step frames grouped into each step

        """
        # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(1, 2)
        # Group n_frames_per_step consecutive frames into one decoder step.
        decoder_inputs = decoder_inputs.view(
            decoder_inputs.size(0),
            int(decoder_inputs.size(1)/self.n_frames_per_step), -1)
        # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels)
        decoder_inputs = decoder_inputs.transpose(0, 1)
        return decoder_inputs

    def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments):
        """ Prepares decoder outputs for output
        PARAMS
        ------
        mel_outputs: list with one mel output per decoder step
        gate_outputs: gate output energies, one entry per decoder step
        alignments: list with the attention weights of every decoder step

        RETURNS
        -------
        mel_outputs: stacked mel outputs, (B, n_mel_channels, T_out)
        gate_outputs: gate output energies, stacked batch-first
        alignments: attention weights, stacked batch-first
        """
        # (T_out, B) -> (B, T_out)
        alignments = torch.stack(alignments).transpose(0, 1)
        # (T_out, B) -> (B, T_out)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        gate_outputs = gate_outputs.contiguous()
        # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous()
        # decouple frames per step
        mel_outputs = mel_outputs.view(
            mel_outputs.size(0), -1, self.n_mel_channels)
        # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)

        return mel_outputs, gate_outputs, alignments

    def decode(self, decoder_input):
        """ Decoder step using stored states, attention and memory
        PARAMS
        ------
        decoder_input: previous mel output (already passed through the
            prenet by the callers in this class)

        RETURNS
        -------
        mel_output: output of the linear projection for this step
        gate_output: gate output energies
        attention_weights: attention weights computed in this step
        """
        # Attention RNN: previous frame features + previous attention context.
        cell_input = torch.cat((decoder_input, self.attention_context), -1)
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))
        self.attention_hidden = F.dropout(
            self.attention_hidden, self.p_attention_dropout, self.training)

        # Previous and cumulative attention weights, stacked on dim 1.
        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)), dim=1)
        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory,
            attention_weights_cat, self.mask)

        # Accumulate the new weights for the next step's location features.
        self.attention_weights_cum += self.attention_weights
        # Decoder RNN: attention hidden state + new attention context.
        decoder_input = torch.cat(
            (self.attention_hidden, self.attention_context), -1)
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            decoder_input, (self.decoder_hidden, self.decoder_cell))
        self.decoder_hidden = F.dropout(
            self.decoder_hidden, self.p_decoder_dropout, self.training)

        decoder_hidden_attention_context = torch.cat(
            (self.decoder_hidden, self.attention_context), dim=1)
        decoder_output = self.linear_projection(
            decoder_hidden_attention_context)

        gate_prediction = self.gate_layer(decoder_hidden_attention_context)
        return decoder_output, gate_prediction, self.attention_weights

    def forward(self, memory, decoder_inputs, memory_lengths):
        """ Decoder forward pass for training
        PARAMS
        ------
        memory: Encoder outputs
        decoder_inputs: Decoder inputs for teacher forcing. i.e. mel-specs
        memory_lengths: Encoder output lengths for attention masking.

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """

        # Prepend the all-zero go frame to the teacher-forcing inputs.
        decoder_input = self.get_go_frame(memory).unsqueeze(0)
        decoder_inputs = self.parse_decoder_inputs(decoder_inputs)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)

        # The mask is the negation of get_mask_from_lengths(memory_lengths).
        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths))

        mel_outputs, gate_outputs, alignments = [], [], []
        # One step per input except the last (its output has no target).
        while len(mel_outputs) < decoder_inputs.size(0) - 1:
            decoder_input = decoder_inputs[len(mel_outputs)]
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments

    def inference(self, memory):
        """ Decoder inference
        PARAMS
        ------
        memory: Encoder outputs

        RETURNS
        -------
        mel_outputs: mel outputs from the decoder
        gate_outputs: gate outputs from the decoder
        alignments: sequence of attention weights from the decoder
        """
        decoder_input = self.get_go_frame(memory)

        self.initialize_decoder_states(memory, mask=None)

        mel_outputs, gate_outputs, alignments = [], [], []
        while True:
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, alignment = self.decode(decoder_input)

            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output]
            alignments += [alignment]

            # Stop when the gate fires or the step limit is reached.
            # NOTE(review): the tensor comparison is used as a Python bool,
            # which looks like it assumes a single-element gate output
            # (batch size 1) -- confirm against callers.
            if torch.sigmoid(gate_output.data) > self.gate_threshold:
                break
            elif len(mel_outputs) == self.max_decoder_steps:
                print("Warning! Reached max decoder steps")
                break

            # Feed the predicted frame back in as the next input.
            decoder_input = mel_output

        mel_outputs, gate_outputs, alignments = self.parse_decoder_outputs(
            mel_outputs, gate_outputs, alignments)

        return mel_outputs, gate_outputs, alignments
|
455 |
+
|
456 |
+
|
457 |
+
class Tacotron2(nn.Module):
    """Full text-to-mel network: embedding -> Encoder -> Decoder -> Postnet.

    The constructor reads ``mask_padding``, ``fp16_run``, ``n_mel_channels``,
    ``n_frames_per_step``, ``n_symbols`` and ``symbols_embedding_dim`` from
    ``hparams`` and hands ``hparams`` on to the three sub-modules.
    """

    def __init__(self, hparams):
        super(Tacotron2, self).__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step

        self.embedding = nn.Embedding(
            hparams.n_symbols, hparams.symbols_embedding_dim)
        # Uniform initialisation on [-bound, bound] with bound = sqrt(3) * std.
        embedding_std = sqrt(
            2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        bound = sqrt(3.0) * embedding_std
        self.embedding.weight.data.uniform_(-bound, bound)

        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)

    def parse_batch(self, batch):
        """Move a collated batch through ``to_gpu`` and split it into (inputs, targets).

        Returns:
            ``((text, input_lengths, mels, max_len, output_lengths),
            (mels, gates))`` where ``max_len`` is the largest input length as
            a Python number.
        """
        text, in_lens, mels, gates, out_lens = batch

        text = to_gpu(text).long()
        in_lens = to_gpu(in_lens).long()
        longest_input = torch.max(in_lens.data).item()
        mels = to_gpu(mels).float()
        gates = to_gpu(gates).float()
        out_lens = to_gpu(out_lens).long()

        model_inputs = (text, in_lens, mels, longest_input, out_lens)
        targets = (mels, gates)
        return (model_inputs, targets)

    def parse_output(self, outputs, output_lengths=None):
        """Overwrite padded positions of the model outputs in place.

        Nothing is changed unless ``self.mask_padding`` is truthy and
        ``output_lengths`` is given. Otherwise ``outputs[0]`` and
        ``outputs[1]`` are zeroed and ``outputs[2]`` is set to ``1e3``
        wherever the inverted length mask is set. The same ``outputs``
        object is returned.
        """
        if not self.mask_padding or output_lengths is None:
            return outputs

        padded = ~get_mask_from_lengths(output_lengths)
        # Broadcast the mask over the mel-channel axis, then put batch first.
        padded = padded.expand(
            self.n_mel_channels, padded.size(0), padded.size(1))
        padded = padded.permute(1, 0, 2)

        for mel_index in (0, 1):
            outputs[mel_index].data.masked_fill_(padded, 0.0)
        # gate energies
        outputs[2].data.masked_fill_(padded[:, 0, :], 1e3)

        return outputs

    def forward(self, inputs):
        """Teacher-forced pass used for training.

        ``inputs`` is the 5-tuple produced as the first element of
        ``parse_batch``. Returns the (possibly masked) list
        ``[mel_outputs, mel_outputs_postnet, gate_outputs, alignments]``.
        """
        text_inputs, text_lengths, mels, max_len, output_lengths = inputs
        text_lengths = text_lengths.data
        output_lengths = output_lengths.data

        embedded = self.embedding(text_inputs).transpose(1, 2)
        memory = self.encoder(embedded, text_lengths)

        mel_outputs, gate_outputs, alignments = self.decoder(
            memory, mels, memory_lengths=text_lengths)

        # The postnet output is added to the decoder mel as a residual.
        refined = mel_outputs + self.postnet(mel_outputs)

        return self.parse_output(
            [mel_outputs, refined, gate_outputs, alignments],
            output_lengths)

    def inference(self, inputs):
        """Free-running pass: no target mels and no output lengths.

        Returns ``[mel_outputs, mel_outputs_postnet, gate_outputs,
        alignments]``. ``parse_output`` is called without lengths, so no
        masking is applied.
        """
        embedded = self.embedding(inputs).transpose(1, 2)
        memory = self.encoder.inference(embedded)
        mel_outputs, gate_outputs, alignments = self.decoder.inference(memory)

        refined = mel_outputs + self.postnet(mel_outputs)

        return self.parse_output(
            [mel_outputs, refined, gate_outputs, alignments])
|
models.py
ADDED
@@ -0,0 +1,283 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
import torch.nn as nn
|
4 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
5 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
6 |
+
from hifiutils import init_weights, get_padding
|
7 |
+
|
8 |
+
LRELU_SLOPE = 0.1
|
9 |
+
|
10 |
+
|
11 |
+
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv -> dilation-1 conv) pairs.

    Every convolution keeps the channel count, uses stride 1, is wrapped in
    weight normalisation and is initialised through ``init_weights``.
    ``h`` is stored on the instance but not read by this class.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h

        def build_conv(rate):
            # Channel-preserving, weight-normalised conv with the given dilation.
            return weight_norm(Conv1d(
                channels, channels, kernel_size, 1, dilation=rate,
                padding=get_padding(kernel_size, rate)))

        # Only the first three dilation entries are used.
        self.convs1 = nn.ModuleList([
            build_conv(dilation[0]),
            build_conv(dilation[1]),
            build_conv(dilation[2]),
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([build_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        """Apply the three residual stages in sequence and return the result."""
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = dilated_conv(F.leaky_relu(x, LRELU_SLOPE))
            branch = plain_conv(F.leaky_relu(branch, LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution of the block."""
        # Inside the method the bare name resolves to the module-level
        # ``remove_weight_norm`` function, not to this method.
        for layer in list(self.convs1) + list(self.convs2):
            remove_weight_norm(layer)
|
49 |
+
|
50 |
+
|
51 |
+
class ResBlock2(torch.nn.Module):
    """Lighter residual block: two dilated convolutions, each with a skip.

    Every convolution keeps the channel count, uses stride 1, is wrapped in
    weight normalisation and is initialised through ``init_weights``.
    ``h`` is stored on the instance but not read by this class.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h

        def build_conv(rate):
            # Channel-preserving, weight-normalised conv with the given dilation.
            return weight_norm(Conv1d(
                channels, channels, kernel_size, 1, dilation=rate,
                padding=get_padding(kernel_size, rate)))

        # Only the first two dilation entries are used.
        self.convs = nn.ModuleList([
            build_conv(dilation[0]),
            build_conv(dilation[1]),
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        """Run both residual stages in order and return the result."""
        for conv in self.convs:
            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        # The bare name resolves to the module-level function, not this method.
        for layer in self.convs:
            remove_weight_norm(layer)
|
73 |
+
|
74 |
+
|
75 |
+
class Generator(torch.nn.Module):
    """Upsampling generator: conv_pre -> (upsample + resblocks)* -> conv_post -> tanh.

    Reads from ``h``: ``resblock`` (``'1'`` selects ``ResBlock1``, anything
    else ``ResBlock2``), ``resblock_kernel_sizes``,
    ``resblock_dilation_sizes``, ``upsample_rates``,
    ``upsample_kernel_sizes`` and ``upsample_initial_channel``. The number of
    input channels is taken from ``h.num_mels`` when that attribute exists
    and falls back to the previously hard-coded 80 otherwise.
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        # Fall back to 80 so configs without ``num_mels`` behave as before.
        in_channels = getattr(h, 'num_mels', 80)
        self.conv_pre = weight_norm(
            Conv1d(in_channels, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        # Each upsampling stage halves the channel count.
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(h.upsample_initial_channel // (2 ** i),
                                h.upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))

        # One group of residual blocks per upsampling stage, stored flat;
        # forward() indexes them as ``i * num_kernels + j``.
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, k, d))

        # ``ch`` is the channel count left after the final upsampling stage.
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        """Map the input through all stages and return the tanh-squashed output."""
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            # Average the outputs of this stage's residual blocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        # NOTE: this call uses leaky_relu's default slope rather than
        # LRELU_SLOPE; it is left unchanged so existing weights behave the same.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every layer of the generator."""
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
126 |
+
|
127 |
+
|
128 |
+
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal by ``period`` and applies 2-D convs.

    Args:
        period: width of the folded 2-D view built in ``forward``.
        kernel_size: kernel height of the five main convolutions.
        stride: stride along the folded time axis for the first four convs.
        use_spectral_norm: when it compares equal to ``False`` the layers
            use weight normalisation, otherwise spectral normalisation.

    The padding of every main convolution is derived from ``kernel_size``.
    It used to be computed from a literal 5, which only matched the default.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        pad = get_padding(kernel_size, 1)
        channels = (1, 32, 128, 512, 1024)
        convs = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1),
                          padding=(pad, 0)))
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ]
        # The last main convolution keeps 1024 channels and uses stride 1.
        convs.append(
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(pad, 0))))
        self.convs = nn.ModuleList(convs)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for a 3-D input ``x``.

        ``feature_maps`` holds the activation after every main convolution
        followed by the raw ``conv_post`` output.
        """
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            # Reflect-pad on the right so the length is a multiple of the period.
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
162 |
+
|
163 |
+
|
164 |
+
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of ``DiscriminatorP`` instances with periods 2, 3, 5, 7 and 11."""

    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)])

    def forward(self, y, y_hat):
        """Run every sub-discriminator on ``y`` and then on ``y_hat``.

        Returns four lists, one entry per sub-discriminator: scores for
        ``y``, scores for ``y_hat``, feature maps for ``y`` and feature maps
        for ``y_hat``.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            real_scores.append(score_r)
            real_fmaps.append(maps_r)
            fake_scores.append(score_g)
            fake_fmaps.append(maps_g)

        return real_scores, fake_scores, real_fmaps, fake_fmaps
|
189 |
+
|
190 |
+
|
191 |
+
class DiscriminatorS(torch.nn.Module):
    """Stack of seven normalised 1-D convolutions plus a 1-channel output conv.

    ``use_spectral_norm`` selects the wrapper: weight normalisation when it
    compares equal to ``False``, spectral normalisation otherwise.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        if use_spectral_norm == False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = (
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList([
            norm_f(Conv1d(c_in, c_out, kernel, step, groups=groups, padding=pad))
            for c_in, c_out, kernel, step, groups, pad in layer_specs
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)``.

        ``feature_maps`` holds the leaky-ReLU activation after each of the
        seven convolutions followed by the raw ``conv_post`` output.
        """
        features = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            features.append(x)
        x = self.conv_post(x)
        features.append(x)
        score = torch.flatten(x, 1, -1)

        return score, features
|
217 |
+
|
218 |
+
|
219 |
+
class MultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` instances fed progressively average-pooled input.

    The first sub-discriminator is built with spectral normalisation and
    sees the input unchanged. Each later one sees the previous input after
    one more ``AvgPool1d(4, 2, padding=2)``.
    """

    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList(
            [AvgPool1d(4, 2, padding=2) for _ in range(2)])

    def forward(self, y, y_hat):
        """Score ``y`` and ``y_hat`` at every scale.

        Returns four lists, one entry per scale: scores for ``y``, scores
        for ``y_hat``, feature maps for ``y`` and feature maps for ``y_hat``.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for index, disc in enumerate(self.discriminators):
            if index > 0:
                # Pooling is cumulative: each scale pools the previous one.
                pool = self.meanpools[index - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            real_scores.append(score_r)
            real_fmaps.append(maps_r)
            fake_scores.append(score_g)
            fake_fmaps.append(maps_g)

        return real_scores, fake_scores, real_fmaps, fake_fmaps
|
249 |
+
|
250 |
+
|
251 |
+
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference of paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel sequences of sequences of
    tensors. They are paired with ``zip``, so extra entries on either side
    are ignored. With no pairs at all the result is the integer ``0``.
    """
    distances = [
        (real - fake).abs().mean()
        for real_maps, fake_maps in zip(fmap_r, fmap_g)
        for real, fake in zip(real_maps, fake_maps)
    ]
    return sum(distances) * 2
|
258 |
+
|
259 |
+
|
260 |
+
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss over paired score tensors.

    For each pair the real term is ``mean((1 - real) ** 2)`` and the
    generated term is ``mean(generated ** 2)``.

    Returns:
        ``(total, real_terms, generated_terms)``. ``total`` is the sum of
        all terms (the integer ``0`` when there are no pairs); the two lists
        hold the per-pair terms as Python floats.
    """
    total = 0
    real_terms = []
    generated_terms = []
    for real_score, generated_score in zip(disc_real_outputs,
                                           disc_generated_outputs):
        real_term = ((1 - real_score) ** 2).mean()
        generated_term = (generated_score ** 2).mean()
        total = total + (real_term + generated_term)
        real_terms.append(real_term.item())
        generated_terms.append(generated_term.item())

    return total, real_terms, generated_terms
|
272 |
+
|
273 |
+
|
274 |
+
def generator_loss(disc_outputs):
    """Least-squares generator loss: ``mean((1 - score) ** 2)`` per score tensor.

    Returns:
        ``(total, per_output)``. ``per_output`` lists the individual terms
        as tensors and ``total`` is their sum (the integer ``0`` when
        ``disc_outputs`` is empty).
    """
    per_output = [((1 - score) ** 2).mean() for score in disc_outputs]
    return sum(per_output), per_output
|
283 |
+
|
multiproc.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Launcher: start one copy of the given training command per visible GPU,
# then wait for all of them. Rank 0 inherits this process's stdout; every
# other rank writes its stdout to logs/<job_id>_GPU_<rank>.log.
import os
import subprocess
import sys
import time

import torch

argslist = list(sys.argv)[1:]
num_gpus = torch.cuda.device_count()
argslist.append('--n_gpus={}'.format(num_gpus))
workers = []
log_files = []
job_id = time.strftime("%Y_%m_%d-%H%M%S")
argslist.append("--group_name=group_{}".format(job_id))

try:
    for i in range(num_gpus):
        argslist.append('--rank={}'.format(i))
        if i == 0:
            stdout = None
        else:
            # The log directory may not exist on a fresh checkout.
            os.makedirs("logs", exist_ok=True)
            stdout = open("logs/{}_GPU_{}.log".format(job_id, i), "w")
            log_files.append(stdout)
        print(argslist)
        p = subprocess.Popen([str(sys.executable)]+argslist, stdout=stdout)
        workers.append(p)
        # Drop this worker's --rank argument before building the next command.
        argslist = argslist[:-1]

    for p in workers:
        p.wait()
finally:
    # Close our copies of the log handles once the workers are done, or if
    # launching failed part-way through.
    for log_file in log_files:
        log_file.close()
|
plotting_utils.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import matplotlib
|
2 |
+
matplotlib.use("Agg")
|
3 |
+
import matplotlib.pylab as plt
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
def save_figure_to_numpy(fig):
    """Return the rendered pixels of ``fig`` as a writable uint8 RGB array.

    The figure's canvas must already have been drawn. The result has shape
    ``(height, width, 3)``.

    ``np.frombuffer`` replaces the deprecated binary mode of
    ``np.fromstring``. If the canvas has no ``tostring_rgb`` method, the
    RGBA buffer from ``buffer_rgba`` is used and its alpha channel dropped.
    """
    canvas = fig.canvas
    if hasattr(canvas, "tostring_rgb"):
        width, height = canvas.get_width_height()
        data = np.frombuffer(canvas.tostring_rgb(), dtype=np.uint8)
        # frombuffer yields a read-only view; copy so callers get a
        # writable array as they did before.
        return data.reshape((height, width, 3)).copy()
    rgba = np.asarray(canvas.buffer_rgba(), dtype=np.uint8)
    return rgba[..., :3].copy()
|
12 |
+
|
13 |
+
|
14 |
+
def plot_alignment_to_numpy(alignment, info=None):
    """Render ``alignment`` as a heat map and return it as an RGB array.

    Args:
        alignment: 2-D array-like passed to ``imshow``.
        info: optional text appended below the x-axis label.

    The figure is addressed through ``fig``/``ax`` rather than pyplot's
    "current figure" state, and it is closed explicitly even if rendering
    fails, so figures created elsewhere are never closed by mistake.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        im = ax.imshow(alignment, aspect='auto', origin='lower',
                       interpolation='none')
        fig.colorbar(im, ax=ax)
        xlabel = 'Decoder timestep'
        if info is not None:
            xlabel += '\n\n' + info
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Encoder timestep')
        fig.tight_layout()

        fig.canvas.draw()
        data = save_figure_to_numpy(fig)
    finally:
        plt.close(fig)
    return data
|
30 |
+
|
31 |
+
|
32 |
+
def plot_spectrogram_to_numpy(spectrogram):
    """Render ``spectrogram`` as a heat map and return it as an RGB array.

    ``spectrogram`` is a 2-D array-like passed to ``imshow``. The x axis is
    labelled "Frames" and the y axis "Channels".

    The figure is addressed through ``fig``/``ax`` rather than pyplot's
    "current figure" state, and it is closed explicitly even if rendering
    fails.
    """
    fig, ax = plt.subplots(figsize=(12, 3))
    try:
        im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                       interpolation='none')
        fig.colorbar(im, ax=ax)
        ax.set_xlabel("Frames")
        ax.set_ylabel("Channels")
        fig.tight_layout()

        fig.canvas.draw()
        data = save_figure_to_numpy(fig)
    finally:
        plt.close(fig)
    return data
|
45 |
+
|
46 |
+
|
47 |
+
def plot_gate_outputs_to_numpy(gate_targets, gate_outputs):
    """Scatter-plot target and predicted gate values and return an RGB array.

    Targets are drawn as green ``+`` markers and predictions as red ``.``
    markers, each plotted against its index in the sequence.

    The figure is addressed through ``fig``/``ax`` rather than pyplot's
    "current figure" state, and it is closed explicitly even if rendering
    fails.
    """
    fig, ax = plt.subplots(figsize=(12, 3))
    try:
        ax.scatter(range(len(gate_targets)), gate_targets, alpha=0.5,
                   color='green', marker='+', s=1, label='target')
        ax.scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
                   color='red', marker='.', s=1, label='predicted')

        ax.set_xlabel("Frames (Green target, Red predicted)")
        ax.set_ylabel("Gate State")
        fig.tight_layout()

        fig.canvas.draw()
        data = save_figure_to_numpy(fig)
    finally:
        plt.close(fig)
    return data
|
requirements.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pillow
|
2 |
+
matplotlib
|
3 |
+
numpy==1.22.4
|
4 |
+
inflect
|
5 |
+
librosa
|
6 |
+
denoiser
|
7 |
+
pysoundfile
|
8 |
+
scipy
|
9 |
+
Unidecode
|
10 |
+
pillow
|
11 |
+
openjtalk>=0.3.0.dev2
|
12 |
+
janome
|
13 |
+
torch
|
14 |
+
tensorboardX
|
stft.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
BSD 3-Clause License
|
3 |
+
|
4 |
+
Copyright (c) 2017, Prem Seetharaman
|
5 |
+
All rights reserved.
|
6 |
+
|
7 |
+
* Redistribution and use in source and binary forms, with or without
|
8 |
+
modification, are permitted provided that the following conditions are met:
|
9 |
+
|
10 |
+
* Redistributions of source code must retain the above copyright notice,
|
11 |
+
this list of conditions and the following disclaimer.
|
12 |
+
|
13 |
+
* Redistributions in binary form must reproduce the above copyright notice, this
|
14 |
+
list of conditions and the following disclaimer in the
|
15 |
+
documentation and/or other materials provided with the distribution.
|
16 |
+
|
17 |
+
* Neither the name of the copyright holder nor the names of its
|
18 |
+
contributors may be used to endorse or promote products derived from this
|
19 |
+
software without specific prior written permission.
|
20 |
+
|
21 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
22 |
+
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
23 |
+
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
24 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
|
25 |
+
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
26 |
+
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
27 |
+
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
28 |
+
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
29 |
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
30 |
+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
31 |
+
"""
|
32 |
+
|
33 |
+
import torch
|
34 |
+
import numpy as np
|
35 |
+
import torch.nn.functional as F
|
36 |
+
from torch.autograd import Variable
|
37 |
+
from scipy.signal import get_window
|
38 |
+
from librosa.util import pad_center, tiny
|
39 |
+
from audio_processing import window_sumsquare
|
40 |
+
|
41 |
+
|
42 |
+
class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    Short-time Fourier transform and its inverse, implemented as strided
    1-D (transposed) convolutions against a fixed Fourier basis that is
    stored in the ``forward_basis`` and ``inverse_basis`` buffers.

    Args:
        filter_length: FFT size; the basis has ``filter_length // 2 + 1``
            real rows followed by as many imaginary rows.
        hop_length: stride between successive frames.
        win_length: window length; must not exceed ``filter_length``.
        window: window name passed to ``scipy.signal.get_window``, or
            ``None`` to leave the bases unwindowed.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep the non-redundant half of the spectrum and stack real over
        # imaginary parts so a single real-valued convolution computes both.
        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # ``size`` is passed by keyword: newer librosa releases make it
            # keyword-only, and older ones accept the keyword as well.
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        """Return ``(magnitude, phase)`` of a ``(batch, samples)`` signal.

        The signal is reflect-padded by ``filter_length // 2`` on both
        sides before framing. The sample count is remembered in
        ``self.num_samples``. ``phase`` is computed from detached data and
        therefore carries no gradient.
        """
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        half = int(self.filter_length / 2)
        input_data = input_data.view(num_batches, 1, num_samples)
        # A temporary extra axis makes the tensor 4-D for the padding call;
        # only the last axis is actually padded.
        input_data = F.pad(
            input_data.unsqueeze(1),
            (half, half, 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        # Registered buffers never require grad, so no wrapper is needed.
        forward_transform = F.conv1d(
            input_data,
            self.forward_basis,
            stride=self.hop_length,
            padding=0)

        cutoff = half + 1
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.atan2(imag_part.data, real_part.data)

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct a ``(batch, 1, samples)`` signal from magnitude and phase."""
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            # Follow the input's device (covers CPU and any CUDA device)
            # instead of unconditionally using the default CUDA device.
            device = magnitude.device
            window_sum = torch.from_numpy(window_sum).to(device)
            approx_nonzero_indices = approx_nonzero_indices.to(device)
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Drop the reflect padding that transform() added on both sides.
        half = int(self.filter_length / 2)
        inverse_transform = inverse_transform[:, :, half:]
        inverse_transform = inverse_transform[:, :, :-half]

        return inverse_transform

    def forward(self, input_data):
        """Transform then invert ``input_data``; returns the reconstruction.

        The intermediate results are kept in ``self.magnitude`` and
        ``self.phase``.
        """
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
|
tensorboard.png
ADDED
![]() |
text/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright (c) 2017 Keith Ito
|
2 |
+
|
3 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4 |
+
of this software and associated documentation files (the "Software"), to deal
|
5 |
+
in the Software without restriction, including without limitation the rights
|
6 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7 |
+
copies of the Software, and to permit persons to whom the Software is
|
8 |
+
furnished to do so, subject to the following conditions:
|
9 |
+
|
10 |
+
The above copyright notice and this permission notice shall be included in
|
11 |
+
all copies or substantial portions of the Software.
|
12 |
+
|
13 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19 |
+
THE SOFTWARE.
|
text/__init__.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/keithito/tacotron """
|
2 |
+
import re
|
3 |
+
from text import cleaners
|
4 |
+
from text.symbols import symbols
|
5 |
+
|
6 |
+
|
7 |
+
# Mappings from symbol to numeric ID and vice versa:
|
8 |
+
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
9 |
+
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
10 |
+
|
11 |
+
# Regular expression matching text enclosed in curly braces:
|
12 |
+
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
|
13 |
+
|
14 |
+
|
15 |
+
def text_to_sequence(text, cleaner_names):
    '''Turn a string into the list of symbol IDs that represents it.

    Segments wrapped in curly braces are read as ARPAbet, for example
    "Turn left on {HH AW1 S S T AH0 N} Street." Text outside the braces is
    run through the named cleaners before being mapped to IDs.

    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through

    Returns:
        List of integers corresponding to the symbols in the text
    '''
    ids = []
    remaining = text

    # Peel off one "plain text {ARPAbet}" prefix per iteration.
    while len(remaining):
        match = _curly_re.match(remaining)
        if match is None:
            # No braces left: everything that remains is plain text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        plain, arpabet, remaining = match.group(1), match.group(2), match.group(3)
        ids.extend(_symbols_to_sequence(_clean_text(plain, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))

    return ids
|
41 |
+
|
42 |
+
|
43 |
+
def sequence_to_text(sequence):
    '''Map a sequence of symbol IDs back to text; unknown IDs are skipped.'''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # Symbols stored with an '@' prefix are written back in curly braces.
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    # Adjacent braced symbols are merged into one space-separated group.
    return ''.join(pieces).replace('}{', ' ')
|
54 |
+
|
55 |
+
|
56 |
+
def _clean_text(text, cleaner_names):
    '''Run ``text`` through each named cleaner from ``cleaners``, in order.

    Args:
        text: string to clean
        cleaner_names: names of functions defined in the ``cleaners`` module

    Returns:
        The text after every cleaner has been applied.

    Raises:
        Exception: if a name does not refer to an attribute of ``cleaners``.
    '''
    for name in cleaner_names:
        # A default is required here: without it getattr raises
        # AttributeError and the explicit check below is never reached.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
|
63 |
+
|
64 |
+
|
65 |
+
def _symbols_to_sequence(symbols):
    '''Return the IDs of the given symbols, dropping those that fail ``_should_keep_symbol``.'''
    ids = []
    for symbol in symbols:
        if _should_keep_symbol(symbol):
            ids.append(_symbol_to_id[symbol])
    return ids
|
67 |
+
|
68 |
+
|
69 |
+
def _arpabet_to_sequence(text):
    '''Convert whitespace-separated ARPAbet tokens to IDs via their '@'-prefixed symbols.'''
    prefixed = []
    for token in text.split():
        prefixed.append('@' + token)
    return _symbols_to_sequence(prefixed)
|
71 |
+
|
72 |
+
|
73 |
+
def _should_keep_symbol(s):
    '''Return True if ``s`` is a known symbol other than '_' or '~'.

    The comparison uses equality. The earlier ``is not`` test compared
    object identity against string literals, which depends on interning and
    triggers a SyntaxWarning on Python 3.8+.
    '''
    return s in _symbol_to_id and s != '_' and s != '~'
|
text/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (2.69 kB). View file
|
|
text/__pycache__/cleaners.cpython-310.pyc
ADDED
Binary file (5.15 kB). View file
|
|
text/__pycache__/cmudict.cpython-310.pyc
ADDED
Binary file (2.35 kB). View file
|
|
text/__pycache__/numbers.cpython-310.pyc
ADDED
Binary file (2.19 kB). View file
|
|
text/__pycache__/symbols.cpython-310.pyc
ADDED
Binary file (578 Bytes). View file
|
|