anthonyrusso commited on
Commit
b57c851
1 Parent(s): f1e9197

upload dependencies

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 2.0 +0 -0
  2. 2.0' +0 -0
  3. CHANGELOG.md +28 -0
  4. CODE_OF_CONDUCT.md +80 -0
  5. CONTRIBUTING.md +35 -0
  6. LICENSE +21 -0
  7. LICENSE_weights +399 -0
  8. MANIFEST.in +9 -0
  9. Makefile +40 -0
  10. README.md +86 -13
  11. config/conditioner/chroma2music.yaml +46 -0
  12. config/conditioner/clapemb2music.yaml +44 -0
  13. config/conditioner/none.yaml +19 -0
  14. config/conditioner/text2music.yaml +30 -0
  15. config/conditioner/text2sound.yaml +24 -0
  16. config/config.yaml +75 -0
  17. config/dset/audio/audiocaps_16khz.yaml +11 -0
  18. config/dset/audio/default.yaml +10 -0
  19. config/dset/audio/example.yaml +10 -0
  20. config/dset/audio/musiccaps_32khz.yaml +12 -0
  21. config/dset/default.yaml +10 -0
  22. config/dset/internal/music_10k_32khz.yaml +11 -0
  23. config/dset/internal/music_400k_32khz.yaml +10 -0
  24. config/dset/internal/sounds_16khz.yaml +12 -0
  25. config/model/encodec/default.yaml +54 -0
  26. config/model/encodec/encodec_base_causal.yaml +11 -0
  27. config/model/encodec/encodec_large_nq4_s320.yaml +13 -0
  28. config/model/encodec/encodec_large_nq4_s640.yaml +13 -0
  29. config/model/lm/audiogen_lm.yaml +36 -0
  30. config/model/lm/default.yaml +47 -0
  31. config/model/lm/model_scale/base.yaml +3 -0
  32. config/model/lm/model_scale/large.yaml +7 -0
  33. config/model/lm/model_scale/medium.yaml +7 -0
  34. config/model/lm/model_scale/small.yaml +8 -0
  35. config/model/lm/model_scale/xsmall.yaml +8 -0
  36. config/model/lm/musicgen_lm.yaml +36 -0
  37. config/model/none.yaml +4 -0
  38. config/model/score/basic.yaml +17 -0
  39. config/solver/audiogen/audiogen_base_16khz.yaml +70 -0
  40. config/solver/audiogen/debug.yaml +52 -0
  41. config/solver/audiogen/default.yaml +40 -0
  42. config/solver/audiogen/evaluation/none.yaml +5 -0
  43. config/solver/audiogen/evaluation/objective_eval.yaml +29 -0
  44. config/solver/compression/debug.yaml +55 -0
  45. config/solver/compression/default.yaml +160 -0
  46. config/solver/compression/encodec_audiogen_16khz.yaml +10 -0
  47. config/solver/compression/encodec_base_24khz.yaml +10 -0
  48. config/solver/compression/encodec_musicgen_32khz.yaml +10 -0
  49. config/solver/default.yaml +108 -0
  50. config/solver/diffusion/debug.yaml +106 -0
2.0 ADDED
File without changes
2.0' ADDED
File without changes
CHANGELOG.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
+
7
+ ## [1.0.0] - 2023-08-02
8
+
9
+ Major revision, added training code for EnCodec, AudioGen, MusicGen, and MultiBandDiffusion.
10
+ Added pretrained model for AudioGen and MultiBandDiffusion.
11
+
12
+ ## [0.0.2] - 2023-08-01
13
+
14
+ Improved demo, fixed top p (thanks @jnordberg).
15
+
16
+ Compressor tanh on output to avoid clipping with some style (especially piano).
17
+ Now repeating the conditioning periodically if it is too short.
18
+
19
+ More options when launching Gradio app locally (thanks @ashleykleynhans).
20
+
21
+ Testing out PyTorch 2.0 memory efficient attention.
22
+
23
+ Added extended generation (infinite length) by slowly moving the windows.
24
+ Note that other implementations exist: https://github.com/camenduru/MusicGen-colab.
25
+
26
+ ## [0.0.1] - 2023-06-09
27
+
28
+ Initial release, with model evaluation only.
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to AudioCraft
2
+
3
+ We want to make contributing to this project as easy and transparent as
4
+ possible.
5
+
6
+ ## Pull Requests
7
+
8
+ AudioCraft is the implementation of a research paper.
9
+ Therefore, we do not plan on accepting many pull requests for new features.
10
+ We certainly welcome them for bug fixes.
11
+
12
+ 1. Fork the repo and create your branch from `main`.
13
+ 2. If you've added code that should be tested, add tests.
14
+ 3. If you've changed APIs, update the documentation.
15
+ 4. Ensure the test suite passes.
16
+ 5. Make sure your code lints.
17
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
18
+
19
+ ## Contributor License Agreement ("CLA")
20
+ In order to accept your pull request, we need you to submit a CLA. You only need
21
+ to do this once to work on any of Meta's open source projects.
22
+
23
+ Complete your CLA here: <https://code.facebook.com/cla>
24
+
25
+ ## Issues
26
+ We use GitHub issues to track public bugs. Please ensure your description is
27
+ clear and has sufficient instructions to be able to reproduce the issue.
28
+
29
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
30
+ disclosure of security bugs. In those cases, please go through the process
31
+ outlined on that page and do not file a public issue.
32
+
33
+ ## License
34
+ By contributing to encodec, you agree that your contributions will be licensed
35
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
LICENSE_weights ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Attribution-NonCommercial 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More_considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial 4.0 International Public
58
+ License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial 4.0 International Public License ("Public
63
+ License"). To the extent this Public License may be interpreted as a
64
+ contract, You are granted the Licensed Rights in consideration of Your
65
+ acceptance of these terms and conditions, and the Licensor grants You
66
+ such rights in consideration of benefits the Licensor receives from
67
+ making the Licensed Material available under these terms and
68
+ conditions.
69
+
70
+ Section 1 -- Definitions.
71
+
72
+ a. Adapted Material means material subject to Copyright and Similar
73
+ Rights that is derived from or based upon the Licensed Material
74
+ and in which the Licensed Material is translated, altered,
75
+ arranged, transformed, or otherwise modified in a manner requiring
76
+ permission under the Copyright and Similar Rights held by the
77
+ Licensor. For purposes of this Public License, where the Licensed
78
+ Material is a musical work, performance, or sound recording,
79
+ Adapted Material is always produced where the Licensed Material is
80
+ synched in timed relation with a moving image.
81
+
82
+ b. Adapter's License means the license You apply to Your Copyright
83
+ and Similar Rights in Your contributions to Adapted Material in
84
+ accordance with the terms and conditions of this Public License.
85
+
86
+ c. Copyright and Similar Rights means copyright and/or similar rights
87
+ closely related to copyright including, without limitation,
88
+ performance, broadcast, sound recording, and Sui Generis Database
89
+ Rights, without regard to how the rights are labeled or
90
+ categorized. For purposes of this Public License, the rights
91
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
92
+ Rights.
93
+ d. Effective Technological Measures means those measures that, in the
94
+ absence of proper authority, may not be circumvented under laws
95
+ fulfilling obligations under Article 11 of the WIPO Copyright
96
+ Treaty adopted on December 20, 1996, and/or similar international
97
+ agreements.
98
+
99
+ e. Exceptions and Limitations means fair use, fair dealing, and/or
100
+ any other exception or limitation to Copyright and Similar Rights
101
+ that applies to Your use of the Licensed Material.
102
+
103
+ f. Licensed Material means the artistic or literary work, database,
104
+ or other material to which the Licensor applied this Public
105
+ License.
106
+
107
+ g. Licensed Rights means the rights granted to You subject to the
108
+ terms and conditions of this Public License, which are limited to
109
+ all Copyright and Similar Rights that apply to Your use of the
110
+ Licensed Material and that the Licensor has authority to license.
111
+
112
+ h. Licensor means the individual(s) or entity(ies) granting rights
113
+ under this Public License.
114
+
115
+ i. NonCommercial means not primarily intended for or directed towards
116
+ commercial advantage or monetary compensation. For purposes of
117
+ this Public License, the exchange of the Licensed Material for
118
+ other material subject to Copyright and Similar Rights by digital
119
+ file-sharing or similar means is NonCommercial provided there is
120
+ no payment of monetary compensation in connection with the
121
+ exchange.
122
+
123
+ j. Share means to provide material to the public by any means or
124
+ process that requires permission under the Licensed Rights, such
125
+ as reproduction, public display, public performance, distribution,
126
+ dissemination, communication, or importation, and to make material
127
+ available to the public including in ways that members of the
128
+ public may access the material from a place and at a time
129
+ individually chosen by them.
130
+
131
+ k. Sui Generis Database Rights means rights other than copyright
132
+ resulting from Directive 96/9/EC of the European Parliament and of
133
+ the Council of 11 March 1996 on the legal protection of databases,
134
+ as amended and/or succeeded, as well as other essentially
135
+ equivalent rights anywhere in the world.
136
+
137
+ l. You means the individual or entity exercising the Licensed Rights
138
+ under this Public License. Your has a corresponding meaning.
139
+
140
+ Section 2 -- Scope.
141
+
142
+ a. License grant.
143
+
144
+ 1. Subject to the terms and conditions of this Public License,
145
+ the Licensor hereby grants You a worldwide, royalty-free,
146
+ non-sublicensable, non-exclusive, irrevocable license to
147
+ exercise the Licensed Rights in the Licensed Material to:
148
+
149
+ a. reproduce and Share the Licensed Material, in whole or
150
+ in part, for NonCommercial purposes only; and
151
+
152
+ b. produce, reproduce, and Share Adapted Material for
153
+ NonCommercial purposes only.
154
+
155
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
156
+ Exceptions and Limitations apply to Your use, this Public
157
+ License does not apply, and You do not need to comply with
158
+ its terms and conditions.
159
+
160
+ 3. Term. The term of this Public License is specified in Section
161
+ 6(a).
162
+
163
+ 4. Media and formats; technical modifications allowed. The
164
+ Licensor authorizes You to exercise the Licensed Rights in
165
+ all media and formats whether now known or hereafter created,
166
+ and to make technical modifications necessary to do so. The
167
+ Licensor waives and/or agrees not to assert any right or
168
+ authority to forbid You from making technical modifications
169
+ necessary to exercise the Licensed Rights, including
170
+ technical modifications necessary to circumvent Effective
171
+ Technological Measures. For purposes of this Public License,
172
+ simply making modifications authorized by this Section 2(a)
173
+ (4) never produces Adapted Material.
174
+
175
+ 5. Downstream recipients.
176
+
177
+ a. Offer from the Licensor -- Licensed Material. Every
178
+ recipient of the Licensed Material automatically
179
+ receives an offer from the Licensor to exercise the
180
+ Licensed Rights under the terms and conditions of this
181
+ Public License.
182
+
183
+ b. No downstream restrictions. You may not offer or impose
184
+ any additional or different terms or conditions on, or
185
+ apply any Effective Technological Measures to, the
186
+ Licensed Material if doing so restricts exercise of the
187
+ Licensed Rights by any recipient of the Licensed
188
+ Material.
189
+
190
+ 6. No endorsement. Nothing in this Public License constitutes or
191
+ may be construed as permission to assert or imply that You
192
+ are, or that Your use of the Licensed Material is, connected
193
+ with, or sponsored, endorsed, or granted official status by,
194
+ the Licensor or others designated to receive attribution as
195
+ provided in Section 3(a)(1)(A)(i).
196
+
197
+ b. Other rights.
198
+
199
+ 1. Moral rights, such as the right of integrity, are not
200
+ licensed under this Public License, nor are publicity,
201
+ privacy, and/or other similar personality rights; however, to
202
+ the extent possible, the Licensor waives and/or agrees not to
203
+ assert any such rights held by the Licensor to the limited
204
+ extent necessary to allow You to exercise the Licensed
205
+ Rights, but not otherwise.
206
+
207
+ 2. Patent and trademark rights are not licensed under this
208
+ Public License.
209
+
210
+ 3. To the extent possible, the Licensor waives any right to
211
+ collect royalties from You for the exercise of the Licensed
212
+ Rights, whether directly or through a collecting society
213
+ under any voluntary or waivable statutory or compulsory
214
+ licensing scheme. In all other cases the Licensor expressly
215
+ reserves any right to collect such royalties, including when
216
+ the Licensed Material is used other than for NonCommercial
217
+ purposes.
218
+
219
+ Section 3 -- License Conditions.
220
+
221
+ Your exercise of the Licensed Rights is expressly made subject to the
222
+ following conditions.
223
+
224
+ a. Attribution.
225
+
226
+ 1. If You Share the Licensed Material (including in modified
227
+ form), You must:
228
+
229
+ a. retain the following if it is supplied by the Licensor
230
+ with the Licensed Material:
231
+
232
+ i. identification of the creator(s) of the Licensed
233
+ Material and any others designated to receive
234
+ attribution, in any reasonable manner requested by
235
+ the Licensor (including by pseudonym if
236
+ designated);
237
+
238
+ ii. a copyright notice;
239
+
240
+ iii. a notice that refers to this Public License;
241
+
242
+ iv. a notice that refers to the disclaimer of
243
+ warranties;
244
+
245
+ v. a URI or hyperlink to the Licensed Material to the
246
+ extent reasonably practicable;
247
+
248
+ b. indicate if You modified the Licensed Material and
249
+ retain an indication of any previous modifications; and
250
+
251
+ c. indicate the Licensed Material is licensed under this
252
+ Public License, and include the text of, or the URI or
253
+ hyperlink to, this Public License.
254
+
255
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
256
+ reasonable manner based on the medium, means, and context in
257
+ which You Share the Licensed Material. For example, it may be
258
+ reasonable to satisfy the conditions by providing a URI or
259
+ hyperlink to a resource that includes the required
260
+ information.
261
+
262
+ 3. If requested by the Licensor, You must remove any of the
263
+ information required by Section 3(a)(1)(A) to the extent
264
+ reasonably practicable.
265
+
266
+ 4. If You Share Adapted Material You produce, the Adapter's
267
+ License You apply must not prevent recipients of the Adapted
268
+ Material from complying with this Public License.
269
+
270
+ Section 4 -- Sui Generis Database Rights.
271
+
272
+ Where the Licensed Rights include Sui Generis Database Rights that
273
+ apply to Your use of the Licensed Material:
274
+
275
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
276
+ to extract, reuse, reproduce, and Share all or a substantial
277
+ portion of the contents of the database for NonCommercial purposes
278
+ only;
279
+
280
+ b. if You include all or a substantial portion of the database
281
+ contents in a database in which You have Sui Generis Database
282
+ Rights, then the database in which You have Sui Generis Database
283
+ Rights (but not its individual contents) is Adapted Material; and
284
+
285
+ c. You must comply with the conditions in Section 3(a) if You Share
286
+ all or a substantial portion of the contents of the database.
287
+
288
+ For the avoidance of doubt, this Section 4 supplements and does not
289
+ replace Your obligations under this Public License where the Licensed
290
+ Rights include other Copyright and Similar Rights.
291
+
292
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
293
+
294
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
295
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
296
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
297
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
298
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
299
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
300
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
301
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
302
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
303
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
304
+
305
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
306
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
307
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
308
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
309
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
310
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
311
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
312
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
313
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
314
+
315
+ c. The disclaimer of warranties and limitation of liability provided
316
+ above shall be interpreted in a manner that, to the extent
317
+ possible, most closely approximates an absolute disclaimer and
318
+ waiver of all liability.
319
+
320
+ Section 6 -- Term and Termination.
321
+
322
+ a. This Public License applies for the term of the Copyright and
323
+ Similar Rights licensed here. However, if You fail to comply with
324
+ this Public License, then Your rights under this Public License
325
+ terminate automatically.
326
+
327
+ b. Where Your right to use the Licensed Material has terminated under
328
+ Section 6(a), it reinstates:
329
+
330
+ 1. automatically as of the date the violation is cured, provided
331
+ it is cured within 30 days of Your discovery of the
332
+ violation; or
333
+
334
+ 2. upon express reinstatement by the Licensor.
335
+
336
+ For the avoidance of doubt, this Section 6(b) does not affect any
337
+ right the Licensor may have to seek remedies for Your violations
338
+ of this Public License.
339
+
340
+ c. For the avoidance of doubt, the Licensor may also offer the
341
+ Licensed Material under separate terms or conditions or stop
342
+ distributing the Licensed Material at any time; however, doing so
343
+ will not terminate this Public License.
344
+
345
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
346
+ License.
347
+
348
+ Section 7 -- Other Terms and Conditions.
349
+
350
+ a. The Licensor shall not be bound by any additional or different
351
+ terms or conditions communicated by You unless expressly agreed.
352
+
353
+ b. Any arrangements, understandings, or agreements regarding the
354
+ Licensed Material not stated herein are separate from and
355
+ independent of the terms and conditions of this Public License.
356
+
357
+ Section 8 -- Interpretation.
358
+
359
+ a. For the avoidance of doubt, this Public License does not, and
360
+ shall not be interpreted to, reduce, limit, restrict, or impose
361
+ conditions on any use of the Licensed Material that could lawfully
362
+ be made without permission under this Public License.
363
+
364
+ b. To the extent possible, if any provision of this Public License is
365
+ deemed unenforceable, it shall be automatically reformed to the
366
+ minimum extent necessary to make it enforceable. If the provision
367
+ cannot be reformed, it shall be severed from this Public License
368
+ without affecting the enforceability of the remaining terms and
369
+ conditions.
370
+
371
+ c. No term or condition of this Public License will be waived and no
372
+ failure to comply consented to unless expressly agreed to by the
373
+ Licensor.
374
+
375
+ d. Nothing in this Public License constitutes or may be interpreted
376
+ as a limitation upon, or waiver of, any privileges and immunities
377
+ that apply to the Licensor or You, including from the legal
378
+ processes of any jurisdiction or authority.
379
+
380
+ =======================================================================
381
+
382
+ Creative Commons is not a party to its public
383
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
384
+ its public licenses to material it publishes and in those instances
385
+ will be considered the “Licensor.” The text of the Creative Commons
386
+ public licenses is dedicated to the public domain under the CC0 Public
387
+ Domain Dedication. Except for the limited purpose of indicating that
388
+ material is shared under a Creative Commons public license or as
389
+ otherwise permitted by the Creative Commons policies published at
390
+ creativecommons.org/policies, Creative Commons does not authorize the
391
+ use of the trademark "Creative Commons" or any other trademark or logo
392
+ of Creative Commons without its prior written consent including,
393
+ without limitation, in connection with any unauthorized modifications
394
+ to any of its public licenses or any other arrangements,
395
+ understandings, or agreements concerning use of licensed material. For
396
+ the avoidance of doubt, this paragraph does not form part of the
397
+ public licenses.
398
+
399
+ Creative Commons may be contacted at creativecommons.org.
MANIFEST.in ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ include Makefile
2
+ include LICENSE
3
+ include LICENSE_weights
4
+ include *.md
5
+ include *.ini
6
+ include requirements.txt
7
+ include audiocraft/py.typed
8
+ include assets/*.mp3
9
+ recursive-include conf *.yaml
Makefile ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ INTEG=AUDIOCRAFT_DORA_DIR="/tmp/magma_$(USER)" python3 -m dora -v run --clear device=cpu dataset.num_workers=0 optim.epochs=1 \
2
+ dataset.train.num_samples=10 dataset.valid.num_samples=10 \
3
+ dataset.evaluate.num_samples=10 dataset.generate.num_samples=2 sample_rate=16000 \
4
+ logging.level=DEBUG
5
+ INTEG_COMPRESSION = $(INTEG) solver=compression/debug rvq.n_q=2 rvq.bins=48 checkpoint.save_last=true # SIG is 5091833e
6
+ INTEG_MUSICGEN = $(INTEG) solver=musicgen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
7
+ transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
8
+ INTEG_AUDIOGEN = $(INTEG) solver=audiogen/debug dset=audio/example compression_model_checkpoint=//sig/5091833e \
9
+ transformer_lm.n_q=2 transformer_lm.card=48 transformer_lm.dim=16 checkpoint.save_last=false # Using compression model from 5091833e
10
+ INTEG_MBD = $(INTEG) solver=diffusion/debug dset=audio/example \
11
+ checkpoint.save_last=false # Using compression model from 616d7b3c
12
+
13
+ default: linter tests
14
+
15
+ install:
16
+ pip install -U pip
17
+ pip install -U -e '.[dev]'
18
+
19
+ linter:
20
+ flake8 audiocraft && mypy audiocraft
21
+ flake8 tests && mypy tests
22
+
23
+ tests:
24
+ coverage run -m pytest tests
25
+ coverage report
26
+
27
+ tests_integ:
28
+ $(INTEG_COMPRESSION)
29
+ $(INTEG_MBD)
30
+ $(INTEG_MUSICGEN)
31
+ $(INTEG_AUDIOGEN)
32
+
33
+
34
+ api_docs:
35
+ pdoc3 --html -o api_docs -f audiocraft
36
+
37
+ dist:
38
+ python setup.py sdist
39
+
40
+ .PHONY: linter tests api_docs dist
README.md CHANGED
@@ -1,13 +1,86 @@
1
- ---
2
- title: Videoshop Backend
3
- emoji: 📚
4
- colorFrom: green
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 3.41.2
8
- app_file: app.py
9
- pinned: false
10
- license: openrail
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AudioCraft
2
+
3
+ ![docs badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_docs/badge.svg)
4
+ ![linter badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_linter/badge.svg)
5
+ ![tests badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_tests/badge.svg)
6
+
7
+ AudioCraft is a PyTorch library for deep learning research on audio generation. AudioCraft contains inference and training code
8
+ for two state-of-the-art AI generative models producing high-quality audio: AudioGen and MusicGen.
9
+
10
+ ## Installation
11
+
12
+ AudioCraft requires Python 3.9, PyTorch 2.0.0. To install AudioCraft, you can run the following:
13
+
14
+ ```shell
15
+ # Best to make sure you have torch installed first, in particular before installing xformers.
16
+ # Don't run this if you already have PyTorch installed.
17
+ pip install 'torch>=2.0'
18
+ # Then proceed to one of the following
19
+ pip install -U audiocraft # stable release
20
+ pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft # bleeding edge
21
+ pip install -e . # or if you cloned the repo locally (mandatory if you want to train).
22
+
23
+ ```
24
+
25
+ We also recommend having `ffmpeg` installed, either through your system or Anaconda:
26
+
27
+ ```bash
28
+ sudo apt-get install ffmpeg
29
+ # Or if you are using Anaconda or Miniconda
30
+ conda install 'ffmpeg<5' -c conda-forge
31
+
32
+ ```
33
+
34
+ ## Models
35
+
36
+ At the moment, AudioCraft contains the training code and inference code for:
37
+
38
+ * [MusicGen](./docs/MUSICGEN.md): A state-of-the-art controllable text-to-music model.
39
+ * [AudioGen](./docs/AUDIOGEN.md): A state-of-the-art text-to-sound model.
40
+ * [EnCodec](./docs/ENCODEC.md): A state-of-the-art high fidelity neural audio codec.
41
+ * [Multi Band Diffusion](./docs/MBD.md): An EnCodec compatible decoder using diffusion.
42
+
43
+ ## Training code
44
+
45
+ AudioCraft contains PyTorch components for deep learning research in audio and training pipelines for the developed models.
46
+ For a general introduction of AudioCraft design principles and instructions to develop your own training pipeline, refer to
47
+ the [AudioCraft training documentation](./docs/TRAINING.md).
48
+
49
+ For reproducing existing work and using the developed training pipelines, refer to the instructions for each specific model
50
+ that provides pointers to configuration, example grids and model/task-specific information and FAQ.
51
+
52
+ ## API documentation
53
+
54
+ We provide some [API documentation](https://facebookresearch.github.io/audiocraft/api_docs/audiocraft/index.html) for AudioCraft.
55
+
56
+ ## FAQ
57
+
58
+ #### Is the training code available?
59
+
60
+ Yes! We provide the training code for [EnCodec](./docs/ENCODEC.md), [MusicGen](./docs/MUSICGEN.md) and [Multi Band Diffusion](./docs/MBD.md).
61
+
62
+ #### Where are the models stored?
63
+
64
+ Hugging Face stores the models in a specific location, which can be overridden by setting the `AUDIOCRAFT_CACHE_DIR` environment variable.
65
+
66
+ ## License
67
+
68
+ * The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
69
+ * The models weights in this repository are released under the CC-BY-NC 4.0 license as found in the [LICENSE_weights file](LICENSE_weights).
70
+
71
+ ## Citation
72
+
73
+ For the general framework of AudioCraft, please cite the following.
74
+
75
+ ```bibtex
76
+ @article{copet2023simple,
77
+ title={Simple and Controllable Music Generation},
78
+ author={Jade Copet and Felix Kreuk and Itai Gat and Tal Remez and David Kant and Gabriel Synnaeve and Yossi Adi and Alexandre Défossez},
79
+ year={2023},
80
+ journal={arXiv preprint arXiv:2306.05284},
81
+ }
82
+
83
+ ```
84
+
85
+ When referring to a specific model, please cite as mentioned in the model-specific README, e.g.
86
+ [./docs/MUSICGEN.md](./docs/MUSICGEN.md), [./docs/AUDIOGEN.md](./docs/AUDIOGEN.md), etc.
config/conditioner/chroma2music.yaml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ classifier_free_guidance:
4
+ training_dropout: 0.2
5
+ inference_coef: 3.0
6
+
7
+ attribute_dropout:
8
+ args:
9
+ active_on_eval: false
10
+ text: {}
11
+ wav:
12
+ self_wav: 0.5
13
+
14
+ fuser:
15
+ cross_attention_pos_emb: false
16
+ cross_attention_pos_emb_scale: 1
17
+ sum: []
18
+ prepend: [self_wav, description]
19
+ cross: []
20
+ input_interpolate: []
21
+
22
+ conditioners:
23
+ self_wav:
24
+ model: chroma_stem
25
+ chroma_stem:
26
+ sample_rate: ${sample_rate}
27
+ n_chroma: 12
28
+ radix2_exp: 14
29
+ argmax: true
30
+ match_len_on_eval: false
31
+ eval_wavs: null
32
+ n_eval_wavs: 100
33
+ cache_path: null
34
+ description:
35
+ model: t5
36
+ t5:
37
+ name: t5-base
38
+ finetune: false
39
+ word_dropout: 0.2
40
+ normalize_text: false
41
+
42
+ dataset:
43
+ train:
44
+ merge_text_p: 0.25
45
+ drop_desc_p: 0.5
46
+ drop_other_p: 0.5
config/conditioner/clapemb2music.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ classifier_free_guidance:
4
+ training_dropout: 0.3
5
+ inference_coef: 3.0
6
+
7
+ attribute_dropout:
8
+ text: {}
9
+ wav: {}
10
+
11
+ fuser:
12
+ cross_attention_pos_emb: false
13
+ cross_attention_pos_emb_scale: 1
14
+ sum: []
15
+ prepend: []
16
+ cross: [description]
17
+ input_interpolate: []
18
+
19
+ conditioners:
20
+ description:
21
+ model: clap
22
+ clap:
23
+ checkpoint: //reference/clap/music_audioset_epoch_15_esc_90.14.pt
24
+ model_arch: 'HTSAT-base'
25
+ enable_fusion: false
26
+ sample_rate: 44100
27
+ max_audio_length: 10
28
+ audio_stride: 1
29
+ dim: 512
30
+ attribute: description
31
+ normalize: true
32
+ quantize: true # use RVQ quantization
33
+ n_q: 12
34
+ bins: 1024
35
+ kmeans_iters: 50
36
+ text_p: 0. # probability of using text embed at train time
37
+ cache_path: null
38
+
39
+ dataset:
40
+ joint_embed_attributes: [description]
41
+ train:
42
+ merge_text_p: 0.25
43
+ drop_desc_p: 0.5
44
+ drop_other_p: 0.5
config/conditioner/none.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # No conditioning
4
+
5
+ classifier_free_guidance:
6
+ training_dropout: 0
7
+ inference_coef: 1
8
+
9
+ attribute_dropout:
10
+ text: {}
11
+ wav: {}
12
+
13
+ fuser:
14
+ sum: []
15
+ prepend: []
16
+ cross: []
17
+ input_interpolate: []
18
+
19
+ conditioners: null
config/conditioner/text2music.yaml ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ classifier_free_guidance:
4
+ training_dropout: 0.3
5
+ inference_coef: 3.0
6
+
7
+ attribute_dropout: {}
8
+
9
+ fuser:
10
+ cross_attention_pos_emb: false
11
+ cross_attention_pos_emb_scale: 1
12
+ sum: []
13
+ prepend: []
14
+ cross: [description]
15
+ input_interpolate: []
16
+
17
+ conditioners:
18
+ description:
19
+ model: t5
20
+ t5:
21
+ name: t5-base
22
+ finetune: false
23
+ word_dropout: 0.3
24
+ normalize_text: false
25
+
26
+ dataset:
27
+ train:
28
+ merge_text_p: 0.25
29
+ drop_desc_p: 0.5
30
+ drop_other_p: 0.5
config/conditioner/text2sound.yaml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ classifier_free_guidance:
4
+ training_dropout: 0.1
5
+ inference_coef: 3.0
6
+
7
+ attribute_dropout: {}
8
+
9
+ fuser:
10
+ cross_attention_pos_emb: false
11
+ cross_attention_pos_emb_scale: 1
12
+ sum: []
13
+ prepend: []
14
+ cross: [description]
15
+ input_interpolate: []
16
+
17
+ conditioners:
18
+ description:
19
+ model: t5
20
+ t5:
21
+ name: t5-large
22
+ finetune: false
23
+ word_dropout: 0.
24
+ normalize_text: false
config/config.yaml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # WARNING: This is the base configuration file shared across ALL solvers in AudioCraft
2
+ # Please don't update this file directly. Instead use distinct configuration files
3
+ # to override the below configuration.
4
+ defaults:
5
+ - _self_
6
+ - dset: default
7
+ - solver: default
8
+
9
+ device: cuda
10
+ dtype: float32
11
+ autocast: false
12
+ autocast_dtype: bfloat16
13
+ seed: 2036
14
+ show: false # just show the model and its size and exit
15
+ continue_from: # continue from a given sig or path
16
+ execute_only: # can be set to generate/evaluate/valid to run that stage
17
+ execute_inplace: false # don't enforce continue_from to be set
18
+ # to enable inplace execution of the stage. This assumes
19
+ # that you know what you are doing and execute stage
20
+ # preserving the original xp sig.
21
+ benchmark_no_load: false # if set to true, will repeat the same batch instead of loading them
22
+
23
+ efficient_attention_backend: torch # can be torch or xformers.
24
+ num_threads: 1 # called with torch.set_num_thread.
25
+ mp_start_method: forkserver # multiprocessing start method (spawn, fork or forkserver).
26
+
27
+
28
+ label: # use this if you want twice the same exp, with a name.
29
+
30
+ # logging parameters
31
+ logging:
32
+ level: INFO
33
+ log_updates: 10
34
+ log_tensorboard: false
35
+ log_wandb: false
36
+ tensorboard:
37
+ with_media_logging: false
38
+ name: # optional name for the experiment
39
+ sub_dir: # optional sub directory to store tensorboard data
40
+ wandb:
41
+ with_media_logging: true
42
+ project: # project name
43
+ name: # optional name for the experiment
44
+ group: # optional group
45
+
46
+ # SLURM launcher configuration.
47
+ slurm:
48
+ gpus: 4 # convenience parameter, number of GPUs to use.
49
+ mem_per_gpu: 40 # in GB, total mem is automatically scaled with `gpus`.
50
+ time: 3600
51
+ constraint:
52
+ partition:
53
+ comment:
54
+ setup: []
55
+ exclude: ''
56
+
57
+ # dora parameters
58
+ dora:
59
+ # Output folder for all artifacts of an experiment.
60
+ dir: /checkpoint/${oc.env:USER}/experiments/audiocraft/outputs
61
+ # The following entries will be ignored by dora when computing the unique XP signature.
62
+ # Note that slurm.* and dora.* are automatically ignored.
63
+ exclude: [
64
+ 'device', 'wandb.*', 'tensorboard.*', 'logging.*',
65
+ 'dataset.num_workers', 'eval.num_workers', 'special.*',
66
+ 'metrics.visqol.bin', 'metrics.fad.bin',
67
+ 'execute_only', 'execute_best', 'generate.every',
68
+ 'optim.eager_sync', 'profiler.*', 'deadlock.*',
69
+ 'efficient_attention_backend', 'num_threads', 'mp_start_method',
70
+ ]
71
+ use_rendezvous: false
72
+ # for grids, always run from a clean repo, allowing reliable runs and storing
73
+ # the exact commit. Your repo must be absolutely pristine clean.
74
+ # Local `dora run` are not impacted for easier debugging.
75
+ git_save: true
config/dset/audio/audiocaps_16khz.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # AudioCaps dataset
4
+ datasource:
5
+ max_sample_rate: 16000
6
+ max_channels: 1
7
+
8
+ train: null # only evaluation set
9
+ valid: null # only evaluation set
10
+ evaluate: egs/audiocaps/audiocaps_16khz
11
+ generate: egs/audiocaps/audiocaps_16khz # identical to evaluate
config/dset/audio/default.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ datasource:
4
+ max_sample_rate: ???
5
+ max_channels: ???
6
+
7
+ train: ???
8
+ valid: ???
9
+ evaluate: ???
10
+ generate: null
config/dset/audio/example.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ datasource:
4
+ max_sample_rate: 44100
5
+ max_channels: 2
6
+
7
+ train: egs/example
8
+ valid: egs/example
9
+ evaluate: egs/example
10
+ generate: egs/example
config/dset/audio/musiccaps_32khz.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # total samples obtained from MusicCaps = 5469
4
+ # (out of 5521 due to AudioSet corrupted samples)
5
+ datasource:
6
+ max_sample_rate: 32000
7
+ max_channels: 2
8
+
9
+ train: null # only evaluation set
10
+ valid: null # only evaluation set
11
+ evaluate: egs/musiccaps/musiccaps_32khz
12
+ generate: egs/musiccaps/musiccaps_32khz # identical to evaluate
config/dset/default.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # WARNING: This is a base configuration file shared across ALL solvers in AudioCraft
4
+ # Please don't update this file directly. Instead use distinct configuration files
5
+ # to override the below configuration.
6
+ datasource:
7
+ train: ???
8
+ valid: ???
9
+ evaluate: ???
10
+ generate: ???
config/dset/internal/music_10k_32khz.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # high quality music dataset with no artist overlap between splits
4
+ datasource:
5
+ max_sample_rate: 32000
6
+ max_channels: 1
7
+
8
+ train: egs/music/music_10k_32khz/train
9
+ valid: egs/music/music_10k_32khz/valid
10
+ evaluate: egs/music/music_10k_32khz/test
11
+ generate: egs/music/music_10k_32khz/test # identical to evaluate
config/dset/internal/music_400k_32khz.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ datasource:
4
+ max_sample_rate: 32000
5
+ max_channels: 1
6
+
7
+ train: egs/music/music_400k_32khz/train
8
+ valid: egs/music/music_400k_32khz/valid
9
+ evaluate: egs/music/music_400k_32khz/test
10
+ generate: egs/music/music_400k_32khz/test # identical to evaluate
config/dset/internal/sounds_16khz.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # environmental sounds dataset compiling all datasets
4
+ # with applied filters on tags
5
+ datasource:
6
+ max_sample_rate: 16000
7
+ max_channels: 1
8
+
9
+ train: egs/sound/sounds_16khz/train
10
+ valid: egs/sound/sounds_16khz/valid
11
+ evaluate: egs/sound/sounds_16khz/test
12
+ generate: egs/sound/sounds_16khz/test # identical to evaluate
config/model/encodec/default.yaml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ compression_model: encodec
4
+
5
+ encodec:
6
+ autoencoder: seanet
7
+ quantizer: rvq
8
+ sample_rate: ${sample_rate}
9
+ channels: ${channels}
10
+ causal: false
11
+ renormalize: false
12
+
13
+ seanet:
14
+ dimension: 128
15
+ channels: ${channels}
16
+ causal: ${encodec.causal}
17
+ n_filters: 32
18
+ n_residual_layers: 1
19
+ ratios: [8, 5, 4, 2]
20
+ activation: ELU
21
+ activation_params: {"alpha": 1.}
22
+ norm: weight_norm
23
+ norm_params: {}
24
+ kernel_size: 7
25
+ residual_kernel_size: 3
26
+ last_kernel_size: 7
27
+ dilation_base: 2
28
+ pad_mode: constant
29
+ true_skip: true
30
+ compress: 2
31
+ lstm: 2
32
+ disable_norm_outer_blocks: 0
33
+ # Specific encoder or decoder params.
34
+ # You can also override any param for the encoder or decoder only
35
+ # by using Hydra `+param=` syntax, i.e.`
36
+ # `+seanet.decoder.n_filters=64`.
37
+ decoder:
38
+ trim_right_ratio: 1.0
39
+ final_activation: null
40
+ final_activation_params: null
41
+ encoder: {}
42
+
43
+ rvq:
44
+ n_q: 8
45
+ q_dropout: false
46
+ bins: 1024
47
+ decay: 0.99
48
+ kmeans_init: true
49
+ kmeans_iters: 50
50
+ threshold_ema_dead_code: 2
51
+ orthogonal_reg_weight: 0.0
52
+ orthogonal_reg_active_codes_only: false
53
+
54
+ no_quant: {}
config/model/encodec/encodec_base_causal.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - encodec/default
5
+
6
+ encodec:
7
+ causal: true
8
+
9
+ rvq:
10
+ n_q: 32
11
+ q_dropout: true
config/model/encodec/encodec_large_nq4_s320.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - encodec/default
5
+
6
+ seanet:
7
+ # default ratios are [8, 5, 4, 2]
8
+ n_filters: 64
9
+
10
+ rvq:
11
+ bins: 2048
12
+ n_q: 4
13
+ q_dropout: false
config/model/encodec/encodec_large_nq4_s640.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - encodec/default
5
+
6
+ seanet:
7
+ ratios: [8, 5, 4, 4]
8
+ n_filters: 64
9
+
10
+ rvq:
11
+ bins: 2048
12
+ n_q: 4
13
+ q_dropout: false
config/model/lm/audiogen_lm.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - lm/default
5
+ - override /conditioner: text2sound
6
+ - override /model/lm/model_scale: small # prefer this group to set model scale instead of transformer_lm keys directly
7
+
8
+ lm_model: transformer_lm
9
+
10
+ codebooks_pattern:
11
+ modeling: delay
12
+ delay:
13
+ delays: [0, 1, 2, 3]
14
+ flatten_first: 0
15
+ empty_initial: 0
16
+ unroll:
17
+ flattening: [0, 1, 2, 3]
18
+ delays: [0, 0, 0, 0]
19
+ music_lm:
20
+ group_by: 2
21
+ valle:
22
+ delays: [0, 0, 0]
23
+
24
+ transformer_lm:
25
+ n_q: 4
26
+ card: 2048
27
+ memory_efficient: true
28
+ bias_proj: false
29
+ bias_ff: false
30
+ bias_attn: false
31
+ norm_first: true
32
+ layer_scale: null
33
+ weight_init: gaussian
34
+ depthwise_init: current
35
+ zero_bias_init: true
36
+ attention_as_float32: false
config/model/lm/default.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+ defaults:
3
+ - _self_
4
+ - /model/lm/model_scale: base # prefer this group to set model scale instead of transformer_lm keys directly
5
+
6
+ lm_model: transformer_lm
7
+
8
+ codebooks_pattern:
9
+ modeling: parallel
10
+
11
+ transformer_lm:
12
+ dim: 512
13
+ num_heads: 8
14
+ num_layers: 8
15
+ hidden_scale: 4
16
+ n_q: 8 # number of streams to model
17
+ card: 1024
18
+ dropout: 0.
19
+ emb_lr: null
20
+ activation: gelu
21
+ norm_first: false # use pre-norm instead of post-norm
22
+ bias_ff: true # use bias for the feedforward
23
+ bias_attn: true # use bias for the attention
24
+ bias_proj: true # use bias for the output projections
25
+ past_context: null
26
+ causal: true
27
+ custom: false # use custom MHA implementation
28
+ memory_efficient: false # use flash attention
29
+ attention_as_float32: false # use float32 for the attention part,
30
+ # recommended at the moment when memory_efficient is True.
31
+ layer_scale: null
32
+ positional_embedding: sin # positional embedding strategy (sin, rope, or sin_rope).
33
+ xpos: false # apply xpos decay (rope only).
34
+ checkpointing: none # layer checkpointing method, can be none, torch, xformers_default.
35
+ # torch is the slowest but uses the least memory,
36
+ # xformers_default is somewhere in between.
37
+ weight_init: null # weight initialization (null, gaussian or uniform)
38
+ depthwise_init: null # perform depthwise initialization (null, current, global)
39
+ zero_bias_init: false # initialize bias to zero if bias in linears and
40
+ # if a weight_init method is used.
41
+ norm: layer_norm # normalization method to use in transformer.
42
+ cross_attention: false
43
+ qk_layer_norm: false
44
+ qk_layer_norm_cross: false
45
+ attention_dropout: null
46
+ kv_repeat: 1
47
+ two_step_cfg: false # whether to do true 2 steps CFG, potentially resolving some padding issues or not...
config/model/lm/model_scale/base.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # @package __global__
2
+
3
+ # overrides nothing because default is already transformer base (~ 60M params)
config/model/lm/model_scale/large.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # gpt2 inspired, even bigger (~3.3B params)
4
+ transformer_lm:
5
+ dim: 2048
6
+ num_heads: 32
7
+ num_layers: 48
config/model/lm/model_scale/medium.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # gpt2 like (~1.5B params)
4
+ transformer_lm:
5
+ dim: 1536
6
+ num_heads: 24
7
+ num_layers: 48
config/model/lm/model_scale/small.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ # 300M Param.
4
+
5
+ transformer_lm:
6
+ dim: 1024
7
+ num_heads: 16
8
+ num_layers: 24
config/model/lm/model_scale/xsmall.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+ # just used for debugging or when we just want to populate the cache
3
+ # and do not care about training.
4
+
5
+ transformer_lm:
6
+ dim: 64
7
+ num_heads: 2
8
+ num_layers: 2
config/model/lm/musicgen_lm.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - lm/default
5
+ - override /conditioner: text2music
6
+ - override /model/lm/model_scale: small # prefer this group to set model scale instead of transformer_lm keys directly
7
+
8
+ lm_model: transformer_lm
9
+
10
+ codebooks_pattern:
11
+ modeling: delay
12
+ delay:
13
+ delays: [0, 1, 2, 3]
14
+ flatten_first: 0
15
+ empty_initial: 0
16
+ unroll:
17
+ flattening: [0, 1, 2, 3]
18
+ delays: [0, 0, 0, 0]
19
+ music_lm:
20
+ group_by: 2
21
+ valle:
22
+ delays: [0, 0, 0]
23
+
24
+ transformer_lm:
25
+ n_q: 4
26
+ card: 2048
27
+ memory_efficient: true
28
+ bias_proj: false
29
+ bias_ff: false
30
+ bias_attn: false
31
+ norm_first: true
32
+ layer_scale: null
33
+ weight_init: gaussian
34
+ depthwise_init: current
35
+ zero_bias_init: true
36
+ attention_as_float32: false
config/model/none.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # This file exist so that model is recognized as a config group
4
+ # by Hydra, and Dora. A bit weird we might need a better fix someday.
config/model/score/basic.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package _global_
2
+
3
+ diffusion_unet:
4
+ hidden: 48
5
+ depth: 4
6
+ res_blocks: 1
7
+ norm_groups: 4
8
+ kernel: 8
9
+ stride: 4
10
+ growth: 4
11
+ max_channels: 10_000
12
+ dropout: 0.
13
+ emb_all_layers: true
14
+ bilstm: false
15
+ codec_dim: null
16
+ transformer: false
17
+ cross_attention: false
config/solver/audiogen/audiogen_base_16khz.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # This is the training loop solver
4
+ # for the base AudioGen model (text-to-sound)
5
+ # on monophonic audio sampled at 16 kHz
6
+ # using a similar EnCodec+LM setup to MusicGen
7
+ defaults:
8
+ - audiogen/default
9
+ - /model: lm/audiogen_lm
10
+ - override /dset: audio/default
11
+ - _self_
12
+
13
+ autocast: true
14
+ autocast_dtype: float16
15
+
16
+ # EnCodec large trained on mono-channel music audio sampled at 16khz
17
+ # with a total stride of 320 leading to 50 frames/s.
18
+ # rvq.n_q=4, rvq.bins=2048, no quantization dropout
19
+ # (transformer_lm card and n_q must be compatible)
20
+ compression_model_checkpoint: //reference/bd44a852/checkpoint.th
21
+
22
+ channels: 1
23
+ sample_rate: 16000
24
+
25
+ deadlock:
26
+ use: true # deadlock detection
27
+
28
+ dataset:
29
+ batch_size: 128 # matching AudioGen paper setup (256 * mix_p=0.5 = 128)
30
+ num_workers: 10
31
+ segment_duration: 10
32
+ min_segment_ratio: 1.0
33
+ sample_on_weight: false # Uniform sampling all the way
34
+ sample_on_duration: false # Uniform sampling all the way
35
+ external_metadata_source: null
36
+ # sample mixing augmentation at train time
37
+ train:
38
+ batch_size: 256 # matching AudioGen paper setup
39
+ aug_p: 0.5 # perform audio mixing 50% of the time
40
+ mix_p: 0.5 # proportion of batch items mixed together
41
+ # important: note that this will reduce the
42
+ # actual batch size used at train time
43
+ # which will be equal to mix_p * batch_size
44
+ mix_snr_low: -5
45
+ mix_snr_high: 5
46
+ mix_min_overlap: 0.5
47
+
48
+ generate:
49
+ lm:
50
+ use_sampling: true
51
+ top_k: 250
52
+ top_p: 0.0
53
+
54
+ optim:
55
+ epochs: 100
56
+ optimizer: adamw
57
+ lr: 5e-4
58
+ ema:
59
+ use: true
60
+ updates: 10
61
+ device: cuda
62
+
63
+ logging:
64
+ log_tensorboard: true
65
+
66
+ schedule:
67
+ lr_scheduler: inverse_sqrt
68
+ inverse_sqrt:
69
+ warmup: 3000
70
+ warmup_init_lr: 0.0
config/solver/audiogen/debug.yaml ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # This is a minimal debugging configuration
4
+ # for MusicGen training solver
5
+ defaults:
6
+ - audiogen/default
7
+ - /model: lm/audiogen_lm
8
+ - override /model/lm/model_scale: xsmall
9
+ - override /dset: audio/example
10
+ - _self_
11
+
12
+ autocast: false
13
+ compression_model_checkpoint: null
14
+
15
+ codebooks_pattern:
16
+ modeling: parallel
17
+
18
+ channels: 1
19
+ sample_rate: 16000
20
+
21
+ deadlock:
22
+ use: false # deadlock detection
23
+
24
+ dataset:
25
+ batch_size: 4
26
+ segment_duration: 5
27
+ sample_on_weight: false # Uniform sampling all the way
28
+ sample_on_duration: false # Uniform sampling all the way
29
+
30
+ generate:
31
+ audio:
32
+ strategy: peak
33
+ lm:
34
+ use_sampling: false
35
+ top_k: 0
36
+ top_p: 0.0
37
+
38
+ checkpoint:
39
+ save_every: 0
40
+ keep_last: 0
41
+
42
+ optim:
43
+ epochs: 2
44
+ updates_per_epoch: 10
45
+ optimizer: adamw
46
+ lr: 1e-4
47
+
48
+ logging:
49
+ log_tensorboard: true
50
+
51
+ schedule:
52
+ lr_scheduler: null
config/solver/audiogen/default.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - /solver/musicgen/default
5
+ - _self_
6
+ - /solver/audiogen/evaluation: none
7
+ - override /dset: audio/default
8
+
9
+ # See config/solver/musicgen/default.yaml for a list of possible values.
10
+ # We only keep the most important here.
11
+
12
+ autocast: true
13
+ autocast_dtype: float16
14
+
15
+ solver: audiogen
16
+ sample_rate: ???
17
+ channels: ???
18
+ compression_model_checkpoint: ???
19
+
20
+ tokens:
21
+ padding_with_special_token: false
22
+
23
+ dataset:
24
+ batch_size: 128
25
+ segment_duration: 10
26
+ min_segment_ratio: 1.0 # lower values such as 0.5 result in generations with a lot of silence.
27
+
28
+ optim:
29
+ epochs: 100
30
+ updates_per_epoch: 2000
31
+ lr: 1e-4
32
+ optimizer: adamw
33
+ max_norm: 1.0
34
+ adam:
35
+ betas: [0.9, 0.95]
36
+ weight_decay: 0.1
37
+ eps: 1e-8
38
+
39
+ schedule:
40
+ lr_scheduler: null
config/solver/audiogen/evaluation/none.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ dataset:
4
+ evaluate:
5
+ num_samples: 10000
config/solver/audiogen/evaluation/objective_eval.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # Setup for execute only on audiocaps for audio generation
4
+ # evaluation with objective metrics
5
+ # execute_only=evaluate
6
+
7
+ dataset:
8
+ max_audio_duration: null
9
+ # ensure the proper values are broadcasted here for evaluate
10
+ evaluate:
11
+ min_audio_duration: 1. # some metrics requires a minimum audio length
12
+ max_audio_duration: null # all samples from audiocaps should be ~10s
13
+ num_samples: null
14
+ segment_duration: null
15
+ generate:
16
+ min_audio_duration: 1.
17
+ max_audio_duration: null
18
+ num_samples: 500
19
+
20
+ evaluate:
21
+ metrics:
22
+ fad: true
23
+ kld: true
24
+ text_consistency: true
25
+
26
+ metrics:
27
+ kld:
28
+ passt:
29
+ pretrained_length: 10 # similarly to reported results in AudioGen paper
config/solver/compression/debug.yaml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - compression/default
5
+ - /model: encodec/encodec_base_causal
6
+ - override /dset: audio/example
7
+ - _self_
8
+
9
+ channels: 1
10
+ sample_rate: 16000
11
+
12
+ # debug config uses just L1
13
+ losses:
14
+ adv: 0.
15
+ feat: 0.
16
+ l1: 1.
17
+ mel: 0.
18
+ msspec: 0.
19
+ # no balancer
20
+ balancer:
21
+ balance_grads: false
22
+ ema_decay: 1.
23
+ total_norm: 1.
24
+ per_batch_item: false
25
+ # no adversaries
26
+ adversarial:
27
+ adversaries: []
28
+ adv_loss: hinge
29
+ feat_loss: l1
30
+
31
+ # faster model for local dev
32
+ seanet:
33
+ dimension: 16
34
+ n_filters: 4
35
+
36
+ # very small dataset
37
+ dataset:
38
+ batch_size: 8
39
+ num_workers: 10
40
+ num_samples: 100
41
+ segment_duration: 1
42
+ evaluate:
43
+ batch_size: 32
44
+ generate:
45
+ batch_size: 1
46
+ num_samples: 5
47
+ segment_duration: 10
48
+
49
+ # limited training
50
+ evaluate:
51
+ every: 5
52
+ generate:
53
+ every: 5
54
+ optim:
55
+ epochs: 50
config/solver/compression/default.yaml ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - ../default
5
+ - override /dset: audio/default
6
+ - _self_
7
+
8
+ solver: compression
9
+ sample_rate: ???
10
+ channels: ???
11
+
12
+ # loss balancing
13
+ losses:
14
+ adv: 4.
15
+ feat: 4.
16
+ l1: 0.1
17
+ mel: 0.
18
+ msspec: 2.
19
+ sisnr: 0.
20
+ balancer:
21
+ balance_grads: true
22
+ ema_decay: 0.999
23
+ per_batch_item: true
24
+ total_norm: 1.
25
+
26
+ adversarial:
27
+ every: 1
28
+ adversaries: [msstftd]
29
+ adv_loss: hinge
30
+ feat_loss: l1
31
+
32
+ # losses hyperparameters
33
+ l1: {}
34
+ l2: {}
35
+ mrstft:
36
+ factor_sc: .5
37
+ factor_mag: .5
38
+ normalized: false
39
+ mel:
40
+ sample_rate: ${sample_rate}
41
+ n_fft: 1024
42
+ hop_length: 256
43
+ win_length: 1024
44
+ n_mels: 64
45
+ f_min: 64
46
+ f_max: null
47
+ normalized: false
48
+ floor_level: 1e-5
49
+ sisnr:
50
+ sample_rate: ${sample_rate}
51
+ segment: 5.
52
+ msspec:
53
+ sample_rate: ${sample_rate}
54
+ range_start: 6
55
+ range_end: 11
56
+ n_mels: 64
57
+ f_min: 64
58
+ f_max: null
59
+ normalized: true
60
+ alphas: false
61
+ floor_level: 1e-5
62
+
63
+ # metrics
64
+ metrics:
65
+ visqol:
66
+ mode: audio
67
+ bin: null # path to visqol install
68
+ model: tcdaudio14_aacvopus_coresv_svrnsim_n.68_g.01_c1.model # visqol v3
69
+
70
+ # adversaries hyperparameters
71
+ msstftd:
72
+ in_channels: 1
73
+ out_channels: 1
74
+ filters: 32
75
+ norm: weight_norm
76
+ n_ffts: [1024, 2048, 512, 256, 128]
77
+ hop_lengths: [256, 512, 128, 64, 32]
78
+ win_lengths: [1024, 2048, 512, 256, 128]
79
+ activation: LeakyReLU
80
+ activation_params: {negative_slope: 0.3}
81
+ msd:
82
+ in_channels: 1
83
+ out_channels: 1
84
+ scale_norms: [spectral_norm, weight_norm, weight_norm]
85
+ kernel_sizes: [5, 3]
86
+ filters: 16
87
+ max_filters: 1024
88
+ downsample_scales: [4, 4, 4, 4]
89
+ inner_kernel_sizes: null
90
+ groups: [4, 4, 4, 4]
91
+ strides: null
92
+ paddings: null
93
+ activation: LeakyReLU
94
+ activation_params: {negative_slope: 0.3}
95
+ mpd:
96
+ in_channels: 1
97
+ out_channels: 1
98
+ periods: [2, 3, 5, 7, 11]
99
+ n_layers: 5
100
+ kernel_size: 5
101
+ stride: 3
102
+ filters: 8
103
+ filter_scales: 4
104
+ max_filters: 1024
105
+ activation: LeakyReLU
106
+ activation_params: {negative_slope: 0.3}
107
+ norm: weight_norm
108
+
109
+ # data hyperparameters
110
+ dataset:
111
+ batch_size: 64
112
+ num_workers: 10
113
+ segment_duration: 1
114
+ train:
115
+ num_samples: 500000
116
+ valid:
117
+ num_samples: 10000
118
+ evaluate:
119
+ batch_size: 32
120
+ num_samples: 10000
121
+ generate:
122
+ batch_size: 32
123
+ num_samples: 50
124
+ segment_duration: 10
125
+
126
+ # solver hyperparameters
127
+ evaluate:
128
+ every: 25
129
+ num_workers: 5
130
+ metrics:
131
+ visqol: false
132
+ sisnr: true
133
+ generate:
134
+ every: 25
135
+ num_workers: 5
136
+ audio:
137
+ sample_rate: ${sample_rate}
138
+
139
+ # checkpointing schedule
140
+ checkpoint:
141
+ save_last: true
142
+ save_every: 25
143
+ keep_last: 10
144
+ keep_every_states: null
145
+
146
+ # optimization hyperparameters
147
+ optim:
148
+ epochs: 200
149
+ updates_per_epoch: 2000
150
+ lr: 3e-4
151
+ max_norm: 0.
152
+ optimizer: adam
153
+ adam:
154
+ betas: [0.5, 0.9]
155
+ weight_decay: 0.
156
+ ema:
157
+ use: true # whether to use EMA or not
158
+ updates: 1 # update at every step
159
+ device: ${device} # device for EMA, can be put on GPU if more frequent updates
160
+ decay: 0.99 # EMA decay value, if null, no EMA is used
config/solver/compression/encodec_audiogen_16khz.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - compression/default
5
+ - /model: encodec/encodec_large_nq4_s320
6
+ - override /dset: audio/default
7
+ - _self_
8
+
9
+ channels: 1
10
+ sample_rate: 16000
config/solver/compression/encodec_base_24khz.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - compression/default
5
+ - /model: encodec/encodec_base_causal
6
+ - override /dset: audio/default
7
+ - _self_
8
+
9
+ channels: 1
10
+ sample_rate: 24000
config/solver/compression/encodec_musicgen_32khz.yaml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - compression/default
5
+ - /model: encodec/encodec_large_nq4_s640
6
+ - override /dset: audio/default
7
+ - _self_
8
+
9
+ channels: 1
10
+ sample_rate: 32000
config/solver/default.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ # WARNING: This is a base configuration file shared across ALL solvers in AudioCraft
4
+ # Please don't update this file directly. Instead use distinct configuration files
5
+ # to override the below configuration.
6
+ solver: ???
7
+
8
+ fsdp:
9
+ use: false # whether to use FSDP (Fully Sharded Data Parallel).
10
+ param_dtype: float16 # equivalent to autocast_dtype for FSDP.
11
+ reduce_dtype: float32 # gradient averaging dtype, float32 will give max stability.
12
+ buffer_dtype: float32 # dtype used for buffers, we don't have much buffers, so let's leave it.
13
+ sharding_strategy: shard_grad_op # can be shard_grad_op or full_shard.
14
+ # full_shard uses less memory but may be slower (unverified).
15
+ per_block: true # If True, uses nested FSDP.
16
+
17
+ profiler:
18
+ enabled: false
19
+
20
+ deadlock:
21
+ use: false
22
+ timeout: 600
23
+
24
+ dataset:
25
+ batch_size: ???
26
+ num_workers: 10
27
+ segment_duration: null
28
+ num_samples: null
29
+ return_info: false
30
+ shuffle: false
31
+ sample_on_duration: true
32
+ sample_on_weight: true
33
+ min_segment_ratio: 0.5
34
+ train:
35
+ num_samples: null
36
+ shuffle: true
37
+ shuffle_seed: 0 # change this seed to shuffle the training data in a different order.
38
+ permutation_on_files: false
39
+ valid:
40
+ num_samples: null
41
+ evaluate:
42
+ num_samples: null
43
+ generate:
44
+ num_samples: null
45
+ return_info: true
46
+
47
+ checkpoint:
48
+ save_last: true
49
+ save_every: null
50
+ keep_last: null
51
+ keep_every_states: null
52
+
53
+ generate:
54
+ every: null
55
+ path: 'samples'
56
+ audio:
57
+ format: 'mp3'
58
+ strategy: 'clip'
59
+ sample_rate: null
60
+ lm:
61
+ use_sampling: false
62
+ temp: 1.0
63
+ top_k: 0
64
+ top_p: 0.0
65
+ evaluate:
66
+ every: null
67
+ num_workers: 5
68
+ truncate_audio: null
69
+ fixed_generation_duration: null # in secs
70
+ metrics:
71
+ base: true # run default evaluation (e.g. like train/valid stage)
72
+
73
+ optim:
74
+ epochs: ???
75
+ updates_per_epoch: null
76
+ lr: ???
77
+ optimizer: ???
78
+ adam:
79
+ betas: [0.9, 0.999]
80
+ weight_decay: 0.
81
+ ema:
82
+ use: false # whether to use EMA or not
83
+ updates: ${optim.updates_per_epoch} # frequency of updates of the EMA
84
+ device: cpu # device for EMA, can be put on GPU if more frequent updates
85
+ decay: 0.99 # EMA decay value, if null, no EMA is used
86
+
87
+ schedule:
88
+ lr_scheduler: null
89
+ step:
90
+ step_size: null
91
+ gamma: null
92
+ exponential:
93
+ lr_decay: null
94
+ cosine:
95
+ warmup: null
96
+ lr_min_ratio: 0.0
97
+ cycle_length: 1.0
98
+ polynomial_decay:
99
+ warmup: null
100
+ zero_lr_warmup_steps: 0
101
+ end_lr: 0.0
102
+ power: 1
103
+ inverse_sqrt:
104
+ warmup: null
105
+ warmup_init_lr: 0.0
106
+ linear_warmup:
107
+ warmup: null
108
+ warmup_init_lr: 0.0
config/solver/diffusion/debug.yaml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @package __global__
2
+
3
+ defaults:
4
+ - /solver/default
5
+ - /model: score/basic
6
+ - override /dset: audio/default
7
+ - _self_
8
+
9
+ solver: diffusion
10
+
11
+ sample_rate: 16000
12
+ channels: 1
13
+ compression_model_checkpoint: //sig/5091833e
14
+ n_q: 2 # number of codebooks to keep
15
+
16
+ dataset:
17
+ batch_size: 8
18
+ num_workers: 10
19
+ segment_duration: 1
20
+ train:
21
+ num_samples: 100
22
+ valid:
23
+ num_samples: 100
24
+ evaluate:
25
+ batch_size: 8
26
+ num_samples: 10
27
+ generate:
28
+ batch_size: 8
29
+ num_samples: 10
30
+ segment_duration: 10
31
+
32
+ loss:
33
+ kind: mse
34
+ norm_power: 0.
35
+
36
+ valid:
37
+ every: 1
38
+
39
+ evaluate:
40
+ every: 5
41
+ num_workers: 5
42
+ metrics:
43
+ visqol: false
44
+ sisnr: false
45
+ rvm: true
46
+
47
+ generate:
48
+ every: 5
49
+ num_workers: 5
50
+ audio:
51
+ sample_rate: ${sample_rate}
52
+
53
+ checkpoint:
54
+ save_last: true
55
+ save_every: 25
56
+ keep_last: 10
57
+ keep_every_states: null
58
+
59
+
60
+ optim:
61
+ epochs: 50
62
+ updates_per_epoch: 2000
63
+ lr: 2e-4
64
+ max_norm: 0
65
+ optimizer: adam
66
+ adam:
67
+ betas: [0.9, 0.999]
68
+ weight_decay: 0.
69
+ ema:
70
+ use: true # whether to use EMA or not
71
+ updates: 1 # update at every step
72
+ device: ${device} # device for EMA, can be put on GPU if more frequent updates
73
+ decay: 0.99 # EMA decay value, if null, no EMA is used
74
+
75
+ processor:
76
+ name: multi_band_processor
77
+ use: false
78
+ n_bands: 8
79
+ num_samples: 10_000
80
+ power_std: 1.
81
+
82
+ resampling:
83
+ use: false
84
+ target_sr: 16000
85
+
86
+ filter:
87
+ use: false
88
+ n_bands: 4
89
+ idx_band: 0
90
+ cutoffs: null
91
+
92
+ schedule:
93
+ repartition: "power"
94
+ variable_step_batch: true
95
+ beta_t0: 1.0e-5
96
+ beta_t1: 2.9e-2
97
+ beta_exp: 7.5
98
+ num_steps: 1000
99
+ variance: 'beta'
100
+ clip: 5.
101
+ rescale: 1.
102
+ n_bands: null
103
+ noise_scale: 1.0
104
+
105
+ metrics:
106
+ num_stage: 4