adefossez committed on
Commit
5238467
·
0 Parent(s):

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .github/actions/audiocraft_build/action.yml +29 -0
  2. .github/workflows/audiocraft_docs.yml +32 -0
  3. .github/workflows/audiocraft_linter.yml +17 -0
  4. .github/workflows/audiocraft_tests.yml +17 -0
  5. .gitignore +55 -0
  6. CHANGELOG.md +9 -0
  7. CODE_OF_CONDUCT.md +80 -0
  8. CONTRIBUTING.md +35 -0
  9. LICENSE +21 -0
  10. LICENSE_weights +157 -0
  11. MANIFEST.in +8 -0
  12. MODEL_CARD.md +81 -0
  13. Makefile +21 -0
  14. README.md +96 -0
  15. app.py +116 -0
  16. app_batched.py +111 -0
  17. assets/bach.mp3 +0 -0
  18. audiocraft/__init__.py +10 -0
  19. audiocraft/data/__init__.py +8 -0
  20. audiocraft/data/audio.py +213 -0
  21. audiocraft/data/audio_dataset.py +525 -0
  22. audiocraft/data/audio_utils.py +169 -0
  23. audiocraft/data/zip.py +74 -0
  24. audiocraft/models/__init__.py +10 -0
  25. audiocraft/models/builders.py +218 -0
  26. audiocraft/models/encodec.py +302 -0
  27. audiocraft/models/lm.py +526 -0
  28. audiocraft/models/loaders.py +65 -0
  29. audiocraft/models/musicgen.py +288 -0
  30. audiocraft/modules/__init__.py +20 -0
  31. audiocraft/modules/activations.py +96 -0
  32. audiocraft/modules/codebooks_patterns.py +539 -0
  33. audiocraft/modules/conditioners.py +986 -0
  34. audiocraft/modules/conv.py +245 -0
  35. audiocraft/modules/lstm.py +25 -0
  36. audiocraft/modules/rope.py +124 -0
  37. audiocraft/modules/seanet.py +258 -0
  38. audiocraft/modules/streaming.py +135 -0
  39. audiocraft/modules/transformer.py +704 -0
  40. audiocraft/py.typed +0 -0
  41. audiocraft/quantization/__init__.py +9 -0
  42. audiocraft/quantization/base.py +107 -0
  43. audiocraft/quantization/core_vq.py +400 -0
  44. audiocraft/quantization/vq.py +116 -0
  45. audiocraft/utils/__init__.py +5 -0
  46. audiocraft/utils/autocast.py +40 -0
  47. audiocraft/utils/export.py +56 -0
  48. audiocraft/utils/notebook.py +32 -0
  49. audiocraft/utils/utils.py +234 -0
  50. demo.ipynb +235 -0
.github/actions/audiocraft_build/action.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: audiocraft_build
2
+ description: 'Build audiocraft env.'
3
+ runs:
4
+ using: "composite"
5
+ steps:
6
+ - uses: actions/setup-python@v2
7
+ with:
8
+ python-version: 3.8
9
+ - uses: actions/cache@v2
10
+ id: cache
11
+ with:
12
+ path: env
13
+ key: audiocraft_env-${{ hashFiles('**/requirements.txt') }}
14
+
15
+ - if: ${{ steps.cache.outputs.cache-hit != 'true' }}
16
+ name: Install dependencies
17
+ shell: bash
18
+ run: |
19
+ sudo apt-get update
20
+ sudo apt-get install libsndfile1-dev ffmpeg
21
+ python3 -m venv env
22
+ . env/bin/activate
23
+ python -m pip install --upgrade pip
24
+ pip install -e '.[dev]'
25
+ - name: System Dependencies
26
+ shell: bash
27
+ run: |
28
+ sudo apt-get update
29
+ sudo apt-get install libsndfile1-dev ffmpeg
.github/workflows/audiocraft_docs.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: audiocraft_docs
2
+ on:
3
+ push:
4
+ branches: [ main ]
5
+
6
+ jobs:
7
+ run_docs:
8
+ name: Run docs
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v2
12
+ - uses: ./.github/actions/audiocraft_build
13
+ - name: Config git
14
+ run: |
15
+ git config --global user.email "defossez@fb.com"
16
+ git config --global user.name "Alexandre Défossez (autodoc)"
17
+
18
+ - name: Reset branch
19
+ run: |
20
+ git branch -f gh-docs main
21
+ git checkout gh-docs
22
+
23
+ - name: Make docs
24
+ run: |
25
+ . env/bin/activate
26
+ make docs
27
+ git add -f docs
28
+ git commit -m docs
29
+
30
+ - name: Push branch
31
+ run: |
32
+ git push -f -u origin gh-docs
.github/workflows/audiocraft_linter.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: audiocraft_linter
2
+ on:
3
+ push:
4
+ branches: [ main ]
5
+ pull_request:
6
+ branches: [ main ]
7
+
8
+ jobs:
9
+ run_linter:
10
+ name: Run linter
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - uses: ./.github/actions/audiocraft_build
15
+ - run: |
16
+ . env/bin/activate
17
+ make linter
.github/workflows/audiocraft_tests.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: audiocraft_tests
2
+ on:
3
+ push:
4
+ branches: [ main ]
5
+ pull_request:
6
+ branches: [ main ]
7
+
8
+ jobs:
9
+ run_tests:
10
+ name: Run tests
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v2
14
+ - uses: ./.github/actions/audiocraft_build
15
+ - run: |
16
+ . env/bin/activate
17
+ make tests
.gitignore ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # macOS dir files
10
+ .DS_Store
11
+
12
+ # Distribution / packaging
13
+ .Python
14
+ env/
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ *.egg-info/
28
+ .installed.cfg
29
+ *.egg
30
+ .ipynb_checkpoints
31
+
32
+ # Tests and linter
33
+ .pytest_cache/
34
+ .mypy_cache/
35
+ .coverage
36
+
37
+ # docs
38
+ /docs
39
+
40
+ # dotenv
41
+ .env
42
+ .envrc
43
+
44
+ # virtualenv
45
+ .venv
46
+ venv/
47
+ ENV/
48
+
49
+ # personal notebooks & scripts
50
+ */local_scripts
51
+ */notes
52
+ .vscode/
53
+ /notebooks
54
+ /local_scripts
55
+ /notes
CHANGELOG.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
+
7
+ ## [0.0.1a] - TBD
8
+
9
+ Initial release, with model evaluation only.
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Audiocraft
2
+
3
+ We want to make contributing to this project as easy and transparent as
4
+ possible.
5
+
6
+ ## Pull Requests
7
+
8
+ Audiocraft is the implementation of a research paper.
9
+ Therefore, we do not plan on accepting many pull requests for new features.
10
+ We certainly welcome them for bug fixes.
11
+
12
+ 1. Fork the repo and create your branch from `main`.
13
+ 2. If you've added code that should be tested, add tests.
14
+ 3. If you've changed APIs, update the documentation.
15
+ 4. Ensure the test suite passes.
16
+ 5. Make sure your code lints.
17
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
18
+
19
+ ## Contributor License Agreement ("CLA")
20
+ In order to accept your pull request, we need you to submit a CLA. You only need
21
+ to do this once to work on any of Meta's open source projects.
22
+
23
+ Complete your CLA here: <https://code.facebook.com/cla>
24
+
25
+ ## Issues
26
+ We use GitHub issues to track public bugs. Please ensure your description is
27
+ clear and has sufficient instructions to be able to reproduce the issue.
28
+
29
+ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
30
+ disclosure of security bugs. In those cases, please go through the process
31
+ outlined on that page and do not file a public issue.
32
+
33
+ ## License
34
+ By contributing to Audiocraft, you agree that your contributions will be licensed
35
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Meta Platforms, Inc. and affiliates.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
LICENSE_weights ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Attribution-NonCommercial-NoDerivatives 4.0 International
2
+
3
+ > *Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.*
4
+ >
5
+ > ### Using Creative Commons Public Licenses
6
+ >
7
+ > Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
8
+ >
9
+ > * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
10
+ >
11
+ > * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
12
+
13
+ ## Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License
14
+
15
+ By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
16
+
17
+ ### Section 1 – Definitions.
18
+
19
+ a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
20
+
21
+ b. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
22
+
23
+ e. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
24
+
25
+ f. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
26
+
27
+ h. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
28
+
29
+ i. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
30
+
31
+ h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
32
+
33
+ i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
34
+
35
+ j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
36
+
37
+ k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
38
+
39
+ l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
40
+
41
+ ### Section 2 – Scope.
42
+
43
+ a. ___License grant.___
44
+
45
+ 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
46
+
47
+ A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
48
+
49
+ B. produce and reproduce, but not Share, Adapted Material for NonCommercial purposes only.
50
+
51
+ 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
52
+
53
+ 3. __Term.__ The term of this Public License is specified in Section 6(a).
54
+
55
+ 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
56
+
57
+ 5. __Downstream recipients.__
58
+
59
+ A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
60
+
61
+ B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
62
+
63
+ 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
64
+
65
+ b. ___Other rights.___
66
+
67
+ 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
68
+
69
+ 2. Patent and trademark rights are not licensed under this Public License.
70
+
71
+ 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
72
+
73
+ ### Section 3 – License Conditions.
74
+
75
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
76
+
77
+ a. ___Attribution.___
78
+
79
+ 1. If You Share the Licensed Material, You must:
80
+
81
+ A. retain the following if it is supplied by the Licensor with the Licensed Material:
82
+
83
+ i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
84
+
85
+ ii. a copyright notice;
86
+
87
+ iii. a notice that refers to this Public License;
88
+
89
+ iv. a notice that refers to the disclaimer of warranties;
90
+
91
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
92
+
93
+ B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
94
+
95
+ C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
96
+
97
+ For the avoidance of doubt, You do not have permission under this Public License to Share Adapted Material.
98
+
99
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
100
+
101
+ 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
102
+
103
+ ### Section 4 – Sui Generis Database Rights.
104
+
105
+ Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
106
+
107
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only and provided You do not Share Adapted Material;
108
+
109
+ b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
110
+
111
+ c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
112
+
113
+ For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
114
+
115
+ ### Section 5 – Disclaimer of Warranties and Limitation of Liability.
116
+
117
+ a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
118
+
119
+ b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
120
+
121
+ c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
122
+
123
+ ### Section 6 – Term and Termination.
124
+
125
+ a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
126
+
127
+ b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
128
+
129
+ 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
130
+
131
+ 2. upon express reinstatement by the Licensor.
132
+
133
+ For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
134
+
135
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
136
+
137
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
138
+
139
+ ### Section 7 – Other Terms and Conditions.
140
+
141
+ a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
142
+
143
+ b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
144
+
145
+ ### Section 8 – Interpretation.
146
+
147
+ a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
148
+
149
+ b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
150
+
151
+ c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
152
+
153
+ d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
154
+
155
+ > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
156
+ >
157
+ > Creative Commons may be contacted at [creativecommons.org](http://creativecommons.org).
MANIFEST.in ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ include Makefile
2
+ include LICENSE
3
+ include LICENSE_weights
4
+ include *.md
5
+ include *.ini
6
+ include requirements.txt
7
+ include audiocraft/py.typed
8
+ include assets/*.mp3
MODEL_CARD.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MusicGen Model Card
2
+
3
+ ## Model details
4
+
5
+ **Organization developing the model:** The FAIR team of Meta AI.
6
+
7
+ **Model date:** MusicGen was trained between April 2023 and May 2023.
8
+
9
+ **Model version:** This is the version 1 of the model.
10
+
11
+ **Model type:** MusicGen consists of an EnCodec model for audio tokenization and an auto-regressive language model based on the transformer architecture for music modeling. The model comes in different sizes: 300M, 1.5B and 3.3B parameters; and two variants: a model trained for the text-to-music generation task and a model trained for melody-guided music generation.
12
+
13
+ **Paper or resources for more information:** More information can be found in the paper [Simple and Controllable Music Generation][arxiv].
14
+
15
+ **Citation details** See [our paper][arxiv]
16
+
17
+ **License** Code is released under MIT, model weights are released under CC-BY-NC 4.0.
18
+
19
+ **Where to send questions or comments about the model:** Questions and comments about MusicGen can be sent via the [Github repository](https://github.com/facebookresearch/audiocraft) of the project, or by opening an issue.
20
+
21
+ ## Intended use
22
+ **Primary intended use:** The primary use of MusicGen is research on AI-based music generation, including:
23
+
24
+ - Research efforts, such as probing and better understanding the limitations of generative models to further improve the state of science
25
+ - Generation of music guided by text or melody to understand current abilities of generative AI models by machine learning amateurs
26
+
27
+ **Primary intended users:** The primary intended users of the model are researchers in audio, machine learning and artificial intelligence, as well as amateurs seeking to better understand those models.
28
+
29
+ **Out-of-scope use cases** The model should not be used on downstream applications without further risk evaluation and mitigation. The model should not be used to intentionally create or disseminate music pieces that create hostile or alienating environments for people. This includes generating music that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
30
+
31
+ ## Metrics
32
+
33
+ **Model performance measures:** We used the following objective measures to evaluate the model on a standard music benchmark:
34
+
35
+ - Frechet Audio Distance computed on features extracted from a pre-trained audio classifier (VGGish)
36
+ - Kullback-Leibler Divergence on label distributions extracted from a pre-trained audio classifier (PaSST)
37
+ - CLAP Score between audio embedding and text embedding extracted from a pre-trained CLAP model
38
+
39
+ Additionally, we run qualitative studies with human participants, evaluating the performance of the model with the following axes:
40
+
41
+ - Overall quality of the music samples;
42
+ - Text relevance to the provided text input;
43
+ - Adherence to the melody for melody-guided music generation.
44
+
45
+ More details on performance measures and human studies can be found in the paper.
46
+
47
+ **Decision thresholds:** Not applicable.
48
+
49
+ ## Evaluation datasets
50
+
51
+ The model was evaluated on the [MusicCaps benchmark](https://www.kaggle.com/datasets/googleai/musiccaps) and on an in-domain held-out evaluation set, with no artist overlap with the training set.
52
+
53
+ ## Training datasets
54
+
55
+ The model was trained using the following sources: the [Meta Music Initiative Sound Collection](https://www.fb.com/sound), [Shutterstock music collection](https://www.shutterstock.com/music) and the [Pond5 music collection](https://www.pond5.com/). See the paper for more details about the training set and corresponding preprocessing.
56
+
57
+ ## Quantitative analysis
58
+
59
+ More information can be found in the paper [Simple and Controllable Music Generation][arxiv], in the Experimental Setup section.
60
+
61
+ ## Limitations and biases
62
+
63
+ **Data:** The data sources used to train the model are created by music professionals and covered by legal agreements with the right holders. The model is trained on 20K hours of data, we believe that scaling the model on larger datasets can further improve the performance of the model.
64
+
65
+ **Mitigations:** All vocals have been removed from the data source using a state-of-the-art music source separation method, namely using the open source [Hybrid Transformer for Music Source Separation](https://github.com/facebookresearch/demucs) (HT-Demucs). The model is therefore not able to produce vocals.
66
+
67
+ **Limitations:**
68
+
69
+ - The model is not able to generate realistic vocals.
70
+ - The model has been trained with English descriptions and will not perform as well in other languages.
71
+ - The model does not perform equally well for all music styles and cultures.
72
+ - The model sometimes generates the end of a song, collapsing to silence.
73
+ - It is sometimes difficult to assess what types of text descriptions provide the best generations. Prompt engineering may be required to obtain satisfying results.
74
+
75
+ **Biases:** The source of data is potentially lacking diversity and all music cultures are not equally represented in the dataset. The model may not perform equally well on the wide variety of music genres that exists. The generated samples from the model will reflect the biases from the training data. Further work on this model should include methods for balanced and just representations of cultures, for example, by scaling the training data to be both diverse and inclusive.
76
+
77
+ **Risks and harms:** Biases and limitations of the model may lead to generation of samples that may be considered as biased, inappropriate or offensive. We believe that providing the code to reproduce the research and train new models will allow the application to be broadened to new and more representative data.
78
+
79
+ **Use cases:** Users must be aware of the biases, limitations and risks of the model. MusicGen is a model developed for artificial intelligence research on controllable music generation. As such, it should not be used for downstream applications without further investigation and mitigation of risks.
80
+
81
+ [arxiv]: https://arxiv.org/abs/2306.05284
Makefile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Running `make` with no target lints the code and then runs the test suite.
default: linter tests

# Upgrade pip, then install the package in editable mode with its dev extras.
install:
	pip install -U pip
	pip install -U -e '.[dev]'

# Style (flake8) and type (mypy) checks for the package and for the tests.
linter:
	flake8 audiocraft && mypy audiocraft
	flake8 tests && mypy tests

# Run the tests under coverage and report coverage for the package only.
tests:
	coverage run -m pytest tests
	coverage report --include 'audiocraft/*'

# Build the HTML API documentation into ./docs.
docs:
	pdoc3 --html -o docs -f audiocraft

# Build a source distribution.
dist:
	python setup.py sdist

# Every target here is a command, not a file to build; listing all of them keeps
# a same-named file or directory (e.g. `tests/`, `docs/`) from masking a target.
.PHONY: default install linter tests docs dist
README.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audiocraft
2
+ ![docs badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_docs/badge.svg)
3
+ ![linter badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_linter/badge.svg)
4
+ ![tests badge](https://github.com/facebookresearch/audiocraft/workflows/audiocraft_tests/badge.svg)
5
+
6
+ Audiocraft is a PyTorch library for deep learning research on audio generation. At the moment, it contains the code for MusicGen, a state-of-the-art controllable text-to-music model.
7
+
8
+ ## MusicGen
9
+
10
+ Audiocraft provides the code and models for MusicGen, [a simple and controllable model for music generation][arxiv]. MusicGen is a single stage auto-regressive
11
+ Transformer model trained over a 32kHz <a href="https://github.com/facebookresearch/encodec">EnCodec tokenizer</a> with 4 codebooks sampled at 50 Hz. Unlike existing methods like [MusicLM](https://arxiv.org/abs/2301.11325), MusicGen doesn't require a self-supervised semantic representation, and it generates
12
+ all 4 codebooks in one pass. By introducing a small delay between the codebooks, we show we can predict
13
+ them in parallel, thus having only 50 auto-regressive steps per second of audio.
14
+ Check out our [sample page][musicgen_samples] or test the available demo!
15
+
16
+ <a target="_blank" href="https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing">
17
+ <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
18
+ </a>
19
+ <a target="_blank" href="https://huggingface.co/spaces/facebook/MusicGen">
20
+ <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm.svg" alt="Open in Hugging Face"/>
21
+ </a>
22
+ <br>
23
+
24
+ ## Installation
25
+ Audiocraft requires Python 3.9, PyTorch 2.0.0, and a GPU with at least 16 GB of memory (for the medium-sized model). To install Audiocraft, you can run the following:
26
+
27
+ ```shell
28
+ # Best to make sure you have torch installed first, in particular before installing xformers.
29
+ # Don't run this if you already have PyTorch installed.
30
+ pip install 'torch>=2.0'
31
+ # Then proceed to one of the following
32
+ pip install -U audiocraft # stable release
33
+ pip install -U git+https://git@github.com/facebookresearch/audiocraft#egg=audiocraft # bleeding edge
34
+ pip install -e . # or if you cloned the repo locally
35
+ ```
36
+
37
+ ## Usage
38
+ You can play with MusicGen by running the jupyter notebook at [`demo.ipynb`](./demo.ipynb) locally, or use the provided [colab notebook](https://colab.research.google.com/drive/1fxGqfg96RBUvGxZ1XXN07s3DthrKUl4-?usp=sharing). Finally, a demo is also available on the [`facebook/MusicGen` Hugging Face Space](https://huggingface.co/spaces/facebook/MusicGen) (huge thanks to all the HF team for their support).
39
+
40
+ ## API
41
+
42
+ We provide a simple API and 4 pre-trained models. The pre-trained models are:
43
+ - `small`: 300M model, text to music only,
44
+ - `medium`: 1.5B model, text to music only,
45
+ - `melody`: 1.5B model, text to music and text+melody to music,
46
+ - `large`: 3.3B model, text to music only.
47
+
48
+ We observe the best trade-off between quality and compute with the `medium` or `melody` model.
49
+ In order to use MusicGen locally **you must have a GPU**. We recommend 16GB of memory, but smaller
50
+ GPUs will be able to generate short sequences, or longer sequences with the `small` model.
51
+
52
+ See below for a quick example of using the API.
53
+
54
+ ```python
55
+ import torchaudio
56
+ from audiocraft.models import MusicGen
57
+ from audiocraft.data.audio import audio_write
58
+
59
+ model = MusicGen.get_pretrained('melody')
60
+ model.set_generation_params(duration=8) # generate 8 seconds.
61
+ wav = model.generate_unconditional(4) # generates 4 unconditional audio samples
62
+ descriptions = ['happy rock', 'energetic EDM', 'sad jazz']
63
+ wav = model.generate(descriptions) # generates 3 samples.
64
+
65
+ melody, sr = torchaudio.load('./assets/bach.mp3')
66
+ # generates using the melody from the given audio and the provided descriptions.
67
+ wav = model.generate_with_chroma(descriptions, melody[None].expand(3, -1, -1), sr)
68
+
69
+ for idx, one_wav in enumerate(wav):
70
+ # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
71
+ audio_write(f'{idx}', one_wav, model.sample_rate, strategy="loudness")
72
+ ```
73
+
74
+
75
+ ## Model Card
76
+
77
+ See [the model card page](./MODEL_CARD.md).
78
+
79
+ ## FAQ
80
+
81
+ #### Will the training code be released?
82
+
83
+ Yes. We will soon release the training code for MusicGen and EnCodec.
84
+
85
+
86
+ ## Citation
87
+ ```
88
+ bib here
89
+ ```
90
+
91
+ ## License
92
+ * The code in this repository is released under the MIT license as found in the [LICENSE file](LICENSE).
93
+ * The weights in this repository are released under the CC-BY-NC 4.0 license as found in the [LICENSE_weights file](LICENSE_weights).
94
+
95
+ [arxiv]: https://arxiv.org/abs/2306.05284
96
+ [musicgen_samples]: https://ai.honu.io/papers/musicgen/
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+
5
+ This source code is licensed under the license found in the
6
+ LICENSE file in the root directory of this source tree.
7
+ """
8
+
9
+ import torch
10
+ import gradio as gr
11
+ from hf_loading import get_pretrained
12
+
13
+
14
+ MODEL = None
15
+
16
+
17
def load_model(version):
    """Announce on stdout which checkpoint is requested, then fetch it.

    Args:
        version: Name of the pretrained model, forwarded unchanged to `get_pretrained`.
    Returns:
        Whatever `get_pretrained` returns for that name.
    """
    print("Loading model", version)
    loaded = get_pretrained(version)
    return loaded
20
+
21
+
22
def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
    """Generate one clip for the demo from a text prompt and an optional melody.

    The loaded model is cached in the module-level ``MODEL`` and is only reloaded
    when the requested checkpoint name differs from the cached one.

    Args:
        model: Name of the checkpoint to use (compared against ``MODEL.name``).
        text: Text description used to condition the generation.
        melody: Optional ``(sample_rate, samples)`` pair where ``samples`` is a
            numpy array. When falsy, generation is conditioned on text only.
        duration: Requested length of the generated clip, in seconds.
        topk: Top-k sampling parameter; cast to ``int`` before use.
        topp: Top-p sampling parameter.
        temperature: Sampling temperature.
        cfg_coef: Classifier-free guidance coefficient.
    Returns:
        Tuple ``(sample_rate, samples)`` where ``samples`` is the generated audio
        as a numpy array.
    Raises:
        gr.Error: If ``duration`` exceeds the segment duration the model was
            configured with.
    """
    global MODEL
    topk = int(topk)
    if MODEL is None or MODEL.name != model:
        MODEL = load_model(model)

    # Read the limit once: it bounds both the requested duration and the melody length.
    max_duration = MODEL.lm.cfg.dataset.segment_duration
    if duration > max_duration:
        # Report the actual configured limit rather than a hard-coded number.
        raise gr.Error(f"MusicGen currently supports durations of up to {max_duration:g} seconds!")
    MODEL.set_generation_params(
        use_sampling=True,
        top_k=topk,
        top_p=topp,
        temperature=temperature,
        cfg_coef=cfg_coef,
        duration=duration,
    )

    if melody:
        sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
        # A 1-D (mono) input is only 2-D at this point; add the missing leading axis.
        if melody.dim() == 2:
            melody = melody[None]
        # Never condition on more audio than the model can generate.
        melody = melody[..., :int(sr * max_duration)]
        output = MODEL.generate_with_chroma(
            descriptions=[text],
            melody_wavs=melody,
            melody_sample_rate=sr,
            progress=False
        )
    else:
        output = MODEL.generate(descriptions=[text], progress=False)

    output = output.detach().cpu().numpy()
    return MODEL.sample_rate, output
56
+
57
+
58
# Gradio layout of the single-request demo: inputs and sampling controls in the
# first column, the generated audio in the second one.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# MusicGen

This is the demo for MusicGen, a simple and controllable model for music generation presented at: "Simple and Controllable Music Generation".

Below we present 3 model variations:
1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
2. Small -- a 300M transformer decoder conditioned on text only.
3. Medium -- a 1.5B transformer decoder conditioned on text only.

See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
for more details.
"""
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Input Text", interactive=True)
                # type="numpy": `predict` reads this value as melody[0] / melody[1].
                melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
            with gr.Row():
                submit = gr.Button("Submit")
            with gr.Row():
                model = gr.Radio(["melody", "medium", "small"], label="Model", value="melody", interactive=True)
            with gr.Row():
                # The slider range stops at 30; `predict` separately checks the
                # duration against the model's configured segment duration.
                duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True)
            with gr.Row():
                topk = gr.Number(label="Top-k", value=250, interactive=True)
                topp = gr.Number(label="Top-p", value=0, interactive=True)
                temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
                cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
        with gr.Column():
            output = gr.Audio(label="Generated Music", type="numpy")
    # The component order here matches the positional parameters of `predict`.
    submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
    # NOTE(review): `predict` takes eight parameters starting with `model`, while the
    # examples only provide [text, melody, model] - confirm the examples are only used
    # to pre-fill the inputs and are never run through `fn`.
    gr.Examples(
        fn=predict,
        examples=[
            [
                "An 80s driving pop song with heavy drums and synth pads in the background",
                "./assets/bach.mp3",
                "melody"
            ],
            [
                "90s rock song with electric guitar and heavy drums",
                None,
                "medium"
            ],
            [
                "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
                "./assets/bach.mp3",
                "melody"
            ]
        ],
        inputs=[text, melody, model],
        outputs=[output]
    )

# Start the web server for the demo.
demo.launch()
app_batched.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ All rights reserved.
4
+
5
+ This source code is licensed under the license found in the
6
+ LICENSE file in the root directory of this source tree.
7
+ """
8
+
9
+ from tempfile import NamedTemporaryFile
10
+ import torch
11
+ import gradio as gr
12
+ from audiocraft.data.audio_utils import convert_audio
13
+ from audiocraft.data.audio import audio_write
14
+ from hf_loading import get_pretrained
15
+
16
+
17
+ MODEL = None
18
+
19
+
20
def load_model():
    """Log to stdout, then fetch the "melody" checkpoint via `get_pretrained`."""
    print("Loading model")
    melody_model = get_pretrained("melody")
    return melody_model
23
+
24
+
25
def predict(texts, melodies):
    """Generate one fixed-length clip per (text, melody) pair of a batch.

    The model is loaded on first use and kept in the module-level ``MODEL``.

    Args:
        texts: Text descriptions, one per request of the batch.
        melodies: For each request, either ``None`` or a ``(sample_rate, samples)``
            pair where ``samples`` is a numpy array.
    Returns:
        A list with one single-element list per generated clip, holding the path
        of the wav file written for it.
    """
    global MODEL
    if MODEL is None:
        MODEL = load_model()

    clip_seconds = 12
    MODEL.set_generation_params(duration=clip_seconds)

    print(texts, melodies)

    # Every melody is converted to this sample rate / channel count before conditioning.
    chroma_sr = 32000
    chroma_channels = 1
    conditioning = []
    for entry in melodies:
        if entry is None:
            conditioning.append(None)
            continue
        sr, samples = entry[0], entry[1]
        wav = torch.from_numpy(samples).to(MODEL.device).float().t()
        if wav.dim() == 1:
            # 1-D input: add a leading axis so the tensor is 2-D.
            wav = wav[None]
        # Keep at most as much melody as will be generated.
        wav = wav[..., :int(sr * clip_seconds)]
        conditioning.append(convert_audio(wav, sr, chroma_sr, chroma_channels))

    generated = MODEL.generate_with_chroma(
        descriptions=texts,
        melody_wavs=conditioning,
        melody_sample_rate=chroma_sr,
        progress=False
    )
    generated = generated.detach().cpu().float()

    out_files = []
    for clip in generated:
        # delete=False: the file has to outlive this function so it can be served.
        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as handle:
            audio_write(handle.name, clip, MODEL.sample_rate, strategy="loudness", add_suffix=False)
            out_files.append([handle.name])
    return out_files
63
+
64
+
65
# Gradio layout of the batched demo: a text prompt and an optional melody in,
# one generated wav file out.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# MusicGen

This is the demo for MusicGen, a simple and controllable model for music generation
presented at: "Simple and Controllable Music Generation".

Enter the description of the music you want and an optional audio used for melody conditioning.
This will generate a 12s extract with the `melody` model. For generating longer sequences
(up to 30 seconds), use the Colab demo or your own GPU.

See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
for more details.
"""
    )
    with gr.Row():
        with gr.Column():
            with gr.Row():
                text = gr.Text(label="Input Text", interactive=True)
                # type="numpy": `predict` reads each melody as melody[0] / melody[1].
                melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
            with gr.Row():
                submit = gr.Button("Submit")
        with gr.Column():
            # type="filepath": `predict` returns paths of wav files it wrote.
            output = gr.Audio(label="Generated Music", type="filepath", format="wav")
    # batch=True: `predict` is written to receive lists (it iterates over `melodies`).
    submit.click(predict, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=12)
    # NOTE(review): `predict` expects batched (list) arguments, but no batch flag is
    # passed here - confirm the examples are never executed through `fn`.
    gr.Examples(
        fn=predict,
        examples=[
            [
                "An 80s driving pop song with heavy drums and synth pads in the background",
                "./assets/bach.mp3",
            ],
            [
                "90s rock song with electric guitar and heavy drums",
                None,
            ],
            [
                "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
                "./assets/bach.mp3",
            ]
        ],
        inputs=[text, melody],
        outputs=[output]
    )

# Start the web server for the demo.
demo.launch()
assets/bach.mp3 ADDED
Binary file (160 kB). View file
 
audiocraft/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # flake8: noqa
8
+ from . import data, modules, models
9
+
10
+ __version__ = '0.0.1'
audiocraft/data/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # flake8: noqa
8
+ from . import audio, audio_dataset
audiocraft/data/audio.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Audio IO methods are defined in this module (info, read, write),
9
+ We rely on av library for faster read when possible, otherwise on torchaudio.
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ import logging
15
+ import typing as tp
16
+
17
+ import numpy as np
18
+ import soundfile
19
+ import torch
20
+ from torch.nn import functional as F
21
+ import torchaudio as ta
22
+
23
+ import av
24
+
25
+ from .audio_utils import f32_pcm, i16_pcm, normalize_audio
26
+
27
+
28
# Set to True once `_init_av` has run, so the logger tweak is applied only once.
_av_initialized = False


def _init_av():
    """Raise the level of the 'libav.mp3' logger to ERROR, at most once.

    Subsequent calls are no-ops thanks to the module-level `_av_initialized` flag.
    """
    global _av_initialized
    if not _av_initialized:
        mp3_logger = logging.getLogger('libav.mp3')
        mp3_logger.setLevel(logging.ERROR)
        _av_initialized = True
38
+
39
+
40
@dataclass(frozen=True)
class AudioFileInfo:
    """Immutable summary of an audio file, as built by `_av_info` and `_soundfile_info`."""
    # Sample rate reported by the backend that inspected the file.
    sample_rate: int
    # Presumably in seconds: the av backend computes it as stream.duration * stream.time_base.
    duration: float
    # Number of audio channels reported by the backend.
    channels: int
45
+
46
+
47
def _av_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Describe the first audio stream of a file using PyAV.

    Args:
        filepath (str or Path): Path of the audio file to inspect.
    Returns:
        AudioFileInfo: Sample rate, duration and channel count of the first audio stream.
    """
    _init_av()
    with av.open(str(filepath)) as container:
        audio_stream = container.streams.audio[0]
        rate = audio_stream.codec_context.sample_rate
        # The stream duration is expressed in units of `time_base`.
        length = float(audio_stream.duration * audio_stream.time_base)
        n_channels = audio_stream.channels
        return AudioFileInfo(sample_rate=rate, duration=length, channels=n_channels)
55
+
56
+
57
def _soundfile_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Describe an audio file using the metadata returned by `soundfile.info`.

    Args:
        filepath (str or Path): Path of the audio file to inspect.
    Returns:
        AudioFileInfo: Sample rate, duration and channel count reported by soundfile.
    """
    meta = soundfile.info(filepath)
    return AudioFileInfo(sample_rate=meta.samplerate, duration=meta.duration, channels=meta.channels)
60
+
61
+
62
def audio_info(filepath: tp.Union[str, Path]) -> AudioFileInfo:
    """Return basic information about an audio file, picking the backend from its suffix.

    Files ending in '.flac' or '.ogg' are inspected with soundfile, everything
    else with PyAV.

    Args:
        filepath (str or Path): Path of the audio file to inspect.
    Returns:
        AudioFileInfo: Sample rate, duration and channel count of the file.
    """
    # torchaudio no longer returns useful duration informations for some formats like mp3s.
    path = Path(filepath)
    # ffmpeg has some weird issue with flac.
    # TODO: Validate .ogg can be safely read with av_info
    use_soundfile = path.suffix in ('.flac', '.ogg')
    probe = _soundfile_info if use_soundfile else _av_info
    return probe(path)
70
+
71
+
72
def _av_read(filepath: tp.Union[str, Path], seek_time: float = 0, duration: float = -1.) -> tp.Tuple[torch.Tensor, int]:
    """FFMPEG-based audio file reading using PyAV bindings.
    Soundfile cannot read mp3 and av_read is more efficient than torchaudio.

    Only the first audio stream of the file is decoded.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
            A duration of 0 also reads up to the end of the file, since trimming only
            happens for a strictly positive number of frames.
    Returns:
        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate
    """
    _init_av()
    with av.open(str(filepath)) as af:
        stream = af.streams.audio[0]
        sr = stream.codec_context.sample_rate
        # Requested length and start position, both converted to a number of samples.
        num_frames = int(sr * duration) if duration >= 0 else -1
        frame_offset = int(sr * seek_time)
        # we need a small negative offset otherwise we get some edge artifact
        # from the mp3 decoder.
        af.seek(int(max(0, (seek_time - 0.1)) / stream.time_base), stream=stream)
        frames = []
        length = 0
        for frame in af.decode(streams=stream.index):
            # Position of this frame in samples; since we seeked slightly early, the
            # first frames may start before `frame_offset` and need their head stripped.
            current_offset = int(frame.rate * frame.pts * frame.time_base)
            strip = max(0, frame_offset - current_offset)
            buf = torch.from_numpy(frame.to_ndarray())
            if buf.shape[0] != stream.channels:
                # presumably an interleaved (packed) layout - reshape so channels come first; TODO confirm
                buf = buf.view(-1, stream.channels).t()
            buf = buf[:, strip:]
            frames.append(buf)
            length += buf.shape[1]
            # Stop decoding as soon as enough samples were collected.
            if num_frames > 0 and length >= num_frames:
                break
        assert frames
        # If the above assert fails, it is likely because we seeked past the end of file point,
        # in which case ffmpeg returns a single frame with only zeros, and a weird timestamp.
        # This will need proper debugging, in due time.
        wav = torch.cat(frames, dim=1)
        assert wav.shape[0] == stream.channels
        if num_frames > 0:
            # The last decoded frame may overshoot the requested length.
            wav = wav[:, :num_frames]
        return f32_pcm(wav), sr
114
+
115
+
116
def audio_read(filepath: tp.Union[str, Path], seek_time: float = 0.,
               duration: float = -1., pad: bool = False) -> tp.Tuple[torch.Tensor, int]:
    """Load audio from a file, choosing the reading backend from the file suffix.

    '.flac' and '.ogg' files go through soundfile. '.wav' and '.mp3' files that
    torchaudio's sox backend lists as readable go through torchaudio, but only when
    the whole file is requested. Everything else is decoded with PyAV.

    Args:
        filepath (str or Path): Path to audio file to read.
        seek_time (float): Time at which to start reading in the file.
        duration (float): Duration to read from the file. If set to -1, the whole file is read.
        pad (bool): Pad output audio if not reaching expected duration.
    Returns:
        Tuple[torch.Tensor, int]: Tuple containing audio data and sample rate.
    """
    path = Path(filepath)
    ext = path.suffix
    if ext in ('.flac', '.ogg'):  # TODO: check if we can safely use av_read for .ogg
        # There is some bug with ffmpeg and reading flac
        meta = _soundfile_info(filepath)
        start = int(seek_time * meta.sample_rate)
        count = int(duration * meta.sample_rate) if duration > 0 else -1
        data, sr = soundfile.read(filepath, start=start, frames=count, dtype=np.float32)
        assert meta.sample_rate == sr, f"Mismatch of sample rates {meta.sample_rate} {sr}"
        wav = torch.from_numpy(data).t().contiguous()
        if wav.dim() == 1:
            # 1-D result: add a leading axis so the tensor is always 2-D.
            wav = wav.unsqueeze(0)
    elif (ext in ('.wav', '.mp3') and ext[1:] in ta.utils.sox_utils.list_read_formats()
          and duration <= 0 and seek_time == 0):
        # Torchaudio is faster if we load an entire file at once.
        wav, sr = ta.load(path)
    else:
        wav, sr = _av_read(filepath, seek_time, duration)
    if pad and duration > 0:
        # Right-pad with zeros up to the requested number of samples.
        missing = int(duration * sr) - wav.shape[-1]
        wav = F.pad(wav, (0, missing))
    return wav, sr
151
+
152
+
153
def audio_write(stem_name: tp.Union[str, Path],
                wav: torch.Tensor, sample_rate: int,
                format: str = 'wav', mp3_rate: int = 320, normalize: bool = True,
                strategy: str = 'peak', peak_clip_headroom_db: float = 1,
                rms_headroom_db: float = 18, loudness_headroom_db: float = 14,
                log_clipping: bool = True, make_parent_dir: bool = True,
                add_suffix: bool = True) -> Path:
    """Normalize a waveform and save it to disk as wav or mp3, returning the written path.

    Args:
        stem_name (str or Path): Filename without extension; the extension is appended
            automatically unless `add_suffix` is False.
        wav (torch.Tensor): Floating point audio with at most 2 dimensions; a 1-D
            tensor gets a leading axis added. All values must be finite.
        sample_rate (int): Sample rate of `wav`, passed on to the writer.
        format (str): Either "wav" or "mp3". Wav files are written as 16 bit signed PCM.
        mp3_rate (int): kbps when using mp3s.
        normalize (bool): if `True` (default), normalizes according to the prescribed
            strategy (see after). If `False`, the strategy is only used in case clipping
            would happen.
        strategy (str): Forwarded to `normalize_audio`. 'peak' (default) normalizes by the
            largest value, 'rms' by root-mean-square with extra headroom to avoid clipping,
            'clip' just clips. Callers in this repository also pass 'loudness'.
        peak_clip_headroom_db (float): Headroom in dB when doing 'peak' or 'clip' strategy.
        rms_headroom_db (float): Headroom in dB when doing 'rms' strategy. This must be much
            larger than the `peak_clip` one to avoid further clipping.
        loudness_headroom_db (float): Target loudness for loudness normalization.
        log_clipping (bool): If True, basic logging on stderr when clipping still
            occurs despite strategy (only for 'rms').
        make_parent_dir (bool): Make parent directory if it doesn't exist.
        add_suffix (bool): If False, `stem_name` is used as the full filename and no
            extension is appended.
    Returns:
        Path: Path of the saved audio.
    Raises:
        ValueError: If `wav` has more than 2 dimensions.
        RuntimeError: If `format` is neither "wav" nor "mp3".
    """
    assert wav.dtype.is_floating_point, "wav is not floating point"
    n_dims = wav.dim()
    if n_dims > 2:
        raise ValueError("Input wav should be at most 2 dimension.")
    if n_dims == 1:
        wav = wav[None]
    assert wav.isfinite().all()
    wav = normalize_audio(wav, normalize, strategy, peak_clip_headroom_db,
                          rms_headroom_db, loudness_headroom_db, log_clipping=log_clipping,
                          sample_rate=sample_rate, stem_name=str(stem_name))
    save_options: dict
    if format == 'wav':
        wav = i16_pcm(wav)
        extension = '.wav'
        save_options = {"encoding": "PCM_S", "bits_per_sample": 16}
    elif format == 'mp3':
        extension = '.mp3'
        save_options = {"compression": mp3_rate}
    else:
        raise RuntimeError(f"Invalid format {format}. Only wav or mp3 are supported.")
    out_path = Path(str(stem_name) + (extension if add_suffix else ''))
    if make_parent_dir:
        out_path.parent.mkdir(exist_ok=True, parents=True)
    try:
        ta.save(out_path, wav, sample_rate, **save_options)
    except Exception:
        # we do not want to leave half written files around.
        if out_path.exists():
            out_path.unlink()
        raise
    return out_path
audiocraft/data/audio_dataset.py ADDED
@@ -0,0 +1,525 @@