arnavmehta7 committed
Commit
8520a55
1 Parent(s): 673f68d
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/assets/intro_vid.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,664 @@
1
+ GNU AFFERO GENERAL PUBLIC LICENSE
2
+ Version 3, 19 November 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+ Preamble
9
+
10
+ The GNU Affero General Public License is a free, copyleft license for
11
+ software and other kinds of works, specifically designed to ensure
12
+ cooperation with the community in the case of network server software.
13
+
14
+ The licenses for most software and other practical works are designed
15
+ to take away your freedom to share and change the works. By contrast,
16
+ our General Public Licenses are intended to guarantee your freedom to
17
+ share and change all versions of a program--to make sure it remains free
18
+ software for all its users.
19
+
20
+ When we speak of free software, we are referring to freedom, not
21
+ price. Our General Public Licenses are designed to make sure that you
22
+ have the freedom to distribute copies of free software (and charge for
23
+ them if you wish), that you receive source code or can get it if you
24
+ want it, that you can change the software or use pieces of it in new
25
+ free programs, and that you know you can do these things.
26
+
27
+ Developers that use our General Public Licenses protect your rights
28
+ with two steps: (1) assert copyright on the software, and (2) offer
29
+ you this License which gives you legal permission to copy, distribute
30
+ and/or modify the software.
31
+
32
+ A secondary benefit of defending all users' freedom is that
33
+ improvements made in alternate versions of the program, if they
34
+ receive widespread use, become available for other developers to
35
+ incorporate. Many developers of free software are heartened and
36
+ encouraged by the resulting cooperation. However, in the case of
37
+ software used on network servers, this result may fail to come about.
38
+ The GNU General Public License permits making a modified version and
39
+ letting the public access it on a server without ever releasing its
40
+ source code to the public.
41
+
42
+ The GNU Affero General Public License is designed specifically to
43
+ ensure that, in such cases, the modified source code becomes available
44
+ to the community. It requires the operator of a network server to
45
+ provide the source code of the modified version running there to the
46
+ users of that server. Therefore, public use of a modified version, on
47
+ a publicly accessible server, gives the public access to the source
48
+ code of the modified version.
49
+
50
+ An older license, called the Affero General Public License and
51
+ published by Affero, was designed to accomplish similar goals. This is
52
+ a different license, not a version of the Affero GPL, but Affero has
53
+ released a new version of the Affero GPL which permits relicensing under
54
+ this license.
55
+
56
+ The precise terms and conditions for copying, distribution and
57
+ modification follow.
58
+
59
+ TERMS AND CONDITIONS
60
+
61
+ 0. Definitions.
62
+
63
+ "This License" refers to version 3 of the GNU Affero General Public License.
64
+
65
+ "Copyright" also means copyright-like laws that apply to other kinds of
66
+ works, such as semiconductor masks.
67
+
68
+ "The Program" refers to any copyrightable work licensed under this
69
+ License. Each licensee is addressed as "you". "Licensees" and
70
+ "recipients" may be individuals or organizations.
71
+
72
+ To "modify" a work means to copy from or adapt all or part of the work
73
+ in a fashion requiring copyright permission, other than the making of an
74
+ exact copy. The resulting work is called a "modified version" of the
75
+ earlier work or a work "based on" the earlier work.
76
+
77
+ A "covered work" means either the unmodified Program or a work based
78
+ on the Program.
79
+
80
+ To "propagate" a work means to do anything with it that, without
81
+ permission, would make you directly or secondarily liable for
82
+ infringement under applicable copyright law, except executing it on a
83
+ computer or modifying a private copy. Propagation includes copying,
84
+ distribution (with or without modification), making available to the
85
+ public, and in some countries other activities as well.
86
+
87
+ To "convey" a work means any kind of propagation that enables other
88
+ parties to make or receive copies. Mere interaction with a user through
89
+ a computer network, with no transfer of a copy, is not conveying.
90
+
91
+ An interactive user interface displays "Appropriate Legal Notices"
92
+ to the extent that it includes a convenient and prominently visible
93
+ feature that (1) displays an appropriate copyright notice, and (2)
94
+ tells the user that there is no warranty for the work (except to the
95
+ extent that warranties are provided), that licensees may convey the
96
+ work under this License, and how to view a copy of this License. If
97
+ the interface presents a list of user commands or options, such as a
98
+ menu, a prominent item in the list meets this criterion.
99
+
100
+ 1. Source Code.
101
+
102
+ The "source code" for a work means the preferred form of the work
103
+ for making modifications to it. "Object code" means any non-source
104
+ form of a work.
105
+
106
+ A "Standard Interface" means an interface that either is an official
107
+ standard defined by a recognized standards body, or, in the case of
108
+ interfaces specified for a particular programming language, one that
109
+ is widely used among developers working in that language.
110
+
111
+ The "System Libraries" of an executable work include anything, other
112
+ than the work as a whole, that (a) is included in the normal form of
113
+ packaging a Major Component, but which is not part of that Major
114
+ Component, and (b) serves only to enable use of the work with that
115
+ Major Component, or to implement a Standard Interface for which an
116
+ implementation is available to the public in source code form. A
117
+ "Major Component", in this context, means a major essential component
118
+ (kernel, window system, and so on) of the specific operating system
119
+ (if any) on which the executable work runs, or a compiler used to
120
+ produce the work, or an object code interpreter used to run it.
121
+
122
+ The "Corresponding Source" for a work in object code form means all
123
+ the source code needed to generate, install, and (for an executable
124
+ work) run the object code and to modify the work, including scripts to
125
+ control those activities. However, it does not include the work's
126
+ System Libraries, or general-purpose tools or generally available free
127
+ programs which are used unmodified in performing those activities but
128
+ which are not part of the work. For example, Corresponding Source
129
+ includes interface definition files associated with source files for
130
+ the work, and the source code for shared libraries and dynamically
131
+ linked subprograms that the work is specifically designed to require,
132
+ such as by intimate data communication or control flow between those
133
+ subprograms and other parts of the work.
134
+
135
+ The Corresponding Source need not include anything that users
136
+ can regenerate automatically from other parts of the Corresponding
137
+ Source.
138
+
139
+ The Corresponding Source for a work in source code form is that
140
+ same work.
141
+
142
+ 2. Basic Permissions.
143
+
144
+ All rights granted under this License are granted for the term of
145
+ copyright on the Program, and are irrevocable provided the stated
146
+ conditions are met. This License explicitly affirms your unlimited
147
+ permission to run the unmodified Program. The output from running a
148
+ covered work is covered by this License only if the output, given its
149
+ content, constitutes a covered work. This License acknowledges your
150
+ rights of fair use or other equivalent, as provided by copyright law.
151
+
152
+ You may make, run and propagate covered works that you do not
153
+ convey, without conditions so long as your license otherwise remains
154
+ in force. You may convey covered works to others for the sole purpose
155
+ of having them make modifications exclusively for you, or provide you
156
+ with facilities for running those works, provided that you comply with
157
+ the terms of this License in conveying all material for which you do
158
+ not control copyright. Those thus making or running the covered works
159
+ for you must do so exclusively on your behalf, under your direction
160
+ and control, on terms that prohibit them from making any copies of
161
+ your copyrighted material outside their relationship with you.
162
+
163
+ Conveying under any other circumstances is permitted solely under
164
+ the conditions stated below. Sublicensing is not allowed; section 10
165
+ makes it unnecessary.
166
+
167
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168
+
169
+ No covered work shall be deemed part of an effective technological
170
+ measure under any applicable law fulfilling obligations under article
171
+ 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172
+ similar laws prohibiting or restricting circumvention of such
173
+ measures.
174
+
175
+ When you convey a covered work, you waive any legal power to forbid
176
+ circumvention of technological measures to the extent such circumvention
177
+ is effected by exercising rights under this License with respect to
178
+ the covered work, and you disclaim any intention to limit operation or
179
+ modification of the work as a means of enforcing, against the work's
180
+ users, your or third parties' legal rights to forbid circumvention of
181
+ technological measures.
182
+
183
+ 4. Conveying Verbatim Copies.
184
+
185
+ You may convey verbatim copies of the Program's source code as you
186
+ receive it, in any medium, provided that you conspicuously and
187
+ appropriately publish on each copy an appropriate copyright notice;
188
+ keep intact all notices stating that this License and any
189
+ non-permissive terms added in accord with section 7 apply to the code;
190
+ keep intact all notices of the absence of any warranty; and give all
191
+ recipients a copy of this License along with the Program.
192
+
193
+ You may charge any price or no price for each copy that you convey,
194
+ and you may offer support or warranty protection for a fee.
195
+
196
+ 5. Conveying Modified Source Versions.
197
+
198
+ You may convey a work based on the Program, or the modifications to
199
+ produce it from the Program, in the form of source code under the
200
+ terms of section 4, provided that you also meet all of these conditions:
201
+
202
+ a) The work must carry prominent notices stating that you modified
203
+ it, and giving a relevant date.
204
+
205
+ b) The work must carry prominent notices stating that it is
206
+ released under this License and any conditions added under section
207
+ 7. This requirement modifies the requirement in section 4 to
208
+ "keep intact all notices".
209
+
210
+ c) You must license the entire work, as a whole, under this
211
+ License to anyone who comes into possession of a copy. This
212
+ License will therefore apply, along with any applicable section 7
213
+ additional terms, to the whole of the work, and all its parts,
214
+ regardless of how they are packaged. This License gives no
215
+ permission to license the work in any other way, but it does not
216
+ invalidate such permission if you have separately received it.
217
+
218
+ d) If the work has interactive user interfaces, each must display
219
+ Appropriate Legal Notices; however, if the Program has interactive
220
+ interfaces that do not display Appropriate Legal Notices, your
221
+ work need not make them do so.
222
+
223
+ A compilation of a covered work with other separate and independent
224
+ works, which are not by their nature extensions of the covered work,
225
+ and which are not combined with it such as to form a larger program,
226
+ in or on a volume of a storage or distribution medium, is called an
227
+ "aggregate" if the compilation and its resulting copyright are not
228
+ used to limit the access or legal rights of the compilation's users
229
+ beyond what the individual works permit. Inclusion of a covered work
230
+ in an aggregate does not cause this License to apply to the other
231
+ parts of the aggregate.
232
+
233
+ 6. Conveying Non-Source Forms.
234
+
235
+ You may convey a covered work in object code form under the terms
236
+ of sections 4 and 5, provided that you also convey the
237
+ machine-readable Corresponding Source under the terms of this License,
238
+ in one of these ways:
239
+
240
+ a) Convey the object code in, or embodied in, a physical product
241
+ (including a physical distribution medium), accompanied by the
242
+ Corresponding Source fixed on a durable physical medium
243
+ customarily used for software interchange.
244
+
245
+ b) Convey the object code in, or embodied in, a physical product
246
+ (including a physical distribution medium), accompanied by a
247
+ written offer, valid for at least three years and valid for as
248
+ long as you offer spare parts or customer support for that product
249
+ model, to give anyone who possesses the object code either (1) a
250
+ copy of the Corresponding Source for all the software in the
251
+ product that is covered by this License, on a durable physical
252
+ medium customarily used for software interchange, for a price no
253
+ more than your reasonable cost of physically performing this
254
+ conveying of source, or (2) access to copy the
255
+ Corresponding Source from a network server at no charge.
256
+
257
+ c) Convey individual copies of the object code with a copy of the
258
+ written offer to provide the Corresponding Source. This
259
+ alternative is allowed only occasionally and noncommercially, and
260
+ only if you received the object code with such an offer, in accord
261
+ with subsection 6b.
262
+
263
+ d) Convey the object code by offering access from a designated
264
+ place (gratis or for a charge), and offer equivalent access to the
265
+ Corresponding Source in the same way through the same place at no
266
+ further charge. You need not require recipients to copy the
267
+ Corresponding Source along with the object code. If the place to
268
+ copy the object code is a network server, the Corresponding Source
269
+ may be on a different server (operated by you or a third party)
270
+ that supports equivalent copying facilities, provided you maintain
271
+ clear directions next to the object code saying where to find the
272
+ Corresponding Source. Regardless of what server hosts the
273
+ Corresponding Source, you remain obligated to ensure that it is
274
+ available for as long as needed to satisfy these requirements.
275
+
276
+ e) Convey the object code using peer-to-peer transmission, provided
277
+ you inform other peers where the object code and Corresponding
278
+ Source of the work are being offered to the general public at no
279
+ charge under subsection 6d.
280
+
281
+ A separable portion of the object code, whose source code is excluded
282
+ from the Corresponding Source as a System Library, need not be
283
+ included in conveying the object code work.
284
+
285
+ A "User Product" is either (1) a "consumer product", which means any
286
+ tangible personal property which is normally used for personal, family,
287
+ or household purposes, or (2) anything designed or sold for incorporation
288
+ into a dwelling. In determining whether a product is a consumer product,
289
+ doubtful cases shall be resolved in favor of coverage. For a particular
290
+ product received by a particular user, "normally used" refers to a
291
+ typical or common use of that class of product, regardless of the status
292
+ of the particular user or of the way in which the particular user
293
+ actually uses, or expects or is expected to use, the product. A product
294
+ is a consumer product regardless of whether the product has substantial
295
+ commercial, industrial or non-consumer uses, unless such uses represent
296
+ the only significant mode of use of the product.
297
+
298
+ "Installation Information" for a User Product means any methods,
299
+ procedures, authorization keys, or other information required to install
300
+ and execute modified versions of a covered work in that User Product from
301
+ a modified version of its Corresponding Source. The information must
302
+ suffice to ensure that the continued functioning of the modified object
303
+ code is in no case prevented or interfered with solely because
304
+ modification has been made.
305
+
306
+ If you convey an object code work under this section in, or with, or
307
+ specifically for use in, a User Product, and the conveying occurs as
308
+ part of a transaction in which the right of possession and use of the
309
+ User Product is transferred to the recipient in perpetuity or for a
310
+ fixed term (regardless of how the transaction is characterized), the
311
+ Corresponding Source conveyed under this section must be accompanied
312
+ by the Installation Information. But this requirement does not apply
313
+ if neither you nor any third party retains the ability to install
314
+ modified object code on the User Product (for example, the work has
315
+ been installed in ROM).
316
+
317
+ The requirement to provide Installation Information does not include a
318
+ requirement to continue to provide support service, warranty, or updates
319
+ for a work that has been modified or installed by the recipient, or for
320
+ the User Product in which it has been modified or installed. Access to a
321
+ network may be denied when the modification itself materially and
322
+ adversely affects the operation of the network or violates the rules and
323
+ protocols for communication across the network.
324
+
325
+ Corresponding Source conveyed, and Installation Information provided,
326
+ in accord with this section must be in a format that is publicly
327
+ documented (and with an implementation available to the public in
328
+ source code form), and must require no special password or key for
329
+ unpacking, reading or copying.
330
+
331
+ 7. Additional Terms.
332
+
333
+ "Additional permissions" are terms that supplement the terms of this
334
+ License by making exceptions from one or more of its conditions.
335
+ Additional permissions that are applicable to the entire Program shall
336
+ be treated as though they were included in this License, to the extent
337
+ that they are valid under applicable law. If additional permissions
338
+ apply only to part of the Program, that part may be used separately
339
+ under those permissions, but the entire Program remains governed by
340
+ this License without regard to the additional permissions.
341
+
342
+ When you convey a copy of a covered work, you may at your option
343
+ remove any additional permissions from that copy, or from any part of
344
+ it. (Additional permissions may be written to require their own
345
+ removal in certain cases when you modify the work.) You may place
346
+ additional permissions on material, added by you to a covered work,
347
+ for which you have or can give appropriate copyright permission.
348
+
349
+ Notwithstanding any other provision of this License, for material you
350
+ add to a covered work, you may (if authorized by the copyright holders of
351
+ that material) supplement the terms of this License with terms:
352
+
353
+ a) Disclaiming warranty or limiting liability differently from the
354
+ terms of sections 15 and 16 of this License; or
355
+
356
+ b) Requiring preservation of specified reasonable legal notices or
357
+ author attributions in that material or in the Appropriate Legal
358
+ Notices displayed by works containing it; or
359
+
360
+ c) Prohibiting misrepresentation of the origin of that material, or
361
+ requiring that modified versions of such material be marked in
362
+ reasonable ways as different from the original version; or
363
+
364
+ d) Limiting the use for publicity purposes of names of licensors or
365
+ authors of the material; or
366
+
367
+ e) Declining to grant rights under trademark law for use of some
368
+ trade names, trademarks, or service marks; or
369
+
370
+ f) Requiring indemnification of licensors and authors of that
371
+ material by anyone who conveys the material (or modified versions of
372
+ it) with contractual assumptions of liability to the recipient, for
373
+ any liability that these contractual assumptions directly impose on
374
+ those licensors and authors.
375
+
376
+ All other non-permissive additional terms are considered "further
377
+ restrictions" within the meaning of section 10. If the Program as you
378
+ received it, or any part of it, contains a notice stating that it is
379
+ governed by this License along with a term that is a further
380
+ restriction, you may remove that term. If a license document contains
381
+ a further restriction but permits relicensing or conveying under this
382
+ License, you may add to a covered work material governed by the terms
383
+ of that license document, provided that the further restriction does
384
+ not survive such relicensing or conveying.
385
+
386
+ If you add terms to a covered work in accord with this section, you
387
+ must place, in the relevant source files, a statement of the
388
+ additional terms that apply to those files, or a notice indicating
389
+ where to find the applicable terms.
390
+
391
+ Additional terms, permissive or non-permissive, may be stated in the
392
+ form of a separately written license, or stated as exceptions;
393
+ the above requirements apply either way.
394
+
395
+ 8. Termination.
396
+
397
+ You may not propagate or modify a covered work except as expressly
398
+ provided under this License. Any attempt otherwise to propagate or
399
+ modify it is void, and will automatically terminate your rights under
400
+ this License (including any patent licenses granted under the third
401
+ paragraph of section 11).
402
+
403
+ However, if you cease all violation of this License, then your
404
+ license from a particular copyright holder is reinstated (a)
405
+ provisionally, unless and until the copyright holder explicitly and
406
+ finally terminates your license, and (b) permanently, if the copyright
407
+ holder fails to notify you of the violation by some reasonable means
408
+ prior to 60 days after the cessation.
409
+
410
+ Moreover, your license from a particular copyright holder is
411
+ reinstated permanently if the copyright holder notifies you of the
412
+ violation by some reasonable means, this is the first time you have
413
+ received notice of violation of this License (for any work) from that
414
+ copyright holder, and you cure the violation prior to 30 days after
415
+ your receipt of the notice.
416
+
417
+ Termination of your rights under this section does not terminate the
418
+ licenses of parties who have received copies or rights from you under
419
+ this License. If your rights have been terminated and not permanently
420
+ reinstated, you do not qualify to receive new licenses for the same
421
+ material under section 10.
422
+
423
+ 9. Acceptance Not Required for Having Copies.
424
+
425
+ You are not required to accept this License in order to receive or
426
+ run a copy of the Program. Ancillary propagation of a covered work
427
+ occurring solely as a consequence of using peer-to-peer transmission
428
+ to receive a copy likewise does not require acceptance. However,
429
+ nothing other than this License grants you permission to propagate or
430
+ modify any covered work. These actions infringe copyright if you do
431
+ not accept this License. Therefore, by modifying or propagating a
432
+ covered work, you indicate your acceptance of this License to do so.
433
+
434
+ 10. Automatic Licensing of Downstream Recipients.
435
+
436
+ Each time you convey a covered work, the recipient automatically
437
+ receives a license from the original licensors, to run, modify and
438
+ propagate that work, subject to this License. You are not responsible
439
+ for enforcing compliance by third parties with this License.
440
+
441
+ An "entity transaction" is a transaction transferring control of an
442
+ organization, or substantially all assets of one, or subdividing an
443
+ organization, or merging organizations. If propagation of a covered
444
+ work results from an entity transaction, each party to that
445
+ transaction who receives a copy of the work also receives whatever
446
+ licenses to the work the party's predecessor in interest had or could
447
+ give under the previous paragraph, plus a right to possession of the
448
+ Corresponding Source of the work from the predecessor in interest, if
449
+ the predecessor has it or can get it with reasonable efforts.
450
+
451
+ You may not impose any further restrictions on the exercise of the
452
+ rights granted or affirmed under this License. For example, you may
453
+ not impose a license fee, royalty, or other charge for exercise of
454
+ rights granted under this License, and you may not initiate litigation
455
+ (including a cross-claim or counterclaim in a lawsuit) alleging that
456
+ any patent claim is infringed by making, using, selling, offering for
457
+ sale, or importing the Program or any portion of it.
458
+
459
+ 11. Patents.
460
+
461
+ A "contributor" is a copyright holder who authorizes use under this
462
+ License of the Program or a work on which the Program is based. The
463
+ work thus licensed is called the contributor's "contributor version".
464
+
465
+ A contributor's "essential patent claims" are all patent claims
466
+ owned or controlled by the contributor, whether already acquired or
467
+ hereafter acquired, that would be infringed by some manner, permitted
468
+ by this License, of making, using, or selling its contributor version,
469
+ but do not include claims that would be infringed only as a
470
+ consequence of further modification of the contributor version. For
471
+ purposes of this definition, "control" includes the right to grant
472
+ patent sublicenses in a manner consistent with the requirements of
473
+ this License.
474
+
475
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
476
+ patent license under the contributor's essential patent claims, to
477
+ make, use, sell, offer for sale, import and otherwise run, modify and
478
+ propagate the contents of its contributor version.
479
+
480
+ In the following three paragraphs, a "patent license" is any express
481
+ agreement or commitment, however denominated, not to enforce a patent
482
+ (such as an express permission to practice a patent or covenant not to
483
+ sue for patent infringement). To "grant" such a patent license to a
484
+ party means to make such an agreement or commitment not to enforce a
485
+ patent against the party.
486
+
487
+ If you convey a covered work, knowingly relying on a patent license,
488
+ and the Corresponding Source of the work is not available for anyone
489
+ to copy, free of charge and under the terms of this License, through a
490
+ publicly available network server or other readily accessible means,
491
+ then you must either (1) cause the Corresponding Source to be so
492
+ available, or (2) arrange to deprive yourself of the benefit of the
493
+ patent license for this particular work, or (3) arrange, in a manner
494
+ consistent with the requirements of this License, to extend the patent
495
+ license to downstream recipients. "Knowingly relying" means you have
496
+ actual knowledge that, but for the patent license, your conveying the
497
+ covered work in a country, or your recipient's use of the covered work
498
+ in a country, would infringe one or more identifiable patents in that
499
+ country that you have reason to believe are valid.
500
+
501
+ If, pursuant to or in connection with a single transaction or
502
+ arrangement, you convey, or propagate by procuring conveyance of, a
503
+ covered work, and grant a patent license to some of the parties
504
+ receiving the covered work authorizing them to use, propagate, modify
505
+ or convey a specific copy of the covered work, then the patent license
506
+ you grant is automatically extended to all recipients of the covered
507
+ work and works based on it.
508
+
509
+ A patent license is "discriminatory" if it does not include within
510
+ the scope of its coverage, prohibits the exercise of, or is
511
+ conditioned on the non-exercise of one or more of the rights that are
512
+ specifically granted under this License. You may not convey a covered
513
+ work if you are a party to an arrangement with a third party that is
514
+ in the business of distributing software, under which you make payment
515
+ to the third party based on the extent of your activity of conveying
516
+ the work, and under which the third party grants, to any of the
517
+ parties who would receive the covered work from you, a discriminatory
518
+ patent license (a) in connection with copies of the covered work
519
+ conveyed by you (or copies made from those copies), or (b) primarily
520
+ for and in connection with specific products or compilations that
521
+ contain the covered work, unless you entered into that arrangement,
522
+ or that patent license was granted, prior to 28 March 2007.
523
+
524
+ Nothing in this License shall be construed as excluding or limiting
525
+ any implied license or other defenses to infringement that may
526
+ otherwise be available to you under applicable patent law.
527
+
528
+ 12. No Surrender of Others' Freedom.
529
+
530
+ If conditions are imposed on you (whether by court order, agreement or
531
+ otherwise) that contradict the conditions of this License, they do not
532
+ excuse you from the conditions of this License. If you cannot convey a
533
+ covered work so as to satisfy simultaneously your obligations under this
534
+ License and any other pertinent obligations, then as a consequence you may
535
+ not convey it at all. For example, if you agree to terms that obligate you
536
+ to collect a royalty for further conveying from those to whom you convey
537
+ the Program, the only way you could satisfy both those terms and this
538
+ License would be to refrain entirely from conveying the Program.
539
+
540
+ 13. Remote Network Interaction; Use with the GNU General Public License.
541
+
542
+ Notwithstanding any other provision of this License, if you modify the
543
+ Program, your modified version must prominently offer all users
544
+ interacting with it remotely through a computer network (if your version
545
+ supports such interaction) an opportunity to receive the Corresponding
546
+ Source of your version by providing access to the Corresponding Source
547
+ from a network server at no charge, through some standard or customary
548
+ means of facilitating copying of software. This Corresponding Source
549
+ shall include the Corresponding Source for any work covered by version 3
550
+ of the GNU General Public License that is incorporated pursuant to the
551
+ following paragraph.
552
+
553
+ Notwithstanding any other provision of this License, you have
554
+ permission to link or combine any covered work with a work licensed
555
+ under version 3 of the GNU General Public License into a single
556
+ combined work, and to convey the resulting work. The terms of this
557
+ License will continue to apply to the part which is the covered work,
558
+ but the work with which it is combined will remain governed by version
559
+ 3 of the GNU General Public License.
560
+
561
+ 14. Revised Versions of this License.
562
+
563
+ The Free Software Foundation may publish revised and/or new versions of
564
+ the GNU Affero General Public License from time to time. Such new versions
565
+ will be similar in spirit to the present version, but may differ in detail to
566
+ address new problems or concerns.
567
+
568
+ Each version is given a distinguishing version number. If the
569
+ Program specifies that a certain numbered version of the GNU Affero General
570
+ Public License "or any later version" applies to it, you have the
571
+ option of following the terms and conditions either of that numbered
572
+ version or of any later version published by the Free Software
573
+ Foundation. If the Program does not specify a version number of the
574
+ GNU Affero General Public License, you may choose any version ever published
575
+ by the Free Software Foundation.
576
+
577
+ If the Program specifies that a proxy can decide which future
578
+ versions of the GNU Affero General Public License can be used, that proxy's
579
+ public statement of acceptance of a version permanently authorizes you
580
+ to choose that version for the Program.
581
+
582
+ Later license versions may give you additional or different
583
+ permissions. However, no additional obligations are imposed on any
584
+ author or copyright holder as a result of your choosing to follow a
585
+ later version.
586
+
587
+ 15. Disclaimer of Warranty.
588
+
589
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590
+ APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591
+ HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592
+ OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593
+ THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594
+ PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595
+ IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596
+ ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597
+
598
+ 16. Limitation of Liability.
599
+
600
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601
+ WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602
+ THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603
+ GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604
+ USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605
+ DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606
+ PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607
+ EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608
+ SUCH DAMAGES.
609
+
610
+ 17. Interpretation of Sections 15 and 16.
611
+
612
+ If the disclaimer of warranty and limitation of liability provided
613
+ above cannot be given local legal effect according to their terms,
614
+ reviewing courts shall apply local law that most closely approximates
615
+ an absolute waiver of all civil liability in connection with the
616
+ Program, unless a warranty or assumption of liability accompanies a
617
+ copy of the Program in return for a fee.
618
+
619
+ END OF TERMS AND CONDITIONS
620
+
621
+ How to Apply These Terms to Your New Programs
622
+
623
+ If you develop a new program, and you want it to be of the greatest
624
+ possible use to the public, the best way to achieve this is to make it
625
+ free software which everyone can redistribute and change under these terms.
626
+
627
+ To do so, attach the following notices to the program. It is safest
628
+ to attach them to the start of each source file to most effectively
629
+ state the exclusion of warranty; and each file should have at least
630
+ the "copyright" line and a pointer to where the full notice is found.
631
+
632
+ <one line to give the program's name and a brief idea of what it does.>
633
+ Copyright (C) <year> <name of author>
634
+
635
+ This program is free software: you can redistribute it and/or modify
636
+ it under the terms of the GNU Affero General Public License as published
637
+ by the Free Software Foundation, either version 3 of the License, or
638
+ (at your option) any later version.
639
+
640
+ This program is distributed in the hope that it will be useful,
641
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
642
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643
+ GNU Affero General Public License for more details.
644
+
645
+ You should have received a copy of the GNU Affero General Public License
646
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
647
+
648
+ Also add information on how to contact you by electronic and paper mail.
649
+
650
+ If your software can interact with users remotely through a computer
651
+ network, you should also make sure that it provides a way for users to
652
+ get its source. For example, if your program is a web application, its
653
+ interface could display a "Source" link that leads users to an archive
654
+ of the code. There are many ways you could offer source, and different
655
+ solutions will be better for different programs; see section 13 for the
656
+ specific requirements.
657
+
658
+ You should also get your employer (if you work as a programmer) or school,
659
+ if any, to sign a "copyright disclaimer" for the program,
660
+ if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>.
661
+
662
+ If you would like to obtain a copy of the software under a different license (e.g. Apache),
663
+ please send an email to Camb.AI at help@camb.ai indicating that
664
+ you would like a copy of the software under a different license.
README.md ADDED
@@ -0,0 +1,157 @@
1
+ ![MARS5 Banner](assets/github-banner.png)
2
+
3
+ # MARS5: A novel speech model for insane prosody.
4
+
5
+ This is the repo for the MARS5 English speech model (TTS) from CAMB.AI.
6
+
7
+ The model follows a two-stage AR-NAR pipeline with a distinctively novel NAR component (see more info in the [docs](docs/architecture.md)).
8
+
9
+ With just 5 seconds of audio and a snippet of text, MARS5 can generate speech even for prosodically hard and diverse scenarios like sports commentary, anime and more. Check out our demo:
10
+
11
+
12
+
13
+
14
+ https://github.com/Camb-ai/MARS5-TTS/assets/23717819/3e191508-e03c-4ff9-9b02-d73ae0ebefdd
15
+
16
+
17
+
18
+
19
+ **Quick links**:
20
+ - [CAMB.AI website](https://camb.ai/) (access MARS5 in 140+ languages for TTS and dubbing)
21
+ - Technical docs: [in the docs folder](docs/architecture.md)
22
+ - Colab quickstart: <a target="_blank" href="https://colab.research.google.com/github/Camb-ai/mars5-tts/blob/master/mars5_demo.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
23
+ - Demo page with samples: [here](https://179c54d254f7.ngrok.app/)
24
+
25
+ ![Mars 5 simplified diagram](docs/assets/simplified_diagram.png)
26
+
27
+ **Figure**: the high-level architecture flow of Mars 5. Given text and a reference audio, coarse (L0) encodec speech features are obtained through an autoregressive transformer model. Then, the text, reference, and coarse features are refined in a multinomial DDPM model to produce the remaining encodec codebook values. The output of the DDPM is then vocoded to produce the final audio.
28
+
29
+ Because the model is trained on raw audio together with byte-pair-encoded text, it can be steered with things like punctuation and capitalization.
30
+ E.g. to add a pause, add a comma to that part in the transcript. Or, to emphasize a word, put it in capital letters in the transcript.
31
+ This enables a fairly natural way of guiding the prosody of the generated output.
32
+
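+ For example, the same sentence can be rendered with different prosody purely by editing the transcript (an illustrative sketch; `mars5`, `wav`, `ref_transcript` and `cfg` are as in the quickstart below):
+
+ ```python
+ # neutral phrasing
+ mars5.tts("I cannot believe you did that", wav, ref_transcript, cfg=cfg)
+ # commas add pauses, capital letters add emphasis
+ mars5.tts("I cannot believe, you did THAT.", wav, ref_transcript, cfg=cfg)
+ ```
+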
33
+ Speaker identity is specified using an audio reference file of between 2 and 12 seconds, with lengths around 6s giving optimal results.
34
+ Further, by providing the transcript of the reference, MARS5 enables one to do a '_deep clone_' which improves the quality of the cloning and output, at the cost of taking a bit longer to produce the audio.
35
+ For more details on this and other performance and model details, please see inside the [docs folder](docs/architecture.md).
36
+
37
+
38
+ ## Quickstart
39
+
40
+
41
+ We use `torch.hub` to make loading the model easy -- no cloning of the repo needed. The steps to perform inference are simple:
42
+
43
+ 1. **Install pip dependencies**: we have only five inference dependencies: `torch`, `torchaudio`, `librosa`, `vocos`, and `encodec`. Python must be version 3.10 or greater, and torch must be v2.0 or greater.
44
+
45
+ ```bash
46
+ pip install --upgrade torch torchaudio librosa vocos encodec
47
+ ```
48
+
49
+ 2. **Load models**: load the Mars 5 AR and NAR model from torch hub:
50
+
51
+ ```python
52
+ import torch, librosa
53
+
54
+ mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
55
+ # The `mars5` contains the AR and NAR model, as well as inference code.
56
+ # The `config_class` contains tunable inference config settings like temperature.
57
+ ```
58
+ 3. **Pick a reference** and optionally its transcript:
59
+
60
+ ```python
61
+ # load reference audio between 1-12 seconds.
62
+ wav, sr = librosa.load('<path to arbitrary 24kHz waveform>.wav',
63
+                        sr=mars5.sr, mono=True)
64
+ wav = torch.from_numpy(wav)
65
+ ref_transcript = "<transcript of the reference audio>"
66
+ ```
67
+
68
+ The reference transcript is optional; you only need it if you wish to do a deep clone.
+ Mars5 supports two kinds of inference: a fast, shallow inference for which you do not need the transcript of the reference (we call this a _shallow clone_), and a slower but typically higher-quality one, which we call a _deep clone_.
+ To use a deep clone, you need the prompt transcript. See the [model docs](docs/architecture.md) for more info on this.
71
+
72
+ 4. **Perform the synthesis**:
73
+
74
+ ```python
75
+ # Pick whether you want a deep or shallow clone. Set to False if you don't know the prompt transcript or want fast inference. Set to True if you know the transcript and want the highest quality.
76
+ deep_clone = True
77
+ # Below you can tune other inference settings, like top_k, temperature, top_p, etc...
78
+ cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100,
79
+                    top_k=100, temperature=0.7, freq_penalty=3)
80
+
81
+ ar_codes, output_audio = mars5.tts("The quick brown rat.", wav,
82
+                                    ref_transcript,
+                                    cfg=cfg)
84
+ # output_audio is (T,) shape float tensor corresponding to the 24kHz output audio.
85
+ ```
86
+
87
+ That's it! These default settings provide pretty good results, but feel free to tune them to optimize the output for your particular example. See the [`InferenceConfig`](inference.py) code or the demo notebook for documentation of all the inference settings.
88
+
89
+ _Some tips for best quality:_
90
+ - Make sure the reference audio is clean and between 1 and 12 seconds long (a short preparation sketch follows these tips).
91
+ - Use deep clone and provide an accurate transcript for the reference.
92
+ - Use proper punctuation -- the model can be guided and made better or worse with proper use of punctuation and capitalization.
93
+
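+ A minimal reference-preparation sketch for the first tip (it assumes `librosa`, as used in the quickstart; the file path is a placeholder):
+
+ ```python
+ import librosa, torch
+
+ wav, sr = librosa.load('my_reference.wav', sr=24000, mono=True)  # resample to the model rate (mars5.sr)
+ wav, _ = librosa.effects.trim(wav, top_db=30)                    # strip leading/trailing silence
+ duration = len(wav) / sr
+ assert 1.0 <= duration <= 12.0, f"reference is {duration:.1f}s; aim for roughly 6s"
+ wav = torch.from_numpy(wav)
+ ```
+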
94
+
95
+ ## Model details
96
+
97
+ **Checkpoints**
98
+
99
+ The checkpoints for MARS5 are provided under the releases tab of this GitHub repo. We provide two checkpoints:
100
+
101
+ - AR fp16 checkpoint [~750M parameters], along with config embedded in the checkpoint.
102
+ - NAR fp16 checkpoint [~450M parameters], along with config embedded in the checkpoint.
103
+ - The byte-pair encoding tokenizer used for the L0 encodec codes and the English text is embedded in each checkpoint under the `'vocab'` key, and follows roughly the same format of a saved minbpe tokenizer.
104
+
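+ For example, to peek at what is bundled in a checkpoint (a sketch; it assumes the release file loads as a plain dictionary, and uses the AR checkpoint URL from `hubconf.py`):
+
+ ```python
+ import torch
+
+ ckpt = torch.hub.load_state_dict_from_url(
+     "https://github.com/Camb-ai/mars5-tts/releases/download/v0.1-checkpoints/mars5_en_checkpoints_ar-1680000.pt",
+     map_location='cpu')
+ print(list(ckpt.keys()))  # expect model weights plus the embedded config and 'vocab' entries
+ ```
+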
105
+ **Hardware requirements**:
106
+
107
+ You must be able to store at least 750M+450M params on GPU, and do inference with 750M of active parameters. In general, at least **20GB of GPU VRAM** is needed to run the model on GPU (we plan to further optimize this in the future).
108
+
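+ As a rough sanity check on those numbers (a back-of-the-envelope sketch; the remaining VRAM is taken up by activations, the Encodec/Vocos models, and attention caches):
+
+ ```python
+ ar_params, nar_params = 750e6, 450e6                 # approximate parameter counts from above
+ weight_gb = (ar_params + nar_params) * 2 / 1024**3   # 2 bytes per fp16 parameter
+ print(f"~{weight_gb:.1f} GB of fp16 weights")        # ~2.2 GB; the ~20 GB figure covers everything else
+ ```
+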
109
+ If you do not meet the hardware requirements and just want to use MARS5 in your applications, you can use it via our API: see [docs.camb.ai](https://docs.camb.ai/). If you need some more credits to test it for your use case, feel free to reach out to `help@camb.ai`.
110
+
111
+ ## Roadmap
112
+
113
+ Mars 5 is not perfect at the moment, and we are working on a few efforts to improve its quality, stability, and performance.
114
+ Rough areas we are looking to improve, and where we welcome contributions:
115
+
116
+ - Improving inference stability and consistency
117
+ - Speed/performance optimizations
118
+ - Improving reference audio selection when given long references.
119
+ - Benchmark performance numbers for Mars 5 on standard speech datasets.
120
+
121
+ If you would like to contribute any improvement to MARS5, please feel free to do so (guidelines below).
122
+
123
+ ## Contributions
124
+
125
+ We welcome any contributions to improving the model. As you may find when experimenting, while it can produce really great results, it can still be further improved to create excellent outputs _consistently_.
126
+
127
+ **Contribution format**:
128
+
129
+ The preferred way to contribute to our repo is to fork the [master repository](https://github.com/Camb-ai/mars5-tts) on GitHub:
130
+
131
+ 1. Fork the repo on GitHub.
132
+ 2. Clone the repo, set upstream as this repo: `git remote add upstream git@github.com:Camb-ai/mars5-tts.git`
133
+ 3. Make a new local branch, make your changes, and commit them.
134
+ 4. Push your changes to the new branch on your fork: `git push --set-upstream origin <NAME-NEW-BRANCH>`
135
+ 5. On GitHub, go to your fork and click 'Pull request' to begin the PR process. Please make sure to include a description of what you did/fixed.
136
+
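+ The steps above as one consolidated shell sketch (replace `<YOUR-USERNAME>` and `<NAME-NEW-BRANCH>`):
+
+ ```bash
+ # after forking the repo on GitHub
+ git clone git@github.com:<YOUR-USERNAME>/mars5-tts.git
+ cd mars5-tts
+ git remote add upstream git@github.com:Camb-ai/mars5-tts.git
+ git checkout -b <NAME-NEW-BRANCH>
+ # ... make and commit your changes ...
+ git push --set-upstream origin <NAME-NEW-BRANCH>
+ ```
+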
137
+ ## License
138
+
139
+ We are open-sourcing MARS5 in English under the GNU AGPL 3.0 license, but you can request to use it under a different license by emailing help@camb.ai.
140
+
141
+ ## Join our team
142
+
143
+ We're an ambitious team, globally distributed, with a singular aim of making everyone's voice count. At CAMB.AI, we're a research team of Interspeech-published, Carnegie Mellon, ex-Siri engineers and we're looking for you to join our team.
144
+
145
+ We're actively hiring; please drop us an email at ack@camb.ai if you're interested. Visit our [careers page](https://www.camb.ai/careers) for more info.
146
+
147
+ ## Acknowledgements
148
+
149
+ Parts of the code for this project are adapted from the following repositories -- please make sure to check them out! Thank you to:
150
+
151
+ - AWS: For providing much needed compute resources (NVIDIA H100s) to enable training of the model.
152
+ - TransFusion: [https://github.com/RF5/transfusion-asr](https://github.com/RF5/transfusion-asr)
153
+ - Multinomial diffusion: [https://github.com/ehoogeboom/multinomial_diffusion](https://github.com/ehoogeboom/multinomial_diffusion)
154
+ - Mistral-src: [https://github.com/mistralai/mistral-src](https://github.com/mistralai/mistral-src)
155
+ - minbpe: [https://github.com/karpathy/minbpe](https://github.com/karpathy/minbpe)
156
+ - gemelo-ai's encodec Vocos: [https://github.com/gemelo-ai/vocos](https://github.com/gemelo-ai/vocos)
157
+ - librosa for their `.trim()` code: [https://librosa.org/doc/main/generated/librosa.effects.trim.html](https://librosa.org/doc/main/generated/librosa.effects.trim.html)
assets/demo-preview.png ADDED
assets/github-banner.png ADDED
docs/architecture.md ADDED
@@ -0,0 +1,103 @@
1
+ # Mars 5 technical details
2
+
3
+ While we do not have the time for a proper full writeup of the details of Mars5, its design, training, and implementation, we at least try to give a more detailed overview here of how Mars5 works.
4
+
5
+
6
+ ## hubconf object/api
7
+
8
+
9
+ After loading the model with `torch.hub.load`, two objects are returned: a `Mars5TTS` instance, and the dataclass of the inference config to use when calling the `mars5.tts()` method.
10
+ Concretely, the main methods of the mars5 object are:
11
+
12
+ ```python
13
+
14
+ # The init function, called automatically when you initialize the
+ # model from torch.hub.load(). If you want, you can pass in your
+ # own custom checkpoints here to initialize the model with your
+ # own model, tokenizer, etc...
+ def __init__(self, ar_ckpt, nar_ckpt, device: str = None) -> None:
+     # ... initialization code ...
+
+ # Main text-to-speech function, converting text and a reference
+ # audio to speech.
+ def tts(self, text: str, ref_audio: Tensor, ref_transcript: str | None,
+         cfg: InferenceConfig) -> Tensor:
+     """ Perform TTS for `text`, given a reference audio `ref_audio` (of shape [sequence_length,], sampled at 24kHz)
+     which has an associated `ref_transcript`. Perform inference using the inference
+     config given by `cfg`, which controls the temperature, top_p, etc...
+     Returns:
+     - `ar_codes`: (seq_len,) long tensor of discrete coarse code outputs from the AR model.
+     - `out_wav`: (T,) float output audio tensor sampled at 24kHz.
+     """
+
+ # Utility function to vocode encodec tokens, if one wishes
+ # to hear the raw AR model output by vocoding the `ar_codes`
+ # returned above.
+ def vocode(self, tokens: Tensor) -> Tensor:
+     """ Vocodes tokens of shape (seq_len, n_q) """
38
+ ```
39
+
40
+
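+ As a small end-to-end illustration of this API (a sketch, not repository code: it assumes the remaining `InferenceConfig` fields have sensible defaults, that passing `None` as the reference transcript is accepted for a shallow clone, and uses `torchaudio` only for saving):
+
+ ```python
+ import torch, librosa, torchaudio
+
+ mars5, InferenceConfig = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)
+
+ wav, sr = librosa.load('docs/assets/example_ref.wav', sr=mars5.sr, mono=True)
+ cfg = InferenceConfig(deep_clone=False)  # shallow clone: no reference transcript needed
+ ar_codes, out_wav = mars5.tts("Hello from MARS5.", torch.from_numpy(wav), None, cfg=cfg)
+
+ # out_wav is a (T,) float tensor at 24kHz; add a channel dim to save it
+ torchaudio.save('output.wav', out_wav.cpu().unsqueeze(0), mars5.sr)
+ ```
+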
41
+ ## Model design
42
+
43
+ Mars 5 follows a two-stage AR-NAR design according to the diagram on the main page.
44
+
45
+ #### AR component
46
+
47
+ The AR model is a Mistral-style transformer that predicts Encodec L0 codes (the lowest, most coarse level of quantization codes).
48
+ Together, the AR and NAR models predict all 8 codebook entries of the 6kbps Encodec codec.
49
+ The AR model design is given below:
50
+
51
+ ![Mars 5 AR architecture](/docs/assets/mars5_AR_arch.png)
52
+
53
+ **Figure**: autoregressive component of Mars 5. During training, the initial 6kbps encodec tokens of the speech are fed through a small encoder-only transformer, producing a single output vector corresponding to an implicit speaker embedding.
54
+ This vector is concatenated with learnt embeddings of the text tokens and of the L0 speech tokens, after byte-pair encoding tokenization.
55
+
56
+
57
+ The AR model is trained with the standard next-token prediction task of language models, using a cross-entropy loss against the next token, with a smaller weight given to text tokens.
58
+ During inference, we iteratively sample from the transformer to produce the desired L0 codes.
59
+ When we use a _shallow clone_, the reference audio is fed into the speaker conditioning transformer to produce the implicit speaker embedding used in the input sequence.
60
+ When we use a _deep clone_, the above is done, but we also concatenate the reference transcript with the desired text, and the reference audio tokens with the input sequence before we start sampling the output.
61
+ In pseudocode:
62
+
63
+ ```
64
+ speaker_embedding <- speaker_conditioning_transformer(reference_audio)
+ if deep_clone:
+     prompt = concatenate(speaker_embedding, reference_text, target_text, reference_L0_speech_codes)
+ else:
+     prompt = concatenate(speaker_embedding, target_text)
+
+ ar_output <- autoregressively_sample(prompt)
71
+ ```
72
+
73
+ While a deep clone provides a more accurate cloning of the reference speaker identity and prosody, it requires knowledge of the reference transcript and takes longer to do inference.
74
+
75
+ #### NAR component
76
+
77
+ After the AR model has predicted the L0 encodec codes, we need a way to predict the remaining 7 codebooks of the 6kbps Encodec codec.
78
+ This is what the NAR model is trained to do, using a multinomial diffusion framework.
79
+ Concretely, the diffusion process is a discrete DDPM: at each timestep the model takes in a sequence of shape `(batch size, sequence length, n_codebooks)` and produces a categorical distribution over each codebook, i.e. an output of shape `(batch size, sequence length, n_codebooks, 1024)`, since each Encodec codebook has 1024 possible values.
80
+ The architecture of the model looks as follows:
81
+
82
+
83
+ ![Mars 5 NAR architecture](/docs/assets/mars5_NAR_arch.png)
84
+
85
+ **Figure**: Mars 5 non-autoregressive component. It follows an encoder-decoder transformer architecture, whereby the encoder computes an implicit speaker embedding like the AR model, and concatenates that along with the target to form an input sequence to a transformer encoder. The transformer decoder predicts the distribution of all 8 encodec codebook tokens given a partly noised input at some diffusion timestep `t`.
86
+
87
+
88
+ The encoder and decoder transformers are simple `nn.Transformer` variants with sinusoidal positional embeddings and SwiGLU activations.
89
+ A multinomial diffusion manager controls the forward and reverse diffusion processes during inference and training according to a cosine diffusion schedule.
90
+ Diffusion is performed independently of the sequence length or codebook index.
91
+
92
+ During training and inference, the L0 codebooks of the input at timestep $t$ are overridden (i.e. not noised in the forward diffusion process) with either the ground truth L0 codes (during training) or the AR model's predictions (during inference).
93
+ Like the AR model, the NAR model can perform inference in either a _shallow clone_ way or a _deep clone_ way.
94
+ And, like the AR model, the difference between the two is, with a _deep clone_, we concatenate the reference text to the input text sequence, and the reference speech codes (the full values for all 8 codebooks) to the decoder input sequence $x$.
95
+ During inference, we then treat the portion of $x$ corresponding to the reference codec codes, and all the AR L0 codes, as 'fixed' and effectively perform diffusion inpainting for the remaining missing codec codes.
96
+ The figure below explains what the input to the decoder looks like for a deep clone:
97
+
98
+ ![NAR decoder input for deep clone](/docs/assets/NAR_inpainting_diagram.png)
99
+
100
+ This allows us to use diffusion inpainting techniques like [RePaint](https://arxiv.org/abs/2201.09865) to improve the quality of the output at the cost of more inference time.
101
+ We've implemented this in the diffusion config used in the NAR inference code (see it [here](/mars5/diffuser.py)), and you can simply increase `jump_len` and `jump_n_sample` to values greater than 1 to use RePaint inpainting to improve NAR performance.
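+ 
+ For example, a sampling configuration that enables RePaint resampling could look like this (the values are illustrative; the RePaint paper defaults noted in `DSH` are 10 for both):
+ 
+ ```
+ from mars5.diffuser import DSH
+ 
+ dsh_cfg = DSH(jump_len=10, jump_n_sample=10,  # RePaint's j and r; values > 1 enable resampling
+     guidance_w=3, deep_clone=True)
+ ```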
102
+
103
+
docs/assets/NAR_inpainting_diagram.png ADDED
docs/assets/example_ref.wav ADDED
Binary file (137 kB).
 
docs/assets/intro_vid.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cabbb40186fd5599282b4ada76643b1d1b34c513af1977861513f9d2f1220ad6
3
+ size 2105962
docs/assets/mars5_AR_arch.png ADDED
docs/assets/mars5_NAR_arch.png ADDED
docs/assets/simplified_diagram.png ADDED
hubconf.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dependencies = ['torch', 'torchaudio', 'numpy', 'vocos']
2
+
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from inference import Mars5TTS, InferenceConfig
8
+
9
+ ar_url = "https://github.com/Camb-ai/mars5-tts/releases/download/v0.1-checkpoints/mars5_en_checkpoints_ar-1680000.pt"
10
+ nar_url = "https://github.com/Camb-ai/mars5-tts/releases/download/v0.1-checkpoints/mars5_en_checkpoints_nar-1260000.pt"
11
+
12
+ def mars5_english(pretrained=True, progress=True, device=None, ar_path=None, nar_path=None) -> tuple[Mars5TTS, type]:
14
+ """ Load mars5 english model on `device`, optionally show `progress`. Returns the model and the `InferenceConfig` class. """
14
+ if device is None: device = 'cuda' if torch.cuda.is_available() else 'cpu'
15
+ logging.info(f"Using device: {device}")
16
+ if not pretrained: raise AssertionError('Only the pretrained model is currently supported.')
17
+ logging.info("Loading AR checkpoint...")
18
+ if ar_path is None:
19
+ ar_ckpt = torch.hub.load_state_dict_from_url(
20
+ ar_url, progress=progress, check_hash=False, map_location='cpu'
21
+ )
22
+ else: ar_ckpt = torch.load(str(ar_path), map_location='cpu')
23
+
24
+ logging.info("Loading NAR checkpoint...")
25
+ if nar_path is None:
26
+ nar_ckpt = torch.hub.load_state_dict_from_url(
27
+ nar_url, progress=progress, check_hash=False, map_location='cpu'
28
+ )
29
+ else: nar_ckpt = torch.load(str(nar_path), map_location='cpu')
30
+ logging.info("Initializing modules...")
31
+ mars5 = Mars5TTS(ar_ckpt, nar_ckpt, device=device)
32
+ return mars5, InferenceConfig
33
+
inference.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import Tensor
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import logging
6
+ import json
7
+ from typing import Optional
8
+ from pathlib import Path
9
+ from dataclasses import dataclass
10
+ import os
11
+
12
+ from mars5.model import CodecLM, ResidualTransformer
13
+ from vocos import Vocos
14
+ from encodec import EncodecModel
15
+ from mars5.diffuser import MultinomialDiffusion, DSH, perform_simple_inference
16
+ from mars5.minbpe.regex import RegexTokenizer, GPT4_SPLIT_PATTERN
17
+ from mars5.minbpe.codebook import CodebookTokenizer
18
+ from mars5.ar_generate import ar_generate
19
+ from mars5.utils import nuke_weight_norm
20
+ from mars5.trim import trim
21
+ import tempfile
22
+ import logging
23
+
24
+
25
+ @dataclass
26
+ class InferenceConfig():
27
+ """ The defaults configuration variables for TTS inference. """
28
+
29
+ ## >>>> AR CONFIG
30
+ temperature: float = 0.7
31
+ top_k: int = 200 # 0 disables it
32
+ top_p: float = 0.2
33
+ typical_p: float = 1.0
34
+ freq_penalty: float = 3
35
+ presence_penalty: float = 0.4
36
+ rep_penalty_window: int = 80 # how far in the past to consider when penalizing repetitions. Equates to 5s
37
+
38
+ eos_penalty_decay: float = 0.5 # how much to penalize <eos>
39
+ eos_penalty_factor: float = 1 # overall penalty weighting
40
+ eos_estimated_gen_length_factor: float = 1.0 # multiple of len(text_phones) to assume as the approximate output length
41
+
42
+ ## >>>> NAR CONFIG
43
+ # defaults, that can be overridden with user specified inputs
44
+ timesteps: int = 200
45
+ x_0_temp: float = 0.7
46
+ q0_override_steps: int = 20 # number of diffusion steps where the NAR L0 predictions override the AR L0 predictions.
47
+ nar_guidance_w: float = 3
48
+
49
+ max_prompt_dur: float = 12 # maximum allowed prompt length, in seconds.
50
+
51
+ # Maximum AR codes to generate in 1 inference.
52
+ # Default of -1 leaves it same as training time max AR tokens.
53
+ # Typical values up to ~2x training time can be tolerated,
54
+ # with ~1.5x training-time tokens still giving mostly OK performance.
55
+ generate_max_len_override: int = -1
56
+
57
+ # Whether to deep clone from the reference.
58
+ # Pros: improves intelligibility and speaker cloning performance.
59
+ # Cons: requires reference transcript, and inference takes a bit longer.
60
+ deep_clone: bool = True
61
+
62
+ use_kv_cache: bool = True
63
+ trim_db: float = 27
64
+ beam_width: int = 1 # only beam width of 1 is currently supported
65
+ ref_audio_pad: float = 0
66
+
67
+
68
+ class Mars5TTS(nn.Module):
69
+
70
+ def __init__(self, ar_ckpt, nar_ckpt, device: str = None) -> None:
71
+ super().__init__()
72
+
73
+ if device is None:
74
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
75
+ self.device = torch.device(device)
76
+
77
+ self.codec = EncodecModel.encodec_model_24khz().to(device).eval()
78
+ self.codec.set_target_bandwidth(6.0)
79
+
80
+ # save and load text tokenizer
81
+ self.texttok = RegexTokenizer(GPT4_SPLIT_PATTERN)
82
+ tfn = tempfile.mkstemp(suffix='texttok.model')[1]
83
+ Path(tfn).write_text(ar_ckpt['vocab']['texttok.model'])
84
+ self.texttok.load(tfn)
85
+ os.remove(tfn)
86
+ # save and load speech tokenizer
87
+ sfn = tempfile.mkstemp(suffix='speechtok.model')[1]
88
+ self.speechtok = CodebookTokenizer(GPT4_SPLIT_PATTERN)
89
+ Path(sfn).write_text(ar_ckpt['vocab']['speechtok.model'])
90
+ self.speechtok.load(sfn)
91
+ os.remove(sfn)
92
+ # keep track of tokenizer vocabulary sizes.
93
+ self.n_vocab = len(self.texttok.vocab) + len(self.speechtok.vocab)
94
+ self.n_text_vocab = len(self.texttok.vocab) + 1
95
+ self.diffusion_n_classes: int = 1025 # 1 for padding idx
96
+ # load AR model
97
+ self.codeclm = CodecLM(n_vocab=self.n_vocab, dim=1536, dim_ff_scale=7/3)
98
+ self.codeclm.load_state_dict(ar_ckpt['model'])
99
+ self.codeclm = self.codeclm.to(self.device).eval()
100
+ # load NAR model
101
+ self.codecnar = ResidualTransformer(n_text_vocab=self.n_text_vocab, n_quant=self.diffusion_n_classes,
102
+ p_cond_drop=0, dropout=0)
103
+ self.codecnar.load_state_dict(nar_ckpt['model'])
104
+ self.codecnar = self.codecnar.to(self.device).eval()
105
+ self.default_T = 200
106
+
107
+ self.sr = 24000
108
+ self.latent_sr = 75
109
+
110
+ # load vocoder
111
+ self.vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(self.device).eval()
112
+ nuke_weight_norm(self.codec)
113
+ nuke_weight_norm(self.vocos)
114
+
115
+ @torch.inference_mode
116
+ def vocode(self, tokens: Tensor) -> Tensor:
117
+ """ Vocodes tokens of shape (seq_len, n_q) """
118
+ tokens = tokens.T.to(self.device)
119
+ features = self.vocos.codes_to_features(tokens)
120
+ # A cool hidden feature of vocos vocoding:
121
+ # setting the bandwidth below to 1 (corresponding to 3 kbps)
122
+ # actually still works on 6kbps input tokens, but *smooths* the output
123
+ # audio a bit, which can help improve quality if it's a bit noisy.
124
+ # Hence we use [1] and not [2] below.
125
+ bandwidth_id = torch.tensor([1], device=self.device) # index 1 = 3 kbps (see comment above)
126
+ wav_diffusion = self.vocos.decode(features, bandwidth_id=bandwidth_id)
127
+ return wav_diffusion.cpu().squeeze()[None]
128
+
129
+ @torch.inference_mode
130
+ def tts(self, text: str, ref_audio: Tensor, ref_transcript: Optional[str] = None,
131
+ cfg: Optional[InferenceConfig] = InferenceConfig()) -> Tensor:
132
+ """ Perform TTS for `text`, given a reference audio `ref_audio` (of shape [sequence_length,], sampled at 24kHz)
133
+ which has an associated `ref_transcript`. Perform inference using the inference
134
+ config given by `cfg`, which controls the temperature, top_p, etc...
135
+ Returns:
136
+ - `ar_codes`: (seq_len,) long tensor of discrete coarse code outputs from the AR model.
137
+ - `out_wav`: (T,) float output audio tensor sampled at 24kHz.
138
+ """
139
+
140
+ if cfg.deep_clone and ref_transcript is None:
141
+ raise AssertionError(
142
+ ("Inference config deep clone is set to true, but reference transcript not specified! "
143
+ "Please specify the transcript of the prompt, or set deep_clone=False in the inference `cfg` argument."
144
+ ))
145
+ ref_dur = ref_audio.shape[-1]/self.sr
146
+ if ref_dur > cfg.max_prompt_dur:
147
+ logging.warning((f"Reference audio duration is {ref_dur:.2f} > max suggested ref audio. "
148
+ f"Expect quality degradations. We recommend you trim prompt to be shorter than max prompt length."))
149
+
150
+ # get text codes.
151
+ text_tokens = self.texttok.encode("<|startoftext|>"+text.strip()+"<|endoftext|>",
152
+ allowed_special='all')
153
+
154
+ text_tokens_full = text_tokens if ref_transcript is None else self.texttok.encode(
155
+ "<|startoftext|>" + ref_transcript + ' ' + str(text).strip() + "<|endoftext|>", allowed_special='all')
156
+
157
+ if ref_audio.dim() == 1: ref_audio = ref_audio[None]
158
+ if ref_audio.shape[0] != 1: ref_audio = ref_audio.mean(dim=0, keepdim=True)
159
+ ref_audio = F.pad(ref_audio, (int(self.sr*cfg.ref_audio_pad), 0))
160
+ # get reference audio codec tokens
161
+ prompt_codec = self.codec.encode(ref_audio[None].to(self.device))[0][0] # (bs, n_q, seq_len)
162
+
163
+ n_speech_inp = 0
164
+ n_start_skip = 0
165
+ q0_str = ' '.join([str(t) for t in prompt_codec[0, 0].tolist()])
166
+ # Note, in the below, we do NOT want to encode the <eos> token as a part of it, since we will be continuing it!!!
167
+ speech_tokens = self.speechtok.encode(q0_str.strip()) # + "<|endofspeech|>", allowed_special='all')
168
+ spk_ref_codec = prompt_codec[0, :, :].T # (seq_len, n_q)
169
+
170
+ raw_prompt_acoustic_len = len(prompt_codec[0,0].squeeze())
171
+ offset_speech_codes = [p+len(self.texttok.vocab) for p in speech_tokens]
172
+ if not cfg.deep_clone:
173
+ # shallow clone, so
174
+ # 1. clip existing speech codes to be empty (n_speech_inp = 0)
175
+ offset_speech_codes = offset_speech_codes[:n_speech_inp]
176
+ else:
177
+ # Deep clone, so
178
+ # 1. set text to be text of prompt + target text
179
+ text_tokens = text_tokens_full
180
+ # 2. update n_speech_inp to be length of prompt, so we only display from this `n_speech_inp` onwards in the final output.
181
+ n_speech_inp = len(offset_speech_codes)
182
+ prompt = torch.tensor(text_tokens + offset_speech_codes, dtype=torch.long, device=self.device)
183
+ first_codec_idx = prompt.shape[-1] - n_speech_inp + 1
184
+
185
+ # ---> perform AR code generation
186
+
187
+ logging.debug(f"Raw acoustic prompt length: {raw_prompt_acoustic_len}")
188
+
189
+ ar_codes = ar_generate(self.texttok, self.speechtok, self.codeclm,
190
+ prompt, spk_ref_codec, first_codec_idx,
191
+ max_len=cfg.generate_max_len_override if cfg.generate_max_len_override > 1 else 2000,
192
+ temperature=cfg.temperature, topk=cfg.top_k, top_p=cfg.top_p, typical_p=cfg.typical_p,
193
+ alpha_frequency=cfg.freq_penalty, alpha_presence=cfg.presence_penalty, penalty_window=cfg.rep_penalty_window,
194
+ eos_penalty_decay=cfg.eos_penalty_decay, eos_penalty_factor=cfg.eos_penalty_factor,
195
+ beam_width=cfg.beam_width, beam_length_penalty=1,
196
+ n_phones_gen=round(cfg.eos_estimated_gen_length_factor*len(text)),
197
+ vocode=False, use_kv_cache=cfg.use_kv_cache)
198
+
199
+ # Parse AR output
200
+ output_tokens = ar_codes - len(self.texttok.vocab)
201
+ output_tokens = output_tokens.clamp(min=0).squeeze()[first_codec_idx:].cpu().tolist()
202
+ gen_codes_decoded = self.speechtok.decode_int(output_tokens)
203
+ gen_codes_decoded = torch.tensor([s for s in gen_codes_decoded if type(s) == int], dtype=torch.long, device=self.device)
204
+
205
+ c_text = torch.tensor(text_tokens, dtype=torch.long, device=self.device)[None]
206
+ c_codes = prompt_codec.permute(0, 2, 1)
207
+ c_texts_lengths = torch.tensor([len(text_tokens)], dtype=torch.long, device=self.device)
208
+ c_codes_lengths = torch.tensor([c_codes.shape[1],], dtype=torch.long, device=self.device)
209
+
210
+ _x = gen_codes_decoded[None, n_start_skip:, None].repeat(1, 1, 8) # (seq_len) -> (1, seq_len, 8)
211
+ x_padding_mask = torch.zeros((1, _x.shape[1]), dtype=torch.bool, device=_x.device)
212
+
213
+ # ---> perform DDPM NAR inference
214
+
215
+ T = self.default_T
216
+ diff = MultinomialDiffusion(self.diffusion_n_classes, timesteps=T, device=self.device)
217
+
218
+ dsh_cfg = DSH(last_greedy=True, x_0_temp=cfg.x_0_temp,
219
+ guidance_w=cfg.nar_guidance_w,
220
+ deep_clone=cfg.deep_clone, jump_len=1, jump_n_sample=1,
221
+ q0_override_steps=cfg.q0_override_steps,
222
+ enable_kevin_scaled_inference=True, # see TransFusion ASR for explanation of this
223
+ progress=False)
224
+
225
+ final_output = perform_simple_inference(self.codecnar,(
226
+ c_text, c_codes, c_texts_lengths, c_codes_lengths, _x, x_padding_mask
227
+ ), diff, diff.num_timesteps, torch.float16, dsh=dsh_cfg, retain_quant0=True) # (bs, seq_len, n_quant)
228
+
229
+ skip_front = raw_prompt_acoustic_len if cfg.deep_clone else 0
230
+ final_output = final_output[0, skip_front:].to(self.device) # (seq_len, n_quant)
231
+
232
+ # vocode final output and trim silences
233
+ final_audio = self.vocode(final_output).squeeze()
234
+ final_audio, _ = trim(final_audio.cpu(), top_db=cfg.trim_db)
235
+
236
+ return gen_codes_decoded, final_audio
mars5/ar_generate.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torchaudio
4
+ import copy
5
+ from torch import Tensor, nn
6
+ import logging
7
+ from .model import length_to_mask
8
+ from .samplers import (apply_typical_p, early_eos_penalty,
9
+ top_k_top_p_filtering, freq_rep_penalty)
10
+ from .nn_future import RotatingBufferCache
11
+ from .minbpe.codebook import CodebookTokenizer
12
+ from .minbpe.regex import RegexTokenizer
13
+
14
+
15
+ @torch.inference_mode()
16
+ def ar_generate(texttok: RegexTokenizer, speechtok: CodebookTokenizer,
17
+ codeclm: nn.Module, xx: Tensor, ss_gen: Tensor, first_codex_idx: int,
18
+ max_len: int = 1500, fp16: bool = True, temperature: float = 1.0, topk: int = None,
19
+ top_p=1.0, alpha_frequency=0, alpha_presence=0, penalty_window=100,
20
+ typical_p=1.0, eos_penalty_factor=1.0, eos_penalty_decay=0, n_phones_gen=None, vocode=True,
21
+ beam_width: int = 1, beam_length_penalty=2, use_kv_cache: bool = True) -> tuple[Tensor, Tensor]:
22
+ """ Use the `codeclm` language model to autoregressively generate a completion of `xx` (seq_len), where the first `first_codex_idx`-1
23
+ indices correspond to the input phones. The output generation is limited to at most `max_len` (measured as num latent codes).
24
+ Returns both output first quantizer codes and synthesized audio using `codec`. Use decoding with `beam_width` to keep
25
+ track of top `beam_width` outcomes, selecting the top one among them.
26
+
27
+ - Optionally vocode if `vocode` (default True).
28
+ - See `InferenceConfig` for other inference docs.
29
+ """
30
+ assert xx.dim() == 1, "Only batch size of 1 is currently supported."
31
+ assert beam_width == 1, "Only beam size of 1 is currently supported."
32
+ # internally our batch size will be the beam width
33
+ bs = beam_width
34
+ x_inp = xx[None].repeat(bs, 1) # (bs, seq_len)
35
+ ss_gen = ss_gen[None].repeat(bs, 1, 1)
36
+ # We must subtract 1 in the line below so that we match the train-time conditions of having a
37
+ # False padding value for the <bos> token position. This is needed so that we correctly use the
38
+ # _acoustic_ and not the linguistic language embedding for the <bos> token.
39
+ offsets = torch.tensor([first_codex_idx - 1 for _ in range(bs)], dtype=torch.long, device=xx.device)
40
+ valid_logit_idx_start = len(texttok.vocab) # vocab['s2i']['quant0-0000']
41
+ valid_logit_idx_end = len(texttok.vocab) + len(speechtok.vocab) + 1 # vocab['s2i']['quant1-0000']
42
+ # Make mask that is True where we have valid outputs, False otherwise (where we have text outputs).
43
+ # logit_mask = torch.zeros(n_vocab, dtype=bool, device=x_inp.device)
44
+ # logit_mask[valid_logit_idx_start:valid_logit_idx_end] = True
45
+ # logit_mask[vocab['s2i']['<eos>']] = True
46
+ cum_logprobs = torch.zeros(bs, dtype=torch.float, device=x_inp.device)
47
+ eos_idx = len(texttok.vocab) + speechtok.special_tokens['<|endofspeech|>']
48
+ n_vocab = len(texttok.vocab) + len(speechtok.vocab)
49
+
50
+ logging.info(f"Starting beam decoding with beam_width={beam_width}")
51
+
52
+ prev_ids = [[] for _ in range(bs)]
53
+
54
+ cache = None
55
+ if use_kv_cache:
56
+ # Initialise kv cache
57
+ cache_window = min(codeclm.ar.args.sliding_window, x_inp.shape[-1] + max_len)
58
+ cache = RotatingBufferCache(codeclm.ar.args.n_layers, bs, cache_window, codeclm.ar.args.n_kv_heads, codeclm.ar.args.head_dim)
59
+ cache.to(device=x_inp.device, dtype=torch.float16)
60
+
61
+ counter = 0
62
+ while x_inp.shape[-1] < max_len:
63
+ counter += 1
64
+ gen_length = torch.tensor([x_inp.shape[-1] for _ in range(bs)], dtype=torch.long, device=xx.device)
65
+ padding_mask = length_to_mask(gen_length, offsets)
66
+
67
+ with torch.autocast('cuda', enabled=fp16):
68
+ logits: Tensor = codeclm(x_inp, padding_mask, spk_reference=ss_gen, cache=cache, counter=counter)
69
+ logits = logits.float()
70
+
71
+ logits = logits[:, -1] # select last index, now (bs, logit_dim)
72
+
73
+ # <---------------------- logit filtering ---------------------->
74
+ filtered_logits = logits.clone()
75
+
76
+ # apply repetition penalty before logit mask if any item in the beam has more than 1 prior token.
77
+ if len(prev_ids[0]) > 1:
78
+ filtered_logits = freq_rep_penalty(filtered_logits, previous=torch.tensor(prev_ids, dtype=torch.long),
79
+ alpha_frequency=alpha_frequency, alpha_presence=alpha_presence,
80
+ penalty_window=penalty_window)
81
+
82
+ filtered_logits[..., :valid_logit_idx_start-1] = float('-inf')
83
+ filtered_logits[..., valid_logit_idx_end:] = float('-inf')
84
+
85
+ if n_phones_gen is not None:
86
+ # apply eos penalty
87
+ filtered_logits = early_eos_penalty(filtered_logits, len(prev_ids[0]), n_phones_gen,
88
+ eos_penalty_decay, eos_penalty_factor,
89
+ eos_index=eos_idx)
90
+
91
+ filtered_logits = filtered_logits / temperature
92
+ filtered_logits = top_k_top_p_filtering(filtered_logits, top_k=topk, top_p=top_p)
93
+ filtered_logits = apply_typical_p(filtered_logits, mass=typical_p)
94
+
95
+ # mask out anything that isn't first quantizer output codes
96
+ filtered_logits[..., :valid_logit_idx_start-1] = float('-inf')
97
+ filtered_logits[..., valid_logit_idx_end:] = float('-inf')
98
+ logits = filtered_logits
99
+
100
+ # <---------------------- next frame prediction --------------------->
101
+
102
+ logprobs = logits.log_softmax(dim=-1)
103
+
104
+ # update assignments: if any beam ended in <eos> last step, it MUST also end in <eos> this step.
105
+ # so, below we set the log-probability of every non-<eos> token to -inf (and of <eos> to 0) for those beams.
106
+ for j in range(bs):
107
+ if x_inp[j, -1] == eos_idx:
108
+ # do not add any additional probability to it, keeping it the same for all vocab idxs
109
+ logprobs[j] = float('-inf') # zero probability of anything non-eos after 1 eos
110
+ logprobs[j, eos_idx] = 0 # probability=1 of <eos> after <eos>
111
+
112
+ candidate_cum_logprobs = cum_logprobs[:, None] + logprobs # (bs, 1) + (bs, vocab) -> (bs, vocab)
113
+
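+ # Sample the next tokens jointly across beams: flatten the (bs, vocab) log-probs and draw beam_width samples from that flattened distribution (each sample identifies both a beam and a token).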
114
+ logp_flat = logprobs.flatten()
115
+ candidates = torch.multinomial(logp_flat.exp(), num_samples=beam_width, replacement=False) # (bs,)
116
+ # Unravel the flat sample indices back into (beam index, token index) pairs:
117
+ beam_idxs = candidates // n_vocab # (bs,)
118
+ tok_inds_in_each_beam = candidates % n_vocab # (bs,)
119
+
120
+ # check for breaks
121
+ if torch.all(tok_inds_in_each_beam == eos_idx):
122
+ # apply length penalty:
123
+ non_eos_toks = (x_inp != eos_idx).sum(dim=-1) # (bs,) number of non eos toks
124
+ gen_length = non_eos_toks - first_codex_idx
125
+ penalties = (gen_length**beam_length_penalty)
126
+ penalized_cum_tok_logp = candidate_cum_logprobs / penalties[:, None]
127
+
128
+ eos_avg_logps = penalized_cum_tok_logp[:, eos_idx]
129
+ best_beam_idx = eos_avg_logps.argmax()
130
+ best_avg_logp = eos_avg_logps[best_beam_idx]
131
+ best_beam = x_inp[best_beam_idx]
132
+ logging.info((f"best beam = {best_beam_idx} @ penalized_cum_tok_logp = {best_avg_logp.item():.3f} |\n num toks: {non_eos_toks.cpu().tolist()}. "
133
+ f"Candidates: {eos_avg_logps.cpu()} |\n non-eos toks: {non_eos_toks.cpu().tolist()} |\n penalties: {penalties.cpu().tolist()} | "
134
+ f"raw cumulative probs: {candidate_cum_logprobs[:, eos_idx].cpu().tolist()}"))
135
+ break
136
+
137
+ # update beam histories:
138
+ x_inp = x_inp[beam_idxs]
139
+ # update next token
140
+ next_sample = tok_inds_in_each_beam
141
+ # update cum logprob
142
+ cum_logprobs = cum_logprobs[beam_idxs] + logprobs[beam_idxs, tok_inds_in_each_beam]
143
+ # update prior inds to point to correct beam
144
+ prev_ids = [copy.deepcopy(prev_ids[beam_idx.item()]) for beam_idx in beam_idxs]
145
+ # add new tokens to previous ids
146
+ for j in range(bs):
147
+ prev_ids[j].append(tok_inds_in_each_beam[j].item())
148
+
149
+ logging.debug("L%d | next sample: %s | beam: %s | cum_logp: %s", len(x_inp[0]), next_sample.cpu().tolist(), beam_idxs.cpu().tolist(), cum_logprobs.cpu())
150
+
151
+ # update cache with beam indexes
152
+ if cache is not None:
153
+ cache.cache_k = cache.cache_k[:, beam_idxs]
154
+ cache.cache_v = cache.cache_v[:, beam_idxs]
155
+
156
+ # add 1 None below to make (bs,) -> (bs, 1) so we can concat along seq len dim.
157
+ x_inp = torch.cat([x_inp, next_sample[:, None]], dim=-1)
158
+
159
+
160
+ if x_inp.shape[-1] >= max_len - 1:
161
+ logging.warning(f"[autoregressive generation] output length = {x_inp.shape[-1]} -- inference likely failed or input too long!")
162
+ best_beam = x_inp[0]
163
+
164
+ if not vocode: return best_beam # (seq_len,)
165
+ else: raise AssertionError("vocode=True is not supported here; vocoding is done by the caller (see Mars5TTS.vocode).")
mars5/diffuser.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Discrete multinomial diffusion code adapted from https://github.com/RF5/transfusion-asr,
3
+ which in turn is adapted from https://github.com/ehoogeboom/multinomial_diffusion.
4
+
5
+ Please see the original repo (https://github.com/ehoogeboom/multinomial_diffusion) and paper for full
6
+ details on how multinomial diffusion works -- thanks to the original authors!
7
+ """
8
+
9
+ import torch
10
+ from torch import Tensor
11
+ from torch.functional import F
12
+ import numpy as np
13
+ from dataclasses import dataclass
14
+ from typing import Union
15
+
16
+ # -------------- Multinomial utility functions -----------
17
+
18
+ MIN_LOG_ARG = 1e-7 # originally was 1e-40
19
+
20
+ def log_1_min_a(a): return torch.log((1 - a.exp()).clamp_(min=1e-30))
21
+
22
+ def log_add_exp(a, b):
23
+ maximum = torch.max(a, b)
24
+ return maximum + torch.log(torch.exp(a - maximum) + torch.exp(b - maximum))
25
+
26
+ def extract(a: Tensor, t, x_shape):
27
+ """ Given 1D vector of alpha/alpha_cum/betas, get index at `t` of shape (bs,), and then
28
+ broadcast it to number of dims in `x_shape`.
29
+ """
30
+ b, *_ = t.shape
31
+ out = a.gather(-1, t)
32
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
33
+
34
+ def index_to_log_onehot(x, num_classes, dim=-1, dtype=torch.float32):
35
+ """ Convert indices `x` (bs, ...) to approx one-hot log-probs of shape (bs, ..., num_classes) """
36
+ assert x.max().item() < num_classes, \
37
+ f'Error: {x.max().item()} >= {num_classes}'
38
+ x_onehot = F.one_hot(x, num_classes)
39
+ if dim == 1:
40
+ permute_order = (0, -1) + tuple(range(1, len(x.size())))
41
+ x_onehot = x_onehot.permute(permute_order)
42
+ else:
43
+ pass
44
+
45
+ log_x = torch.log(x_onehot.to(dtype).clamp(min=MIN_LOG_ARG)) # so min(log_x) will be -30
46
+
47
+ return log_x
48
+
49
+ def sum_except_batch(x: Tensor, num_dims=1) -> Tensor:
50
+ '''
51
+ Sums all dimensions except the first.
52
+ Args:
53
+ x: Tensor, shape (batch_size, ...)
54
+ num_dims: int, number of batch dims (default=1)
55
+ Returns:
56
+ x_sum: Tensor, shape (batch_size,)
57
+ '''
58
+ return x.reshape(*x.shape[:num_dims], -1).sum(-1)
59
+
60
+ # -------------- Multinomial diffusion class -------------
61
+
62
+ class MultinomialDiffusion():
63
+ def __init__(self, num_classes, timesteps=100, diffusion_s=0.008,
64
+ loss_type='vb_stochastic', parametrization='x0',
65
+ dtype=torch.float32,
66
+ device='cpu'):
67
+ super(MultinomialDiffusion, self).__init__()
68
+ assert loss_type in ('vb_stochastic',)
69
+ assert parametrization in ('x0', 'direct')
70
+
71
+ self.num_classes = num_classes
72
+ self.loss_type = loss_type
73
+ self.num_timesteps = timesteps
74
+ self.parametrization = parametrization
75
+
76
+ alphas = self.cosine_beta_schedule(timesteps, diffusion_s)
77
+
78
+ alphas = alphas.to(torch.float64)
79
+ log_alpha = alphas.log()
80
+ log_cumprod_alpha = torch.cumsum(log_alpha, dim=-1)
81
+
82
+ log_1_min_alpha = log_1_min_a(log_alpha) # = log(betas)
83
+
84
+ log_1_min_cumprod_alpha = log_1_min_a(log_cumprod_alpha) # = log(1- \bar{a})
85
+ a = log_add_exp(log_alpha, log_1_min_alpha) # log(1-beta + beta) = log(1) = 0
86
+
87
+ assert log_add_exp(log_alpha, log_1_min_alpha).abs().sum().item() < 1.e-5
88
+ assert log_add_exp(log_cumprod_alpha, log_1_min_cumprod_alpha).abs().sum().item() < 1e-5
89
+ assert (torch.cumsum(log_alpha, dim=-1) - log_cumprod_alpha).abs().sum().item() < 1.e-5
90
+
91
+ # Convert to float32 and register buffers.
92
+ self.log_alpha = log_alpha.to(dtype).to(device)
93
+ self.log_1_min_alpha = log_1_min_alpha.to(dtype).to(device)
94
+ self.log_cumprod_alpha = log_cumprod_alpha.to(dtype).to(device)
95
+ self.log_1_min_cumprod_alpha = log_1_min_cumprod_alpha.to(dtype).to(device)
96
+
97
+ @staticmethod
98
+ def cosine_beta_schedule(timesteps, s=0.008) -> Tensor:
99
+ """
100
+ cosine schedule as proposed in https://arxiv.org/abs/2102.09672 .
101
+ Returns alpha parameters, NOT Beta
102
+ """
103
+ steps = timesteps + 1
104
+ x = torch.linspace(0, timesteps, steps)
105
+ alphas_cumprod = torch.cos(((x / timesteps) + s) / (1 + s) * torch.pi * 0.5) ** 2
106
+ alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
107
+ alphas = (alphas_cumprod[1:] / alphas_cumprod[:-1])
108
+ alphas = torch.clamp(alphas, 0.001, 1.0)
109
+ return torch.sqrt(alphas)
110
+
111
+ def multinomial_kl(self, log_prob1: Tensor, log_prob2: Tensor, dim=-1) -> Tensor:
112
+ """ Get KL divergence between two categorical distributions specified with `log_prob1` and `log_prob2`.
113
+ Assumed probability dim is `dim` (i.e. log_prob1.exp().sum(dim=`dim`) should be tensor of ones)
114
+ """
115
+ kl = (log_prob1.exp() * (log_prob1 - log_prob2)).sum(dim=dim)
116
+ return kl
117
+
118
+ def q_pred_one_timestep(self, log_x_t: Tensor, t: Tensor) -> Tensor:
119
+ """ Compute q(x_t | x_{t-1}) = C(x_t | alpha_t * x_{t-1} + (1-alpha_t)/K in the log-domain
120
+ given `log_x_t` as log one-hot encoding of x_t.
121
+
122
+ Recall due to symmetry property we can compute
123
+ this value using x_t instead of x_{t-1} (see appendix A of https://arxiv.org/pdf/2102.05379.pdf)
124
+ """
125
+ dt = log_x_t.dtype
126
+ log_alpha_t = extract(self.log_alpha, t, log_x_t.shape).to(dt)
127
+ log_1_min_alpha_t = extract(self.log_1_min_alpha, t, log_x_t.shape).to(dt)
128
+
129
+ # alpha_t * E[xt] + (1 - alpha_t) 1 / K
130
+ log_probs = log_add_exp(
131
+ log_x_t + log_alpha_t,
132
+ log_1_min_alpha_t - np.log(self.num_classes)
133
+ )
134
+ return log_probs
135
+
136
+ def q_pred_one_timestep_scaled(self, log_x_t: Tensor, t: Tensor, c: int, jump_len: int) -> Tensor:
137
+ """ Compute q(x_t | x_{t-1}) = C(x_t | alpha_t * x_{t-1} + (1-alpha_t)/K in the log-domain
138
+ given `log_x_t` as log one-hot encoding of x_t.
139
+
140
+ Recall due to symmetry property we can compute
141
+ this value using x_t instead of x_{t-1} (see appendix A of https://arxiv.org/pdf/2102.05379.pdf)
142
+ """
143
+ dt = log_x_t.dtype
144
+ log_alpha_t = extract(self.log_alpha, t, log_x_t.shape).to(dt)
145
+ log_1_min_alpha_t = extract(self.log_1_min_alpha, t, log_x_t.shape).to(dt)
146
+
147
+ # Sequentially progressive noising: a sigmoid ramp along the sequence axis scales how much noise each position receives at this resampling pass (cf. DSH.enable_kevin_scaled_inference).
148
+ xax = torch.arange(0,log_x_t.shape[1],1).to(log_x_t.device)
149
+ aa=log_x_t.shape[1]*(c/jump_len)
150
+ sig = 1/(1+torch.exp(-(xax-aa+20)/8))
151
+ log_alpha_t = (torch.log(1/sig)[None,:,None] + log_alpha_t).clamp(-torch.inf, 0)
152
+ log_1_min_alpha_t = torch.log(sig)[None,:,None] + log_1_min_alpha_t
153
+
154
+ # alpha_t * E[xt] + (1 - alpha_t) 1 / K
155
+ log_probs = log_add_exp(
156
+ log_x_t + log_alpha_t,
157
+ log_1_min_alpha_t - np.log(self.num_classes)
158
+ )
159
+ return log_probs
160
+
161
+ def q_pred(self, log_x_start: Tensor, t) -> Tensor:
162
+ """ Compute q(x_t | x_0) = C(x_t | bar{alpha}_t * x_0 + (1 - bar{alpha}_t)/K ) in log domain,
163
+ given `log_x_start` of log probs of x_0.
164
+ """
165
+ dt = log_x_start.dtype
166
+ log_cumprod_alpha_t = extract(self.log_cumprod_alpha, t, log_x_start.shape).to(dt)
167
+ log_1_min_cumprod_alpha = extract(self.log_1_min_cumprod_alpha, t, log_x_start.shape).to(dt)
168
+
169
+ log_probs = log_add_exp(
170
+ log_x_start + log_cumprod_alpha_t,
171
+ log_1_min_cumprod_alpha - np.log(self.num_classes)
172
+ )
173
+
174
+ return log_probs
175
+
176
+ def q_posterior(self, log_x_start, log_x_t, t):
177
+ """ Compute `q(xt-1 | xt, x0) = q(xt | xt-1, x0) * q(xt-1 | x0) / q(xt | x0)`
178
+ where q(xt | xt-1, x0) = q(xt | xt-1).
179
+ """
180
+ # q(xt-1 | xt, x0) = q(xt | xt-1, x0) * q(xt-1 | x0) / q(xt | x0)
181
+ # where q(xt | xt-1, x0) = q(xt | xt-1).
182
+
183
+ t_minus_1 = t - 1
184
+ # Remove negative values, will not be used anyway for final decoder
185
+ t_minus_1 = torch.where(t_minus_1 < 0, torch.zeros_like(t_minus_1), t_minus_1)
186
+ log_EV_qxtmin_x0 = self.q_pred(log_x_start, t_minus_1) # log( q(x_{t-1} | x_0) )
187
+ # if t == 0, then log( q(x_0 | x_0) ) = log( one_hot(x_0) ), not even random at that point.
188
+ # so, where t == 0
189
+ num_axes = (1,) * (len(log_x_start.size()) - 1)
190
+ t_broadcast = t.view(-1, *num_axes) * torch.ones_like(log_x_start) # broadcast to non-batch axes
191
+ log_EV_qxtmin_x0 = torch.where(t_broadcast == 0, log_x_start, log_EV_qxtmin_x0)
192
+ # where it is zero, replace
193
+ # with log one-hot encoding of x0.
194
+
195
+ # Note: _NOT_ x_tmin1, which is how the formula is typically used!!!
196
+ # Not very easy to see why this is true. But it is :)
197
+ # log_EV_qxtmin_x0 ~ q(x_{t-1} | x_0)
198
+ # q_pred_one_timestep(log_x_t, t) ~ q(x_t | x_{t-1}) (which due to symmetry can be computed using x_t)
199
+ unnormed_logprobs = log_EV_qxtmin_x0 + self.q_pred_one_timestep(log_x_t, t) # numerator of bayes
200
+
201
+ # approximate denominator with just a normalizing sum.
202
+ log_EV_xtmin_given_xt_given_xstart = \
203
+ unnormed_logprobs \
204
+ - torch.logsumexp(unnormed_logprobs, dim=-1, keepdim=True)
205
+
206
+ return log_EV_xtmin_given_xt_given_xstart
207
+
208
+ def p_pred(self, log_x_t, t, log_x0_pred):
209
+ """ Predict `p(x_{t-1} | x_t)` using `q(xt-1 | xt, hat{x0})`, where `hat{x0}` is given by
210
+ log probabilities from model as `log_x0_pred` (bs, ...., K) and x_t is given by
211
+ `log_x_t` of shape `(bs, ..., K)`
212
+ """
213
+ # log_x_recon = self.predict_start(log_x, t=t) # model itself predicts x_0
214
+ # log_x0_pred
215
+ log_model_pred = self.q_posterior(
216
+ log_x_start=log_x0_pred, log_x_t=log_x_t, t=t)
217
+ return log_model_pred
218
+
219
+ def log_sample_categorical(self, logprobs: Tensor, dim=-1) -> Tensor:
220
+ """ Sample from categorical `logprobs` (bs, ..., probs), where position of probs is specified
221
+ by `dim`.
222
+
223
+ Returns sampled long indices of shape `(bs, ...)`
224
+ """
225
+ uniform = torch.rand_like(logprobs)
226
+ gumbel_noise = -torch.log( (-torch.log(uniform.clamp_(min=MIN_LOG_ARG)) ).clamp_(min=MIN_LOG_ARG))
227
+ sample = (gumbel_noise + logprobs).argmax(dim=dim)
228
+ return sample
229
+
230
+ def q_sample(self, log_x_start, t):
231
+ """ Draw `x_t` ~ q(x_t | x_0) . `log_x_start` is of shape `(bs, ..., K)`, returns result of same shape """
232
+ log_EV_qxt_x0 = self.q_pred(log_x_start, t)
233
+ sample = self.log_sample_categorical(log_EV_qxt_x0)
234
+ # log_sample = index_to_log_onehot(sample, self.num_classes)
235
+
236
+ return sample #log_sample
237
+
238
+ def compute_Lt(self, log_x_start: Tensor, log_x_t: Tensor, log_x0_pred: Tensor, t,
239
+ detach_mean=False, include_kl_prior=True):
240
+ """ Get loss given one-hot log x_0, one-hot log x_t, t, and model prediction `log_x0_pred`.
241
+ Parameters:
242
+ - `log_x_start`: ground-truth input x0, converted to log one-hot (bs, ..., K)
243
+ - `log_x_t`: sampled noisy input at `x_t`, converted to log one-hot (bs, ..., K)
244
+ - `t`: diffusion timestep (bs,)
245
+ - `log_x0_pred`: model prediction of log probabilities of x0, i.e. hat{x0}.
246
+ - `include_kl_prior`: add last two terms to model loss (does not change optimization problem).
247
+ """
248
+ dtype = log_x_start.dtype
249
+ log_true_prob = self.q_posterior(
250
+ log_x_start=log_x_start, log_x_t=log_x_t, t=t)
251
+
252
+ log_model_prob = self.p_pred(log_x_t=log_x_t, t=t, log_x0_pred=log_x0_pred)
253
+
254
+ if detach_mean:
255
+ log_model_prob = log_model_prob.detach()
256
+
257
+ kl = self.multinomial_kl(log_true_prob, log_model_prob)
258
+ kl = sum_except_batch(kl)
259
+
260
+ # Add L_0, -log(p(x_0 | x_1))
261
+ decoder_nll = - (log_x_start.exp() * log_model_prob).sum(dim=-1)
262
+ decoder_nll = sum_except_batch(decoder_nll)
263
+
264
+ mask = (t == torch.zeros_like(t)).to(dtype)
265
+ loss = mask * decoder_nll + (1. - mask) * kl # only add L0 if t == 0.
266
+
267
+ if include_kl_prior:
268
+ pt = torch.ones_like(t, dtype=dtype)
269
+ kl_prior = self.kl_prior(log_x_start)
270
+ loss = (kl) + kl_prior
271
+
272
+ return loss
273
+
274
+ def kl_prior(self, log_x_start: Tensor) -> Tensor:
275
+ """ This function computes -H_{q}(x_T | x_0)+H_{p}(x_T), which
276
+ by some math (see wiki for KL div relation to conditional entropy).
277
+ So KL(q(x_T | x_0) || 1/K) = -H_{q}(x_T | x_0)+H_{p}(x_T) for categorical distribution.
278
+
279
+ Given `log_x_start` (bs, ..., probs), return KL prior of shape (bs,)
280
+ """
281
+ b = log_x_start.size(0)
282
+ device = log_x_start.device
283
+ ones = torch.ones(b, device=device, dtype=torch.long)
284
+
285
+ log_qxT_prob = self.q_pred(log_x_start, t=(self.num_timesteps - 1) * ones) # q(x_T | x_0)
286
+ log_half_prob = -torch.log(self.num_classes * torch.ones_like(log_qxT_prob)) # log(1/K), broadcast to q(x_T|x_0) shape
287
+
288
+ kl_prior = self.multinomial_kl(log_qxT_prob, log_half_prob)
289
+ return sum_except_batch(kl_prior)
290
+
291
+
292
+ def index2logit(x: Tensor, vocab_size: int, dtype=torch.float32):
293
+ x = F.one_hot(x, num_classes=vocab_size).to(dtype)
294
+ x = x * (vocab_size/(vocab_size - 1)) - 1/(vocab_size - 1)
295
+ return x
296
+
297
+
298
+ # ------------------------------
299
+ # Functions adapted from the full TransFusion ASR inference code (https://github.com/RF5/transfusion-asr)
300
+
301
+
302
+ @dataclass
303
+ class DSH():
304
+ # Diffusion Sampling Hyperparameters [DSH] (Section 4)
305
+ jump_len: int = 1 # j in RePaint paper [default 10] (Section 4.1)
306
+ jump_n_sample: int = 1 # r in RePaint paper [default 10] (Section 4.1)
307
+ last_greedy: bool = False # whether to not sample at t=0, but take argmax prediction. [default False]
308
+ x_0_temp: float = 1.0 # reweight temp for model prediction of x0
309
+ guidance_w: float = 1.0 # classifier free guidance weight [default 1.5] (Section 4.3)
310
+ enable_kevin_scaled_inference: bool = True # sequentially progressive diffusion [default True] (Section 4.2)
311
+ T_override: Union[None, int] = None # allow variable transcription sizes during inference (Section 4.4)
312
+
313
+ deep_clone: bool = False # whether to do deep clone.
314
+ q0_override_steps: int = 0 # number of steps that we allow overriding the input quant level 0 inputs.
315
+ progress: bool = False # whether to show progress bar
316
+
317
+
318
+ def get_schedule(t_T, jump_len=10, jump_n_sample=10):
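+ """ Build the RePaint resampling schedule of timesteps: count down from t_T-1 to 0, and at every `jump_len` steps jump back up by `jump_len` (repeated `jump_n_sample`-1 times), ending with a -1 sentinel. See https://arxiv.org/abs/2201.09865. """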
319
+ jumps = {}
320
+ for j in range(0, t_T - jump_len, jump_len):
321
+ jumps[j] = jump_n_sample - 1
322
+ t = t_T
323
+ ts = []
324
+ while t >= 1:
325
+ t = t-1
326
+ ts.append(t)
327
+ if jumps.get(t, 0) > 0:
328
+ jumps[t] = jumps[t] - 1
329
+ for _ in range(jump_len):
330
+ t = t + 1
331
+ ts.append(t)
332
+ ts.append(-1)
333
+ return ts
334
+
335
+
336
+ def forward_diffusion(diff: MultinomialDiffusion, dtype, x, t, c=None, dsh=DSH):
337
+ """Simple forward diffusion process p"""
338
+ log_x_t = index_to_log_onehot(x, diff.num_classes, dtype=dtype)
339
+ if c is not None: x = diff.q_pred_one_timestep_scaled(log_x_t, t, c, dsh.jump_len)
340
+ else: x = diff.q_pred_one_timestep(log_x_t, t)
341
+ x = diff.log_sample_categorical(x)
342
+ return x
343
+
344
+
345
+ def reverse_diffusion(diff: MultinomialDiffusion, model, batch, x_known=None, m=None,
346
+ last_greedy=False, temperature=1.0, alphas=None, ensemble_size=1, dsh=DSH):
347
+ """Reverse diffusion process q: predict x_{t-1} given x, t, x_known, m. Optionally do not sample model output
348
+ for t=0, but rather use the greedy argmax with `last_greedy`.
349
+ """
350
+ x = batch[4]
351
+ t = batch[-1]
352
+ if x_known is None: x_known = torch.zeros_like(x)
353
+ if m is None: m = torch.zeros_like(x)
354
+
355
+ # Equation 8b
356
+ # for b in batch:
357
+ # print(f"{b.shape}: {b}")
358
+ x_0_pred = model(*batch) # (bs, seq_len, logit_dim, n_quant)
359
+ x_0_pred = x_0_pred.permute(0, 1, 3, 2) # (bs, seq_len, n_quant, dim)
360
+
361
+ if dsh.guidance_w != 1:
362
+ uncond_x_0_pred = model(*(c.clone() if c is not None else None for c in batch), drop_cond=True)
363
+ uncond_x_0_pred = uncond_x_0_pred.permute(0, 1, 3, 2)
364
+ x_0_pred = dsh.guidance_w*x_0_pred + (1-dsh.guidance_w)*uncond_x_0_pred
365
+
366
+ x_0_pred = x_0_pred / temperature
367
+ log_x_0_pred = F.log_softmax(x_0_pred, dim=-1)
368
+ log_x_t = index_to_log_onehot(x, diff.num_classes, dtype=x_0_pred.dtype)
369
+
370
+ # print("PRE: ", log_x_t.shape, t.shape, log_x_0_pred.shape)
371
+ log_model_pred = diff.p_pred(log_x_t, t, log_x_0_pred) # p(x_{t-1} | x_{t})
372
+
373
+ a_t = alphas[t[0]] if alphas is not None else 0
374
+ mat = torch.eye(ensemble_size, device=x.device)*(1-a_t)
375
+ mat += 1/ensemble_size * a_t
376
+ mat = torch.block_diag(*([mat]*(x.shape[0]//ensemble_size)))
377
+ log_model_pred = ( (mat[..., None, None] ).log().to(x.dtype) + log_model_pred[None])
378
+ log_model_pred = torch.logsumexp(log_model_pred, dim=1)
379
+
380
+ if (t==0).all() and last_greedy: # Do not sample at t=0
381
+ x_tm1_unknown = log_model_pred.argmax(dim=-1)
382
+ else:
383
+ x_tm1_unknown = diff.log_sample_categorical(log_model_pred)
384
+
385
+ # Equation 8a
386
+ x_known_log = index_to_log_onehot(x_known, diff.num_classes, dtype=x_0_pred.dtype)
387
+ if (t==0).all(): # Do not sample at t=0
388
+ x_tm1_known = x_known
389
+ else:
390
+ x_tm1_known = diff.q_sample(x_known_log, t)
391
+
392
+ # Equation 8c
393
+ x_tm1 = x_tm1_known * m.long() + x_tm1_unknown * (1 - m.long())
394
+ return x_tm1, x_0_pred
395
+
396
+
397
+
398
+ @torch.inference_mode()
399
+ def perform_simple_inference(model: torch.nn.Module, batch: tuple, diff: MultinomialDiffusion, T, dtype=torch.float16,
400
+ retain_quant0: bool = True, dsh=DSH):
401
+ """ If `retain_quant0`, then do not sample quant0 in each forward or reverse diffusion step. """
402
+
403
+ # (bs=1, N), (bs, seq_len2, 8), (bs,)
404
+ c_text, c_codes, c_text_lengths, c_codes_lengths, x, x_padding_mask = batch
405
+
406
+ device = c_text.device
407
+ bs = c_text.shape[0]
408
+ x_quant0 = x[..., 0].clone() # (bs, seq_len) 0th quant level
409
+ x = torch.randint(0, diff.num_classes, x.shape, dtype=x.dtype, device=device)
410
+ # CRITICAL LINE: override quantization level 0 with provided quant0 level.
411
+ x[..., 0] = x_quant0
412
+
413
+ # RePaint paper resample scheduling
414
+ times = get_schedule(T, jump_n_sample=dsh.jump_n_sample, jump_len=dsh.jump_len)
415
+
416
+ x_known = torch.zeros_like(x)
417
+ x_known[..., 0] = x[..., 0] # override L0 codes
418
+ m = torch.zeros_like(x).bool()
419
+ # (bs, seq_len, 8)
420
+ m[..., 0] = True
421
+
422
+ offset = 0
423
+ if dsh.deep_clone:
424
+ print(f"Note: using deep clone. Assuming input `c_phones` is concatenated prompt and output phones.",
425
+ "Also assuming no padded indices in `c_codes`.")
426
+ prompt = c_codes
427
+ x = torch.cat((prompt, x), dim=1) # (bs=1, sl1 + sl2, 8)
428
+ x_known = torch.cat((prompt, x_known), dim=1)
429
+ x_padding_mask = torch.cat((
430
+ torch.zeros(x_padding_mask.shape[0], c_codes_lengths[0], dtype=torch.bool, device=x_padding_mask.device),
431
+ x_padding_mask), dim=-1
432
+ )
433
+ # (bs=1, :up to prompt duration, all 8 codebooks) = True/masked.
434
+ m = torch.cat((torch.ones_like(prompt), m), dim=1)
435
+ x_quant0 = torch.cat((prompt[..., 0], x_quant0), dim=-1)
436
+ offset = c_codes_lengths[0]
437
+
438
+ print(f"New x: {x.shape} | new x_known: {x_known.shape} . Base prompt: {prompt.shape}. New padding mask: {x_padding_mask.shape} | m shape: {m.shape}")
439
+
440
+ c = 0 # sequentially progressive diffusion offset (Section 4.2)
441
+
442
+ # ensemble bs (not in paper)
443
+ alphas = torch.linspace(1, 0, T).to(device)
444
+
445
+ pb = zip(times[:-1], times[1:])
446
+ if dsh.progress:
447
+ from fastprogress import progress_bar
448
+ pb = progress_bar(pb, total=len(times)-1)
449
+
450
+ # See RePaint paper algorithm
451
+ for t_last, t_cur in pb:
452
+
453
+ t = torch.ones((bs,), dtype=torch.long, device=x.device) * (t_last)
454
+ if t_cur < t_last:
455
+ if c > dsh.jump_n_sample:
456
+ c = 0
457
+ c += 1/dsh.jump_len
458
+
459
+ # Reverse diffusion: q
460
+ cbatch = (c_text, c_codes, c_text_lengths, c_codes_lengths, x, x_padding_mask, t)
461
+ x, x_0_pred = reverse_diffusion(diff, model, cbatch, x_known, m, temperature=dsh.x_0_temp, alphas=alphas, ensemble_size=1, dsh=dsh)
462
+ else:
463
+ # Forward diffusion: p
464
+ if dsh.enable_kevin_scaled_inference: x = forward_diffusion(diff, dtype, x, t, c=c, dsh=dsh)
465
+ else: x = forward_diffusion(diff, dtype, x, t, c=None, dsh=dsh)
466
+
467
+ if retain_quant0 and dsh.q0_override_steps < t_last:
468
+ x[..., 0] = x_quant0
469
+
470
+ # crop offset:
471
+ x = x[:, offset:]
472
+ return x
mars5/minbpe/base.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains the base Tokenizer class and a few common helper functions.
3
+ The base class also contains the (common) save/load functionality.
4
+ It would be possible to be a lot more strict about the interface and
5
+ e.g. isolating all regex/pattern parts to the RegexTokenizer, but
6
+ some concessions are made for simplicity.
7
+ """
8
+ import unicodedata
9
+
10
+ # -----------------------------------------------------------------------------
11
+ # a few helper functions useful for both BasicTokenizer and RegexTokenizer
12
+
13
+ def get_stats(ids, counts=None):
14
+ """
15
+ Given a list of integers, return a dictionary of counts of consecutive pairs
16
+ Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
17
+ Optionally allows to update an existing dictionary of counts
18
+ """
19
+ counts = {} if counts is None else counts
20
+ for pair in zip(ids, ids[1:]): # iterate consecutive elements
21
+ counts[pair] = counts.get(pair, 0) + 1
22
+ return counts
23
+
24
+
25
+ def merge(ids, pair, idx):
26
+ """
27
+ In the list of integers (ids), replace all consecutive occurrences
28
+ of pair with the new integer token idx
29
+ Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
30
+ """
31
+ newids = []
32
+ i = 0
33
+ while i < len(ids):
34
+ # if not at the very last position AND the pair matches, replace it
35
+ if ids[i] == pair[0] and i < len(ids) - 1 and ids[i+1] == pair[1]:
36
+ newids.append(idx)
37
+ i += 2
38
+ else:
39
+ newids.append(ids[i])
40
+ i += 1
41
+ return newids
42
+
43
+ # first two helper functions...
44
+ def replace_control_characters(s: str) -> str:
45
+ # we don't want to print control characters
46
+ # which distort the output (e.g. \n or much worse)
47
+ # https://stackoverflow.com/questions/4324790/removing-control-characters-from-a-string-in-python/19016117#19016117
48
+ # http://www.unicode.org/reports/tr44/#GC_Values_Table
49
+ chars = []
50
+ for ch in s:
51
+ if unicodedata.category(ch)[0] != "C":
52
+ chars.append(ch) # this character is ok
53
+ else:
54
+ chars.append(f"\\u{ord(ch):04x}") # escape
55
+ return "".join(chars)
56
+
57
+ def render_token(t: bytes) -> str:
58
+ # pretty print a token, escaping control characters
59
+ s = t.decode('utf-8', errors='replace')
60
+ s = replace_control_characters(s)
61
+ return s
62
+
63
+ # -----------------------------------------------------------------------------
64
+ # the base Tokenizer class
65
+
66
+ class Tokenizer:
67
+ """Base class for Tokenizers"""
68
+
69
+ def __init__(self):
70
+ # default: vocab size of 256 (all bytes), no merges, no patterns
71
+ self.merges = {} # (int, int) -> int
72
+ self.pattern = "" # str
73
+ self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
74
+ self.vocab = self._build_vocab() # int -> bytes
75
+
76
+ def train(self, text, vocab_size, verbose=False):
77
+ # Tokenizer can train a vocabulary of size vocab_size from text
78
+ raise NotImplementedError
79
+
80
+ def encode(self, text):
81
+ # Tokenizer can encode a string into a list of integers
82
+ raise NotImplementedError
83
+
84
+ def decode(self, ids):
85
+ # Tokenizer can decode a list of integers into a string
86
+ raise NotImplementedError
87
+
88
+ def _build_vocab(self):
89
+ # vocab is simply and deterministically derived from merges
90
+ vocab = {idx: bytes([idx]) for idx in range(256)}
91
+ for (p0, p1), idx in self.merges.items():
92
+ vocab[idx] = vocab[p0] + vocab[p1]
93
+ for special, idx in self.special_tokens.items():
94
+ vocab[idx] = special.encode("utf-8")
95
+ return vocab
96
+
97
+ def save(self, file_prefix):
98
+ """
99
+ Saves two files: file_prefix.vocab and file_prefix.model
100
+ This is inspired by (but not equivalent to!) sentencepiece's model saving:
101
+ - model file is the critical one, intended for load()
102
+ - vocab file is just a pretty printed version for human inspection only
103
+ """
104
+ # write the model: to be used in load() later
105
+ model_file = file_prefix + ".model"
106
+ with open(model_file, 'w') as f:
107
+ # write the version, pattern and merges, that's all that's needed
108
+ f.write("minbpe v1\n")
109
+ f.write(f"{self.pattern}\n")
110
+ # write the special tokens, first the number of them, then each one
111
+ f.write(f"{len(self.special_tokens)}\n")
112
+ for special, idx in self.special_tokens.items():
113
+ f.write(f"{special} {idx}\n")
114
+ # the merges dict
115
+ for idx1, idx2 in self.merges:
116
+ f.write(f"{idx1} {idx2}\n")
117
+ # write the vocab: for the human to look at
118
+ vocab_file = file_prefix + ".vocab"
119
+ inverted_merges = {idx: pair for pair, idx in self.merges.items()}
120
+ with open(vocab_file, "w", encoding="utf-8") as f:
121
+ for idx, token in self.vocab.items():
122
+ # note: many tokens may be partial utf-8 sequences
123
+ # and cannot be decoded into valid strings. Here we're using
124
+ # errors='replace' to replace them with the replacement char �.
125
+ # this also means that we couldn't possibly use .vocab in load()
126
+ # because decoding in this way is a lossy operation!
127
+ s = render_token(token)
128
+ # find the children of this token, if any
129
+ if idx in inverted_merges:
130
+ # if this token has children, render it nicely as a merge
131
+ idx0, idx1 = inverted_merges[idx]
132
+ s0 = render_token(self.vocab[idx0])
133
+ s1 = render_token(self.vocab[idx1])
134
+ f.write(f"[{s0}][{s1}] -> [{s}] {idx}\n")
135
+ else:
136
+ # otherwise this is leaf token, just print it
137
+ # (this should just be the first 256 tokens, the bytes)
138
+ f.write(f"[{s}] {idx}\n")
139
+
140
+ def load(self, model_file):
141
+ """Inverse of save() but only for the model file"""
142
+ model_file = str(model_file)
143
+ assert model_file.endswith(".model")
144
+ # read the model file
145
+ merges = {}
146
+ special_tokens = {}
147
+ idx = 256
148
+ with open(model_file, 'r', encoding="utf-8") as f:
149
+ # read the version
150
+ version = f.readline().strip()
151
+ assert version == "minbpe v1"
152
+ # read the pattern
153
+ self.pattern = f.readline().strip()
154
+ # read the special tokens
155
+ num_special = int(f.readline().strip())
156
+ for _ in range(num_special):
157
+ special, special_idx = f.readline().strip().split()
158
+ special_tokens[special] = int(special_idx)
159
+ # read the merges
160
+ for line in f:
161
+ idx1, idx2 = map(int, line.split())
162
+ merges[(idx1, idx2)] = idx
163
+ idx += 1
164
+ self.merges = merges
165
+ self.special_tokens = special_tokens
166
+ self.vocab = self._build_vocab()
mars5/minbpe/codebook.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
3
+
4
+ Unlike RegexTokenizer:
5
+ - Operates on integer codes from an encodec codebook.
6
+ """
7
+
8
+ import regex as re
9
+ from .base import Tokenizer, get_stats, merge
10
+
11
+
12
+ class CodebookTokenizer(Tokenizer):
13
+
14
+ def __init__(self, pattern=None, codebook_size=1024):
15
+ """
16
+ - pattern: optional string to override the default (GPT-4 split pattern)
17
+ - codebook_size: number of base codec tokens in the vocabulary (default 1024).
18
+ Special tokens are registered afterwards via register_special_tokens(), e.g. {'<|endoftext|>': 100257}.
19
+ """
20
+ self.merges = {} # (int, int) -> int
21
+ self.pattern = pattern
22
+ self.compiled_pattern = re.compile(self.pattern)
23
+ self.special_tokens = {} # str -> int, e.g. {'<|endoftext|>': 100257}
24
+ self.inverse_special_tokens = {}
25
+ self.codebook_size = codebook_size
26
+ self.vocab = self._build_vocab() # int -> bytes
27
+
28
+ def train(self, text, vocab_size, verbose=False):
29
+ assert vocab_size >= self.codebook_size
30
+ num_merges = vocab_size - self.codebook_size
31
+
32
+ # split the text up into text chunks
33
+ # text is a continuous signal, there is no splitting it up.
34
+ text_chunks = [text,] # re.findall(self.compiled_pattern, text)
35
+
36
+ # input text preprocessing
37
+ ids = [[int(idx) for idx in ch.split(' ')] for ch in text_chunks]
38
+
39
+ # iteratively merge the most common pairs to create new tokens
40
+ merges = {} # (int, int) -> int
41
+ # vocab = {idx: bytes([idx]) for idx in range(self.codebook_size)} # idx -> bytes
42
+ vocab = {idx: f" {idx:04d}".encode('utf-8') for idx in range(self.codebook_size)} # idx -> bytes
43
+
44
+ for i in range(num_merges):
45
+ # count the number of times every consecutive pair appears
46
+ stats = {}
47
+ for chunk_ids in ids:
48
+ # passing in stats will update it in place, adding up counts
49
+ get_stats(chunk_ids, stats)
50
+ # find the pair with the highest count
51
+ pair = max(stats, key=stats.get)
52
+ # mint a new token: assign it the next available id
53
+ idx = self.codebook_size + i
54
+ # replace all occurrences of pair in ids with idx
55
+ ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
56
+ # save the merge
57
+ merges[pair] = idx
58
+ vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
59
+ # prints
60
+ if verbose:
61
+ print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
62
+
63
+ # save class variables
64
+ self.merges = merges # used in encode()
65
+ self.vocab = vocab # used in decode()
66
+
67
+ def register_special_tokens(self, special_tokens):
68
+ # special_tokens is a dictionary of str -> int
69
+ # example: {"<|endoftext|>": 100257}
70
+ self.special_tokens = special_tokens
71
+ self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
72
+
73
+ def decode(self, ids):
74
+ # given ids (list of integers), return Python string
75
+ part_bytes = []
76
+ for idx in ids:
77
+ if idx in self.vocab:
78
+ part_bytes.append(self.vocab[idx])
79
+ elif idx in self.inverse_special_tokens:
80
+ part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
81
+ else:
82
+ raise ValueError(f"invalid token id: {idx}")
83
+ text_bytes = b"".join(part_bytes)
84
+ text = text_bytes.decode("utf-8", errors="replace")
85
+ return text
86
+
87
+ def decode_int(self, ids) -> list[int]:
88
+ ret: str = self.decode(ids)
89
+ for s in self.special_tokens:
90
+ ret = ret.replace(s, ' ' + s + ' ')
91
+ ret = ret.strip()
92
+ ret = [int(t) if t[0].isnumeric() else t for t in ret.split(' ') if len(t) > 0]
93
+ return ret
94
+
95
+ def _encode_chunk(self, text_bytes):
96
+ # return the token ids
97
+ # here the chunk is already a list of integer codec ids, so just copy it
98
+ ids = list(text_bytes)
99
+ while len(ids) >= 2:
100
+ # find the pair with the lowest merge index
101
+ stats = get_stats(ids)
102
+ pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
103
+ # subtle: if there are no more merges available, the key will
104
+ # result in an inf for every single pair, and the min will be
105
+ # just the first pair in the list, arbitrarily
106
+ # we can detect this terminating case by a membership check
107
+ if pair not in self.merges:
108
+ break # nothing else can be merged anymore
109
+ # otherwise let's merge the best pair (lowest merge index)
110
+ idx = self.merges[pair]
111
+ ids = merge(ids, pair, idx)
112
+ return ids
113
+
114
+ def encode_ordinary(self, text):
115
+ """Encoding that ignores any special tokens."""
116
+ # split text into chunks of text by categories defined in regex pattern
117
+ text_chunks = [text,] #re.findall(self.compiled_pattern, text)
118
+ # all chunks of text are encoded separately, then results are joined
119
+ ids = []
120
+ for chunk in text_chunks:
121
+ # chunk_bytes = chunk.encode("utf-8") # raw bytes
122
+ chunk_ids = [int(idx) for idx in chunk.split(' ')]
123
+ chunk_ids = self._encode_chunk(chunk_ids)
124
+ ids.extend(chunk_ids)
125
+ return ids
126
+
127
+ def encode(self, text, allowed_special="none_raise"):
128
+ """
129
+ Unlike encode_ordinary, this function handles special tokens.
130
+ allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
131
+ if none_raise, then an error is raised if any special token is encountered in text
132
+ this is the default tiktoken behavior right now as well
133
+ any other behavior is either annoying, or a major footgun
134
+ """
135
+ # decode the user desire w.r.t. handling of special tokens
136
+ special = None
137
+ if allowed_special == "all":
138
+ special = self.special_tokens
139
+ elif allowed_special == "none":
140
+ special = {}
141
+ elif allowed_special == "none_raise":
142
+ special = {}
143
+ assert all(token not in text for token in self.special_tokens)
144
+ elif isinstance(allowed_special, set):
145
+ special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
146
+ else:
147
+ raise ValueError(f"allowed_special={allowed_special} not understood")
148
+ if not special:
149
+ # shortcut: if no special tokens, just use the ordinary encoding
150
+ return self.encode_ordinary(text)
151
+ # otherwise, we have to be careful with potential special tokens in text
152
+ # we handle special tokens by splitting the text
153
+ # based on the occurrence of any exact match with any of the special tokens
154
+ # we can use re.split for this. note that surrounding the pattern with ()
155
+ # makes it into a capturing group, so the special tokens will be included
156
+ special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
157
+ special_chunks = re.split(special_pattern, text)
158
+ # now all the special characters are separated from the rest of the text
159
+ # all chunks of text are encoded separately, then results are joined
160
+ ids = []
161
+ for part in special_chunks:
162
+ part = part.strip()
163
+ if len(part) == 0: continue
164
+ if part in special:
165
+ # this is a special token, encode it separately as a special case
166
+ ids.append(special[part])
167
+ else:
168
+ # this is an ordinary sequence, encode it normally
169
+ ids.extend(self.encode_ordinary(part))
170
+ return ids
171
+
172
+
173
+ def load(self, model_file):
174
+ """Inverse of save() but only for the model file"""
175
+ model_file = str(model_file)
176
+ assert model_file.endswith(".model")
177
+ # read the model file
178
+ merges = {}
179
+ special_tokens = {}
180
+ idx = self.codebook_size
181
+ with open(model_file, 'r', encoding="utf-8") as f:
182
+ # read the version
183
+ version = f.readline().strip()
184
+ assert version == "minbpe v1"
185
+ # read the pattern
186
+ self.pattern = f.readline().strip()
187
+ # read the special tokens
188
+ num_special = int(f.readline().strip())
189
+ for _ in range(num_special):
190
+ special, special_idx = f.readline().strip().split()
191
+ special_tokens[special] = int(special_idx)
192
+ # read the merges
193
+ for line in f:
194
+ # print(line)
195
+ idx1, idx2 = map(int, line.split())
196
+ merges[(idx1, idx2)] = idx
197
+ idx += 1
198
+ self.merges = merges
199
+ self.special_tokens = special_tokens
200
+ self.vocab = self._build_vocab()
201
+
202
+
203
+ def _build_vocab(self):
204
+ # vocab is simply and deterministically derived from merges
205
+ vocab = {idx: f" {idx:04d}".encode('utf-8') for idx in range(self.codebook_size)}
206
+ for (p0, p1), idx in self.merges.items():
207
+ vocab[idx] = vocab[p0] + vocab[p1]
208
+ for special, idx in self.special_tokens.items():
209
+ vocab[idx] = special.encode("utf-8")
210
+ return vocab
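
The tokenizer above calls `get_stats` and `merge` helpers imported from the package's `base` module, which is not part of this diff. As a hedged reference only, a minimal sketch consistent with how the two helpers are used above (not the committed base.py implementation) could look like:

# Sketch of the assumed helpers, for reference; not the committed base.py code.
def get_stats(ids, counts=None):
    # Count consecutive pairs in `ids`, optionally accumulating into an existing dict in place.
    counts = {} if counts is None else counts
    for pair in zip(ids, ids[1:]):
        counts[pair] = counts.get(pair, 0) + 1
    return counts

def merge(ids, pair, idx):
    # Replace every occurrence of `pair` in `ids` with the new token id `idx`.
    new_ids, i = [], 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            new_ids.append(idx)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1
    return new_ids
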
mars5/minbpe/regex.py ADDED
@@ -0,0 +1,164 @@
1
+ """
2
+ Minimal (byte-level) Byte Pair Encoding tokenizer.
3
+
4
+ Algorithmically follows along the GPT tokenizer:
5
+ https://github.com/openai/gpt-2/blob/master/src/encoder.py
6
+
7
+ Unlike BasicTokenizer:
8
+ - RegexTokenizer handles an optional regex splitting pattern.
9
+ - RegexTokenizer handles optional special tokens.
10
+ """
11
+
12
+ import regex as re
13
+ from .base import Tokenizer, get_stats, merge
14
+
15
+
16
+ # the main GPT text split patterns, see
17
+ # https://github.com/openai/tiktoken/blob/main/tiktoken_ext/openai_public.py
18
+ GPT2_SPLIT_PATTERN = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
19
+ GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
20
+
21
+
22
+ class RegexTokenizer(Tokenizer):
23
+
24
+ def __init__(self, pattern=None):
25
+ """
26
+ - pattern: optional string to override the default (GPT-4 split pattern)
27
+ - special_tokens: str -> int dictionary of special tokens
28
+ example: {'<|endoftext|>': 100257}
29
+ """
30
+ super().__init__()
31
+ self.pattern = GPT4_SPLIT_PATTERN if pattern is None else pattern
32
+ self.compiled_pattern = re.compile(self.pattern)
33
+ self.special_tokens = {}
34
+ self.inverse_special_tokens = {}
35
+
36
+ def train(self, text, vocab_size, verbose=False):
37
+ assert vocab_size >= 256
38
+ num_merges = vocab_size - 256
39
+
40
+ # split the text up into text chunks
41
+ text_chunks = re.findall(self.compiled_pattern, text)
42
+
43
+ # input text preprocessing
44
+ ids = [list(ch.encode("utf-8")) for ch in text_chunks]
45
+
46
+ # iteratively merge the most common pairs to create new tokens
47
+ merges = {} # (int, int) -> int
48
+ vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
49
+ for i in range(num_merges):
50
+ # count the number of times every consecutive pair appears
51
+ stats = {}
52
+ for chunk_ids in ids:
53
+ # passing in stats will update it in place, adding up counts
54
+ get_stats(chunk_ids, stats)
55
+ # find the pair with the highest count
56
+ pair = max(stats, key=stats.get)
57
+ # mint a new token: assign it the next available id
58
+ idx = 256 + i
59
+ # replace all occurrences of pair in ids with idx
60
+ ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
61
+ # save the merge
62
+ merges[pair] = idx
63
+ vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
64
+ # prints
65
+ if verbose:
66
+ print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
67
+
68
+ # save class variables
69
+ self.merges = merges # used in encode()
70
+ self.vocab = vocab # used in decode()
71
+
72
+ def register_special_tokens(self, special_tokens):
73
+ # special_tokens is a dictionary of str -> int
74
+ # example: {"<|endoftext|>": 100257}
75
+ self.special_tokens = special_tokens
76
+ self.inverse_special_tokens = {v: k for k, v in special_tokens.items()}
77
+
78
+ def decode(self, ids):
79
+ # given ids (list of integers), return Python string
80
+ part_bytes = []
81
+ for idx in ids:
82
+ if idx in self.vocab:
83
+ part_bytes.append(self.vocab[idx])
84
+ elif idx in self.inverse_special_tokens:
85
+ part_bytes.append(self.inverse_special_tokens[idx].encode("utf-8"))
86
+ else:
87
+ raise ValueError(f"invalid token id: {idx}")
88
+ text_bytes = b"".join(part_bytes)
89
+ text = text_bytes.decode("utf-8", errors="replace")
90
+ return text
91
+
92
+ def _encode_chunk(self, text_bytes):
93
+ # return the token ids
94
+ # let's begin. first, convert all bytes to integers in range 0..255
95
+ ids = list(text_bytes)
96
+ while len(ids) >= 2:
97
+ # find the pair with the lowest merge index
98
+ stats = get_stats(ids)
99
+ pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
100
+ # subtle: if there are no more merges available, the key will
101
+ # result in an inf for every single pair, and the min will be
102
+ # just the first pair in the list, arbitrarily
103
+ # we can detect this terminating case by a membership check
104
+ if pair not in self.merges:
105
+ break # nothing else can be merged anymore
106
+ # otherwise let's merge the best pair (lowest merge index)
107
+ idx = self.merges[pair]
108
+ ids = merge(ids, pair, idx)
109
+ return ids
110
+
111
+ def encode_ordinary(self, text):
112
+ """Encoding that ignores any special tokens."""
113
+ # split text into chunks of text by categories defined in regex pattern
114
+ text_chunks = re.findall(self.compiled_pattern, text)
115
+ # all chunks of text are encoded separately, then results are joined
116
+ ids = []
117
+ for chunk in text_chunks:
118
+ chunk_bytes = chunk.encode("utf-8") # raw bytes
119
+ chunk_ids = self._encode_chunk(chunk_bytes)
120
+ ids.extend(chunk_ids)
121
+ return ids
122
+
123
+ def encode(self, text, allowed_special="none_raise"):
124
+ """
125
+ Unlike encode_ordinary, this function handles special tokens.
126
+ allowed_special: can be "all"|"none"|"none_raise" or a custom set of special tokens
127
+ if none_raise, then an error is raised if any special token is encountered in text
128
+ this is the default tiktoken behavior right now as well
129
+ any other behavior is either annoying, or a major footgun
130
+ """
131
+ # decode the user desire w.r.t. handling of special tokens
132
+ special = None
133
+ if allowed_special == "all":
134
+ special = self.special_tokens
135
+ elif allowed_special == "none":
136
+ special = {}
137
+ elif allowed_special == "none_raise":
138
+ special = {}
139
+ assert all(token not in text for token in self.special_tokens)
140
+ elif isinstance(allowed_special, set):
141
+ special = {k: v for k, v in self.special_tokens.items() if k in allowed_special}
142
+ else:
143
+ raise ValueError(f"allowed_special={allowed_special} not understood")
144
+ if not special:
145
+ # shortcut: if no special tokens, just use the ordinary encoding
146
+ return self.encode_ordinary(text)
147
+ # otherwise, we have to be careful with potential special tokens in text
148
+ # we handle special tokens by splitting the text
149
+ # based on the occurrence of any exact match with any of the special tokens
150
+ # we can use re.split for this. note that surrounding the pattern with ()
151
+ # makes it into a capturing group, so the special tokens will be included
152
+ special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"
153
+ special_chunks = re.split(special_pattern, text)
154
+ # now all the special characters are separated from the rest of the text
155
+ # all chunks of text are encoded separately, then results are joined
156
+ ids = []
157
+ for part in special_chunks:
158
+ if part in special:
159
+ # this is a special token, encode it separately as a special case
160
+ ids.append(special[part])
161
+ else:
162
+ # this is an ordinary sequence, encode it normally
163
+ ids.extend(self.encode_ordinary(part))
164
+ return ids
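
A hedged usage sketch of the RegexTokenizer defined above, assuming the class has been imported; the training text, vocabulary size, and special-token id are placeholders, not values used by MARS5:

# Illustrative only: train a tiny vocabulary, register a special token, then round-trip a string.
tok = RegexTokenizer()
tok.train("hello hello world world world", vocab_size=256 + 8)
tok.register_special_tokens({"<|endoftext|>": 256 + 8})
ids = tok.encode("hello world<|endoftext|>", allowed_special="all")
assert tok.decode(ids) == "hello world<|endoftext|>"
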
mars5/model.py ADDED
@@ -0,0 +1,344 @@
1
+ import math
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+
9
+ from .nn_future import (FNNSwiGLU, MistralTransformer, ModelArgs,
10
+ RotatingBufferCache, SinePositionalEmbedding)
11
+ from .utils import construct_padding_mask, length_to_mask
12
+
13
+ LAYERNORM_EPS = 4e-5
14
+
15
+ # ------------------------
16
+ # Code adapted from OpenAI guided diffusion repo
17
+
18
+ def timestep_embedding(timesteps, dim, max_period=10000, dtype=torch.float32):
19
+ """
20
+ Create sinusoidal timestep embeddings.
21
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
22
+ These may be fractional.
23
+ :param dim: the dimension of the output.
24
+ :param max_period: controls the minimum frequency of the embeddings.
25
+ :return: an [N x dim] Tensor of positional embeddings.
26
+ """
27
+ half = dim // 2
28
+ freqs = torch.exp(
29
+ -math.log(max_period) * torch.arange(start=0, end=half) / half
30
+ ).to(device=timesteps.device)
31
+ args = timesteps[:, None].float() * freqs[None]
32
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1).to(dtype)
33
+ if dim % 2:
34
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
35
+ return embedding
36
+
37
+
38
+ # --------------------------------
39
+ # autoregressive codec language model
40
+
41
+
42
+ class CodecLM(nn.Module):
43
+
44
+ def __init__(self, n_vocab, dim=1536, nhead=24, n_layers=26, n_spk_layers=2, dim_ff_scale=None, sliding_window=3000) -> None:
45
+ super().__init__()
46
+
47
+ if dim_ff_scale is None: hidden_dim = int(dim*4*(3/4))
48
+ else: hidden_dim = int(dim*dim_ff_scale)
49
+
50
+ self.cfg = ModelArgs(n_vocab, dim=dim, n_layers=n_layers, n_heads=nhead, n_kv_heads=nhead, hidden_dim=hidden_dim, sliding_window=sliding_window)
51
+ self.ar = MistralTransformer(self.cfg)
52
+
53
+ self.embed = nn.Embedding(n_vocab, dim)
54
+
55
+ # --- spk embedding network
56
+ dim_ff = int(dim*4*(3/4))
57
+ self.pos_embedding = SinePositionalEmbedding(dim, scale=False, alpha=True)
58
+ self.ref_chunked_emb = ChunkedEmbedding(1024 + 1, 8, dim) # add 1 for pad idx
59
+ self.spk_identity_emb = nn.Embedding(1, dim)
60
+ # define custom decoder
61
+ encoder_layer = nn.TransformerEncoderLayer(dim, nhead, dim_ff,
62
+ activation=FNNSwiGLU(dim, dim_ff), dropout=0,
63
+ batch_first=True, norm_first=True, layer_norm_eps=LAYERNORM_EPS)
64
+ encoder_layer.linear1 = nn.Identity()
65
+ self.spk_encoder = nn.TransformerEncoder(encoder_layer, n_spk_layers, norm=nn.LayerNorm(dim, eps=LAYERNORM_EPS))
66
+ # monkeypatch for broken copy.deepcopy of nn.Modules in nn.TransformerDecoder
67
+ for l in self.spk_encoder.layers: l.activation = FNNSwiGLU(dim, dim_ff)
68
+
69
+
70
+ @torch.inference_mode
71
+ def get_spk_embedding(self, spk_reference, c_codes_lengths=None) -> Tensor:
72
+ """ Gets speaker reference embeddings using `spk_reference` codes of shape (bs, seq_len, n_codebooks). """
73
+ bs = spk_reference.shape[0]
74
+ if bs != 1:
75
+ raise AssertionError(f"Speaker embedding extraction only implemented using for bs=1 currently.")
76
+ spk_seq = self.ref_chunked_emb(spk_reference) # (bs, sl, dim)
77
+ spk_ref_emb = self.spk_identity_emb.weight[None].expand(bs, -1, -1) # (bs, 1, dim)
78
+
79
+ spk_seq = torch.cat([spk_ref_emb, spk_seq], dim=1) # (bs, 1+sl, dim)
80
+ # add pos encoding
81
+ spk_seq = self.pos_embedding(spk_seq)
82
+ # codebook goes from indices 0->1023, padding is idx 1024 (the 1025th entry)
83
+ src_key_padding_mask = construct_padding_mask(spk_reference[:, :, 0], 1024)
84
+ src_key_padding_mask = torch.cat((
85
+ # append a zero here since we DO want to attend to initial position.
86
+ torch.zeros(src_key_padding_mask.shape[0], 1, dtype=bool, device=src_key_padding_mask.device),
87
+ src_key_padding_mask
88
+ ),
89
+ dim=1)
90
+ # pass through transformer
91
+ res = self.spk_encoder(spk_seq, is_causal=False, src_key_padding_mask=src_key_padding_mask)[:, :1] # select first element -> now (bs, 1, dim).
92
+ return res.squeeze(1)
93
+
94
+
95
+ def forward(self, x: Tensor, x_padding_mask: Optional[Tensor] = None, spk_reference: Optional[Tensor] = None,
96
+ cache: Optional[RotatingBufferCache] = None, counter: int = 0) -> Tensor:
97
+ """ Inputs:
98
+ - `x`: (bs, seq_len, vocab_size)
99
+ - `x_padding_mask`: (bs, seq_len) mask for each input, True for positions to *ignore*, False otherwise.
100
+ Note that since this is an autoregressive model, this doesn't actually matter for inference, so it is ignored at inference.
101
+ - `spk_reference`: (bs, seq_len, n_codebooks) corresponding to the speaker reference to clone from.
102
+ - `cache` and `counter`: used for kv caching, optional.
103
+
104
+ Returns `x` of same shape (bs, seq_len, dim)
105
+ """
106
+ x = self.embed(x)
107
+
108
+ # --- speaker reference/embedding
109
+ if spk_reference is not None:
110
+ # compute ref
111
+ bs = spk_reference.shape[0]
112
+ spk_seq = self.ref_chunked_emb(spk_reference) # (bs, sl, dim)
113
+ spk_ref_emb = self.spk_identity_emb.weight[None].expand(bs, -1, -1) # (bs, 1, dim)
114
+
115
+ spk_seq = torch.cat([spk_ref_emb, spk_seq], dim=1) # (bs, 1+sl, dim)
116
+ # add pos encoding
117
+ spk_seq = self.pos_embedding(spk_seq)
118
+ # codebook goes from indices 0->1023, padding is idx 1024 (the 1025th entry)
119
+ src_key_padding_mask = construct_padding_mask(spk_reference[:, :, 0], 1024)
120
+ src_key_padding_mask = torch.cat((
121
+ # append a zero here since we DO want to attend to initial position.
122
+ torch.zeros(src_key_padding_mask.shape[0], 1, dtype=bool, device=src_key_padding_mask.device),
123
+ src_key_padding_mask
124
+ ),
125
+ dim=1)
126
+ # pass through transformer
127
+ res = self.spk_encoder(spk_seq, is_causal=False, src_key_padding_mask=src_key_padding_mask)[:, :1] # select first element -> now (bs, 1, dim).
128
+
129
+ x = torch.cat([res, x], dim=1)
130
+
131
+ positions = torch.arange(0, x.shape[1], device=x.device, dtype=torch.long)
132
+ if cache is not None and counter != 1:
133
+ # using only the last token to predict the next one
134
+ x = x[:,-1,:].unsqueeze(1)
135
+ positions = positions[-1:]
136
+
137
+ x = self.ar(x, positions, cache) # (bs, seq_len, vocab)
138
+ if spk_reference is not None and (cache is None or counter == 1):
139
+ x = x[:, 1:] # strip out the first output token corresponding to the speaker embedding token.
140
+
141
+ return x
142
+
143
+
144
+ # -------------------------
145
+ # residual discrete diffusion model
146
+
147
+ class ChunkedEmbedding(nn.Module):
148
+
149
+ def __init__(self, codebook_size: int, n_quantizer: int, dim: int) -> None:
150
+ super().__init__()
151
+ assert dim % n_quantizer == 0, f"ChunkedEmbedding output dim ({dim}) must be divisible by n_quant {n_quantizer}"
152
+ self.embs = nn.ModuleList([nn.Embedding(codebook_size, dim//n_quantizer) for _ in range(n_quantizer)])
153
+
154
+ def forward(self, x: Tensor) -> Tensor:
155
+ """ Embeds each codebook index in `x` (bs, seq_len, n_quantizer) to an embedding vector, concatenating results.
156
+ Returns output of shape (bs, seq_len, dim)
157
+ """
158
+ y = torch.cat([self.embs[i](x[..., i]) for i in range(x.shape[-1])], dim=-1)
159
+ return y
160
+
161
+
162
+
163
+ class ResidualTransformer(nn.Module):
164
+
165
+ def __init__(self, n_text_vocab, n_quant=1024, dim=1024, nhead=16,
166
+ enc_layers=8, dec_layers=16, n_spk_layers=3,
167
+ c_quant_levels=8, pred_quant_levels=8,
168
+ t_emb_dim=1024, norm_first=True, p_cond_drop=0.1, dropout=0) -> None:
169
+ super().__init__()
170
+
171
+ self.cond_pos_embedding = SinePositionalEmbedding(dim, scale=False, alpha=True)
172
+ self.pos_embedding = SinePositionalEmbedding(dim, scale=False, alpha=True)
173
+
174
+ # *4 from heuristic, *2/3 from swiglu, since there are 3 linear matrices not 2.
175
+ # so we must keep # params the same.
176
+ dim_ff = int(dim*4*(3/4))
177
+
178
+ # define custom encoder
179
+ encoder_layer = nn.TransformerEncoderLayer(dim, nhead, dim_ff,
180
+ activation=FNNSwiGLU(dim, dim_ff), dropout=dropout,
181
+ batch_first=True, norm_first=norm_first, layer_norm_eps=LAYERNORM_EPS)
182
+ encoder_layer.linear1 = nn.Identity()
183
+ encoder = nn.TransformerEncoder(encoder_layer, enc_layers, norm=nn.LayerNorm(dim, eps=LAYERNORM_EPS) if norm_first else None)
184
+
185
+ # define custom decoder
186
+ decoder_layer = nn.TransformerDecoderLayer(dim, nhead, dim_ff,
187
+ activation=FNNSwiGLU(dim, dim_ff), dropout=dropout,
188
+ batch_first=True, norm_first=norm_first, layer_norm_eps=LAYERNORM_EPS)
189
+ decoder_layer.linear1 = nn.Identity()
190
+ decoder = nn.TransformerDecoder(decoder_layer, dec_layers, norm=nn.LayerNorm(dim, eps=LAYERNORM_EPS) if norm_first else None)
191
+
192
+ # monkeypatch for broken copy.deepcopy of nn.Modules in nn.TransformerDecoder
193
+ for l in decoder.layers: l.activation = FNNSwiGLU(dim, dim_ff)
194
+
195
+ self.tfm = nn.Transformer(dim, nhead, dim_feedforward=dim_ff, batch_first=True,
196
+ norm_first=norm_first,
197
+ num_encoder_layers=enc_layers,
198
+ num_decoder_layers=dec_layers,
199
+ custom_encoder=encoder,
200
+ custom_decoder=decoder,
201
+ layer_norm_eps=LAYERNORM_EPS,
202
+ dropout=dropout
203
+ )
204
+ # Timestep embedding network
205
+ self.t_emb_dim = t_emb_dim
206
+ self.timestep_encoder_emb = nn.Sequential(
207
+ nn.Linear(t_emb_dim, dim),
208
+ nn.SiLU(),
209
+ nn.Linear(dim, dim)
210
+ )
211
+ self.timestep_decoder_emb = nn.Sequential(
212
+ nn.Linear(t_emb_dim, dim),
213
+ nn.SiLU(),
214
+ nn.Linear(dim, dim)
215
+ )
216
+
217
+ self.text_embed = nn.Embedding(n_text_vocab, dim)
218
+
219
+ ## ----> reference / conditioning encoder:
220
+ self.ref_embedder = ChunkedEmbedding(n_quant, c_quant_levels, dim)
221
+ self.ref_pos_embedding = SinePositionalEmbedding(dim, scale=False, alpha=True)
222
+ self.spk_identity_emb = nn.Embedding(1, dim)
223
+ spk_encoder_layer = nn.TransformerEncoderLayer(dim, nhead, dim_ff,
224
+ activation=FNNSwiGLU(dim, dim_ff), dropout=dropout,
225
+ batch_first=True, norm_first=True, layer_norm_eps=LAYERNORM_EPS)
226
+ spk_encoder_layer.linear1 = nn.Identity()
227
+ self.spk_encoder = nn.TransformerEncoder(spk_encoder_layer, n_spk_layers, norm=nn.LayerNorm(dim, eps=LAYERNORM_EPS))
228
+ # monkeypatch for broken copy.deepcopy of nn.Modules in nn.TransformerDecoder
229
+ for l in self.spk_encoder.layers: l.activation = FNNSwiGLU(dim, dim_ff)
230
+ # ----> end speaker encoder network
231
+
232
+ # self.residual_encoder = nn.Embedding(n_quant, dim) # only encode first quantization level of decoder input.
233
+ self.residual_encoder = ChunkedEmbedding(n_quant, c_quant_levels, dim)
234
+
235
+ self.residual_decoder = nn.ModuleList([
236
+ nn.Sequential(
237
+ nn.LayerNorm(dim),
238
+ nn.Linear(dim, n_quant)
239
+ ) for i in range(pred_quant_levels)
240
+ ])
241
+ self.n_quantizer = pred_quant_levels
242
+ self.p_cond_drop = p_cond_drop
243
+
244
+
245
+ @torch.inference_mode
246
+ def get_spk_embedding(self, c_codes, c_codes_length) -> Tensor:
247
+ """ Obtain speaker embedding vectors using `c_codes` from reference encodec sequences, and `c_codes_length` of lengths for each sequence """
248
+ bs = c_codes.shape[0]
249
+ spk_seq = self.ref_embedder(c_codes) # (bs, sl, dim)
250
+ spk_ref_emb = self.spk_identity_emb.weight[None].expand(bs, -1, -1) # (bs, 1, dim)
251
+ spk_seq = torch.cat([spk_ref_emb, spk_seq], dim=1) # (bs, 1+sl, dim)
252
+ # add pos encoding
253
+ spk_seq = self.ref_pos_embedding(spk_seq)
254
+
255
+ # add 1 to c_codes_length to account for the fact that we concatenate the spk_ref_emb to it.
256
+ src_key_padding_mask = length_to_mask(c_codes_length+1, torch.zeros_like(c_codes_length), max_len=spk_seq.shape[1])
257
+ src_key_padding_mask = src_key_padding_mask.to(dtype=torch.bool, device=spk_seq.device)
258
+
259
+ # pass through transformer
260
+ res = self.spk_encoder(spk_seq, is_causal=False, src_key_padding_mask=src_key_padding_mask)[:, :1] # select first element -> now (bs, 1, dim).
261
+ return res.squeeze(1)
262
+
263
+
264
+ def forward(self, c_text: Tensor, c_codes: Tensor, c_texts_length: Tensor, c_codes_length: Tensor,
265
+ x: Tensor, x_padding_mask: Tensor, t: Tensor, drop_cond=False):
266
+ """ Input:
267
+ - `c_text`: (bs, seq_len1) the prompt text (BPE encoded)
268
+ - `c_codes`: (bs, seq_len2, n_quant) the full tokenized codes of the reference speech
269
+ - `c_texts_length`: (bs, ) the length of the codes in the text prompt
270
+ - `c_codes_length`: (bs, ) the length of the prompt acoustic token codes in `c_codes`.
271
+ - `x`: (bs, seq_len3, n_quant) L0 residual codes
273
+ - `x_padding_mask`: (bs, seq_len3) masking for residual codes
274
+ - `t`: (bs) timestep
275
+ - `drop_cond`: bool, whether or not to forcibly drop the conditioning information.
276
+ Returns:
277
+ - outs: (bs, seq_len, n_quantizer, codebook_size)
278
+ """
279
+
280
+ c_text = self.text_embed(c_text) # (bs, seq_len1, dim)
281
+
282
+ ## ----> reference / conditioning encoder:
283
+ bs = c_codes.shape[0]
284
+
285
+
286
+ if self.training:
287
+ zero_cond_inds = torch.rand_like(t, dtype=c_text.dtype) < self.p_cond_drop
288
+ else:
289
+ # never randomly zero when in eval mode
290
+ zero_cond_inds = torch.zeros_like(t, dtype=torch.bool)
291
+ if drop_cond:
292
+ # force drop conditioning
293
+ zero_cond_inds = torch.ones_like(t, dtype=torch.bool)
294
+
295
+ c_codes_length[zero_cond_inds] = 0
296
+ c_codes[zero_cond_inds] = 1024
297
+
298
+ spk_seq = self.ref_embedder(c_codes) # (bs, sl, dim)
299
+ spk_ref_emb = self.spk_identity_emb.weight[None].expand(bs, -1, -1) # (bs, 1, dim)
300
+ spk_seq = torch.cat([spk_ref_emb, spk_seq], dim=1) # (bs, 1+sl, dim)
301
+ # add pos encoding
302
+ spk_seq = self.ref_pos_embedding(spk_seq)
303
+
304
+ # add 1 to c_codes_length to account for the fact that we concatenate the spk_ref_emb to it.
305
+ src_key_padding_mask = length_to_mask(c_codes_length+1, torch.zeros_like(c_codes_length), max_len=spk_seq.shape[1])
306
+ src_key_padding_mask = src_key_padding_mask.to(dtype=torch.bool, device=spk_seq.device)
307
+
308
+ # pass through transformer
309
+ res = self.spk_encoder(spk_seq, is_causal=False, src_key_padding_mask=src_key_padding_mask)[:, :1] # select first element -> now (bs, 1, dim).
310
+ c_codes = res # (bs, 1, dim)
311
+ c_codes_lengths_extract = torch.ones_like(c_codes_length) # manually override all the code lengths to equal 1, since we only have 1 spk embedding.
312
+ ## ----> end reference / conditioning encoder:
313
+
314
+ ## ----> timestep embeddings and parsing
315
+ t_emb = timestep_embedding(t, self.t_emb_dim, dtype=c_text.dtype)
316
+ t_emb_encoder = self.timestep_encoder_emb(t_emb) # (bs, t_dim)
317
+ t_emb_decoder = self.timestep_decoder_emb(t_emb)
318
+
319
+ ## ----> concatenating text/phone inputs and implicit speaker embedding.
320
+ c_phones_unpacked = nn.utils.rnn.unpad_sequence(c_text, c_texts_length.cpu(), batch_first=True)
321
+ c_codes_unpacked = nn.utils.rnn.unpad_sequence(c_codes, c_codes_lengths_extract.cpu(), batch_first=True)
322
+ # >>> Concat [speaker codes, text codes]
323
+ assert all(b.shape[0] == 1 for b in c_codes_unpacked)
324
+ c_joined = [torch.cat((b, a), dim=0) for a, b in zip(c_phones_unpacked, c_codes_unpacked)]
325
+
326
+ c = nn.utils.rnn.pad_sequence(c_joined, batch_first=True)
327
+ c_joined_lengths = torch.tensor([p.shape[0] for p in c_joined], device=c.device, dtype=torch.long)
328
+ c_padding_mask = length_to_mask(c_joined_lengths, torch.zeros_like(c_joined_lengths))
329
+ c = self.cond_pos_embedding(c)
330
+
331
+ ## Format input:
332
+ x = self.residual_encoder(x) # (bs, seq_len3, dim)
333
+
334
+ x = self.pos_embedding(x)
335
+
336
+ x = x + t_emb_decoder[:, None]
337
+ c = c + t_emb_encoder[:, None]
338
+ ## Perform prediction:
339
+ output = self.tfm(c, x, src_key_padding_mask=c_padding_mask,
340
+ tgt_key_padding_mask=x_padding_mask,
341
+ memory_key_padding_mask=c_padding_mask) # (bs, seq_len, dim)
342
+ outs = torch.stack([self.residual_decoder[i](output) for i in range(self.n_quantizer)], dim=-1) # (bs, seq_len, logit_dim, n_quant)
343
+ return outs
344
+
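
As a quick, hedged illustration of the sinusoidal `timestep_embedding` helper defined at the top of this file (it assumes this file's imports; the batch of timesteps below is arbitrary):

# Illustrative only: embed a batch of 4 diffusion timesteps into 1024-dim vectors.
t = torch.tensor([0, 10, 250, 999])
emb = timestep_embedding(t, dim=1024)
print(emb.shape)  # torch.Size([4, 1024])
# Each row is [cos(t*f_0), ..., cos(t*f_511), sin(t*f_0), ..., sin(t*f_511)]
# with f_k = exp(-log(10000) * k / 512), so the frequencies span 1 down to roughly 1/10000.
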
mars5/nn_future.py ADDED
@@ -0,0 +1,400 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from torch import Tensor
5
+ import math
6
+ from dataclasses import dataclass
7
+ from typing import Optional
8
+
9
+
10
+ # --------------------------
11
+ # activation functions
12
+
13
+ class FNNSwiGLU(nn.Module):
14
+
15
+ def __init__(self, dim, dim_ff) -> None:
16
+ super().__init__()
17
+
18
+ # we will receive in xW
19
+ self.V = nn.Linear(dim, dim_ff, bias=False)
20
+ self.W = nn.Linear(dim, dim_ff, bias=False)
21
+
22
+
23
+ def forward(self, x: Tensor) -> Tensor:
24
+ """ Compute SwiGLU output of x, the output of the first linear layer. i.e.
25
+ FFNSwiGLU(x, W, V, W2) = (Swish1(xW) ⊗ xV )W2.
26
+ NOTE: the transformer linear1 layer must be overwritten to identity. This layer only applies
27
+ the Swish(xW) * xV. The W2 multiplication is done in the main transformer layer
28
+ """
29
+ return F.silu(self.W(x)) * self.V(x)
30
+
31
+
32
+ # ---------------------------------
33
+ # padding and position layers
34
+
35
+ class SinePositionalEmbedding(nn.Module):
36
+ def __init__(
37
+ self,
38
+ dim_model: int,
39
+ dropout: float = 0.0,
40
+ scale: bool = False,
41
+ alpha: bool = False,
42
+ ):
43
+ super().__init__()
44
+ self.dim_model = dim_model
45
+ self.x_scale = math.sqrt(dim_model) if scale else 1.0
46
+ self.alpha = nn.Parameter(torch.ones(1), requires_grad=alpha)
47
+ self.dropout = torch.nn.Dropout(p=dropout)
48
+
49
+ self.reverse = False
50
+ self.pe = None
51
+ self.extend_pe(torch.tensor(0.0).expand(1, 4000))
52
+
53
+ def extend_pe(self, x):
54
+ """Reset the positional encodings."""
55
+ if self.pe is not None:
56
+ if self.pe.size(1) >= x.size(1):
57
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
58
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
59
+ return
60
+ pe = torch.zeros(x.size(1), self.dim_model)
61
+ if self.reverse:
62
+ position = torch.arange(
63
+ x.size(1) - 1, -1, -1.0, dtype=torch.float32
64
+ ).unsqueeze(1)
65
+ else:
66
+ position = torch.arange(
67
+ 0, x.size(1), dtype=torch.float32
68
+ ).unsqueeze(1)
69
+ div_term = torch.exp(
70
+ torch.arange(0, self.dim_model, 2, dtype=torch.float32)
71
+ * -(math.log(10000.0) / self.dim_model)
72
+ )
73
+ pe[:, 0::2] = torch.sin(position * div_term)
74
+ pe[:, 1::2] = torch.cos(position * div_term)
75
+ pe = pe.unsqueeze(0)
76
+ self.pe = pe.to(device=x.device, dtype=x.dtype).detach()
77
+
78
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
79
+ """ Assumes x of shape (bs, seq_len, dim) """
80
+ self.extend_pe(x)
81
+ output = x.unsqueeze(-1) if x.ndim == 2 else x
82
+ output = output * self.x_scale + self.alpha * self.pe[:, : x.size(1)]
83
+ return self.dropout(output)
84
+
85
+
86
+ # --------------------------------
87
+ # kv cache blocks
88
+
89
+ class CacheView:
90
+ def __init__(self, cache_k: torch.Tensor, cache_v: torch.Tensor):
91
+ self.cache_k = cache_k
92
+ self.cache_v = cache_v
93
+
94
+ @property
95
+ def sliding_window(self):
96
+ return self.cache_k.shape[1]
97
+
98
+ class RotatingBufferCache:
99
+ """
100
+ This is an example that implements a less naive rotating buffer cache, allowing for variable length sequences.
101
+ Allocated cache is rectangular which is wasteful (see PagedAttention for better mechanisms)
102
+ """
103
+ def __init__(self, n_layers: int, max_batch_size: int, sliding_window: int, n_kv_heads: int, head_dim: int):
104
+
105
+ self.sliding_window = sliding_window
106
+ self.n_kv_heads = n_kv_heads
107
+ self.head_dim = head_dim
108
+
109
+ self.cache_k = torch.empty((
110
+ n_layers,
111
+ max_batch_size,
112
+ sliding_window,
113
+ n_kv_heads,
114
+ head_dim
115
+ ))
116
+ self.cache_v = torch.empty((
117
+ n_layers,
118
+ max_batch_size,
119
+ sliding_window,
120
+ n_kv_heads,
121
+ head_dim
122
+ ))
123
+
124
+ def get_view(self, layer_id: int) -> CacheView:
125
+ return CacheView(self.cache_k[layer_id], self.cache_v[layer_id])
126
+
127
+ @property
128
+ def device(self):
129
+ return self.cache_k.device
130
+
131
+ def to(self, device: torch.device, dtype: torch.dtype):
132
+ self.cache_k = self.cache_k.to(device=device, dtype=dtype)
133
+ self.cache_v = self.cache_v.to(device=device, dtype=dtype)
134
+ return self
135
+
136
+
137
+ # --------------------------------
138
+ # Mistral transformer blocks
139
+ # Code for the following blocks is adapted from
140
+ # https://github.com/mistralai/mistral-src
141
+ # Thank you Mistral team!
142
+
143
+ @dataclass
144
+ class ModelArgs:
145
+ vocab_size: int
146
+
147
+ dim: int = 1152 # default for mars3 and before: 1024
148
+ n_layers: int = 24
149
+ head_dim: int = 64 # = dim/n_heads
150
+ hidden_dim: int = 3584
151
+ n_heads: int = 16
152
+ n_kv_heads: int = 16 # default: 8
153
+ sliding_window: int = 1792
154
+ norm_eps: float = 1e-5
155
+
156
+ max_batch_size: int = 256
157
+
158
+
159
+ def repeat_kv(keys: torch.Tensor, values: torch.Tensor, repeats: int):
160
+ if repeats == 1: return keys, values
161
+ keys = torch.repeat_interleave(keys, repeats=repeats, dim=2)
162
+ values = torch.repeat_interleave(values, repeats=repeats, dim=2)
163
+ return keys, values
164
+
165
+
166
+ def _reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
167
+ """
168
+ freqs_cis: complex - (seq_len, head_dim / 2)
169
+ x: complex - (bsz, seq_len, head_dim / 2)
170
+ """
171
+ ndim = x.ndim
172
+ assert 1 < ndim
173
+ assert freqs_cis.shape == (x.shape[1], x.shape[-1]), (
174
+ freqs_cis.shape,
175
+ (x.shape[1], x.shape[-1]),
176
+ )
177
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
178
+ return freqs_cis.view(*shape)
179
+
180
+
181
+ def apply_rotary_emb(
182
+ xq: torch.Tensor,
183
+ xk: torch.Tensor,
184
+ freqs_cis: torch.Tensor,
185
+ ) -> tuple[torch.Tensor, torch.Tensor]:
186
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
187
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
188
+ freqs_cis = _reshape_for_broadcast(freqs_cis, xq_)
189
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
190
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
191
+ return xq_out.type_as(xq), xk_out.type_as(xk)
192
+
193
+
194
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
195
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
196
+ t = torch.arange(end, device=freqs.device) # type: ignore
197
+ freqs = torch.outer(t, freqs).float() # type: ignore
198
+ return torch.polar(torch.ones_like(freqs), freqs) # complex64
199
+
200
+
201
+ class Attention(nn.Module):
202
+ def __init__(self, args: ModelArgs):
203
+ super().__init__()
204
+ self.args = args
205
+
206
+ self.n_heads: int = args.n_heads
207
+ self.n_kv_heads: int = args.n_kv_heads
208
+
209
+ self.repeats = self.n_heads // self.n_kv_heads
210
+ self.sliding_window = self.args.sliding_window
211
+
212
+ self.scale = self.args.head_dim**-0.5
213
+
214
+ self.wq = nn.Linear(
215
+ args.dim,
216
+ args.n_heads * args.head_dim,
217
+ bias=False
218
+ )
219
+ self.wk = nn.Linear(
220
+ args.dim,
221
+ args.n_kv_heads * args.head_dim,
222
+ bias=False
223
+ )
224
+ self.wv = nn.Linear(
225
+ args.dim,
226
+ args.n_kv_heads * args.head_dim,
227
+ bias=False
228
+ )
229
+ self.wo = nn.Linear(
230
+ args.n_heads * args.head_dim,
231
+ args.dim,
232
+ bias=False
233
+ )
234
+
235
+ def forward(
236
+ self, x: torch.Tensor, freqs_cis: torch.Tensor, positions: torch.Tensor, mask: Optional[torch.Tensor], cache: Optional[CacheView]
237
+ ) -> torch.Tensor:
238
+
239
+ bsz, seqlen, _ = x.shape
240
+
241
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
242
+ xq = xq.view(bsz, seqlen, self.n_heads, self.args.head_dim)
243
+ xk = xk.view(bsz, seqlen, self.n_kv_heads, self.args.head_dim)
244
+ xv = xv.view(bsz, seqlen, self.n_kv_heads, self.args.head_dim)
245
+ xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
246
+
247
+ # The cache is a rotating buffer
248
+ if cache is not None:
249
+ scatter_pos = (positions[-self.sliding_window:] % self.sliding_window)[None, :, None, None]
250
+ scatter_pos = scatter_pos.repeat(bsz, 1, self.n_kv_heads, self.args.head_dim)
251
+ cache.cache_k[:bsz].scatter_(dim=1, index=scatter_pos, src=xk[:, -self.sliding_window:])
252
+ cache.cache_v[:bsz].scatter_(dim=1, index=scatter_pos, src=xv[:, -self.sliding_window:])
253
+
254
+ if positions.shape[0] > 1:
255
+ # prefill
256
+ key, value = repeat_kv(xk, xv, self.repeats)
257
+ else:
258
+ cur_pos = positions[-1].item() + 1
259
+ key, value = repeat_kv(cache.cache_k[:bsz, :cur_pos, ...], cache.cache_v[:bsz, :cur_pos, ...], self.repeats)
260
+
261
+ # print(f"Internal: {xq.shape}, key: {key.shape}, mask: {mask.shape} | {mask.dtype} | xq: {xq.dtype} | mask: {mask} ")
262
+ # if mask is not None:
263
+ # mask = mask[None, None, ...].expand(bsz, self.n_heads, -1, -1)
264
+ # mask = mask.to(key.dtype)
265
+
266
+ query = xq.transpose(1, 2)
267
+ key = key.transpose(1, 2)
268
+ value = value.transpose(1, 2)
269
+ # # scores : [bsz, n_heads, seqlen | 1, seqlen]
270
+ # scores = torch.matmul(query, key.transpose(2, 3)) * self.scale
271
+
272
+ output = F.scaled_dot_product_attention(query, key, value, mask) # (bs, n_local_heads, slen, head_dim)
273
+ output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
274
+ return self.wo(output)
275
+
276
+
277
+ class FeedForward(nn.Module):
278
+ def __init__(self, args: ModelArgs):
279
+ super().__init__()
280
+
281
+ self.w1 = nn.Linear(
282
+ args.dim,
283
+ args.hidden_dim,
284
+ bias=False
285
+ )
286
+ self.w2 = nn.Linear(
287
+ args.hidden_dim,
288
+ args.dim,
289
+ bias=False
290
+ )
291
+ self.w3 = nn.Linear(
292
+ args.dim,
293
+ args.hidden_dim,
294
+ bias=False
295
+ )
296
+
297
+ def forward(self, x) -> torch.Tensor:
298
+ return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x))
299
+
300
+
301
+ class RMSNorm(torch.nn.Module):
302
+ def __init__(self, dim: int, eps: float = 1e-6):
303
+ super().__init__()
304
+ self.eps = eps
305
+ self.weight = nn.Parameter(torch.ones(dim))
306
+
307
+ def _norm(self, x):
308
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
309
+
310
+ def forward(self, x):
311
+ output = self._norm(x.float()).type_as(x)
312
+ return output * self.weight
313
+
314
+
315
+ class TransformerBlock(nn.Module):
316
+ def __init__(self, args: ModelArgs):
317
+ super().__init__()
318
+ self.n_heads = args.n_heads
319
+ self.dim = args.dim
320
+ self.attention = Attention(args)
321
+ self.feed_forward = FeedForward(args=args)
322
+ self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
323
+ self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
324
+ self.args = args
325
+
326
+ def forward(
327
+ self, x: torch.Tensor, freqs_cis: torch.Tensor, positions: torch.Tensor, mask: Optional[torch.Tensor], cache: Optional[CacheView]
328
+ ) -> torch.Tensor:
329
+ r = self.attention.forward(self.attention_norm(x), freqs_cis, positions, mask, cache)
330
+ h = x + r
331
+ r = self.feed_forward.forward(self.ffn_norm(h))
332
+ out = h + r
333
+ return out
334
+
335
+
336
+ class MistralTransformer(nn.Module):
337
+ def __init__(self, args: ModelArgs):
338
+ super().__init__()
339
+ self.args = args
340
+ self.vocab_size = args.vocab_size
341
+ self.n_layers = args.n_layers
342
+ assert self.vocab_size > 0
343
+
344
+ # self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
345
+
346
+ self.layers = torch.nn.ModuleList(
347
+ [TransformerBlock(args=args) for _ in range(args.n_layers)]
348
+ )
349
+
350
+ self.norm = RMSNorm(args.dim, eps=args.norm_eps)
351
+
352
+ self.output = nn.Linear(
353
+ args.dim,
354
+ args.vocab_size,
355
+ bias=False
356
+ )
357
+
358
+ # self.freqs_cis
359
+ self.freqs_cis = precompute_freqs_cis(self.args.head_dim, 128_000)
360
+
361
+ @property
362
+ def dtype(self) -> torch.dtype:
363
+ return self.tok_embeddings.weight.dtype
364
+
365
+ @property
366
+ def device(self) -> torch.device:
367
+ return self.tok_embeddings.weight.device
368
+
369
+ def forward(
370
+ self,
371
+ input_ids: torch.Tensor,
372
+ positions: torch.Tensor,
373
+ cache: Optional[RotatingBufferCache]
374
+ ):
375
+ h = input_ids
376
+ if self.freqs_cis.device != h.device:
377
+ self.freqs_cis = self.freqs_cis.to(h.device)
378
+ freqs_cis = self.freqs_cis[positions]
379
+
380
+ mask: Optional[torch.Tensor] = None
381
+ if input_ids.shape[1] > 1:
382
+ seqlen = input_ids.shape[1]
383
+ tensor = torch.full(
384
+ (seqlen, seqlen),
385
+ dtype=h.dtype,
386
+ fill_value=1,
387
+ device=h.device,
388
+ )
389
+ mask = torch.tril(tensor, diagonal=0).to(h.dtype)
390
+ # make the mask banded to account for sliding window
391
+ mask = torch.triu(mask, diagonal=-self.args.sliding_window)
392
+ mask = torch.log(mask)
393
+
394
+ for layer_id, layer in enumerate(self.layers):
395
+ cache_view = None if cache is None else cache.get_view(layer_id)
396
+ h = layer(h, freqs_cis, positions, mask, cache_view)
397
+
398
+ return self.output(self.norm(h))
399
+
400
+
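
The attention mask in `MistralTransformer.forward` is built by taking `torch.log` of a banded 0/1 matrix, turning allowed positions into 0 and blocked ones into -inf. A hedged, standalone illustration of the same trick (the window size and sequence length below are arbitrary):

# Illustrative only: banded causal mask for a length-5 prefill with sliding window 2.
import torch
seqlen, window = 5, 2
ones = torch.full((seqlen, seqlen), fill_value=1.0)
mask = torch.tril(ones, diagonal=0)        # causal: a position cannot attend to the future
mask = torch.triu(mask, diagonal=-window)  # banded: keep only the last window+1 positions
mask = torch.log(mask)                     # 1 -> 0.0 (allowed), 0 -> -inf (blocked)
# Row i is 0.0 for columns max(0, i-window)..i and -inf elsewhere, i.e. the additive
# bias format expected by F.scaled_dot_product_attention.
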
mars5/samplers.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ Code for modifying categorical distributions to improve quality of sampling.
3
+
4
+ Adapted from:
5
+ - https://github.com/e-c-k-e-r/vall-e/blob/master/vall_e/samplers.py
6
+ - Microsoft UniLM
7
+ - Matthew Baas's typical sampling code.
8
+ - https://github.com/LostRuins/koboldcpp
9
+ """
10
+
11
+ import math
12
+ import torch
13
+ import torch.nn.functional as F
14
+ import numpy as np
15
+ import logging
16
+
17
+ from torch import Tensor, nn
18
+
19
+
20
+ def freq_rep_penalty(logits: Tensor, previous: Tensor, alpha_frequency: float, alpha_presence: float, penalty_window: int = 100) -> Tensor:
21
+ """ Apply frequency and presence penalty according to openai's formuation.
22
+ Concretely: given `logits` (bs, vocab_size) and `previous` (bs, seq_len), the penalties are applied to each row of `logits` using the token counts from the corresponding row of `previous`.
23
+
24
+ Modified to support batched inference.
25
+
26
+ See: https://platform.openai.com/docs/guides/text-generation/parameter-details
27
+ """
28
+ bs = logits.shape[0]
29
+ previous = previous[..., -penalty_window:]
30
+ c = torch.zeros_like(logits, device=logits.device, dtype=torch.long) # (bs, vocab_size)
31
+ for i in range(bs):
32
+ vals, cnts = previous[i].unique(return_counts=True)
33
+ c[i, vals] = cnts.to(c.device)
34
+
35
+ logits = logits - c * alpha_frequency - (c > 0).to(logits.dtype) * alpha_presence
36
+ return logits
37
+
38
+
39
+ def early_eos_penalty(logits: Tensor, n_generated: int, estimated_gen_length: int, decay: float, factor: float = 1, eos_index: int = 0) -> Tensor:
40
+ """ Penalize the `eos_index` of `logits` (bs, vocab_size) up to `estimated_gen_length`,
41
+ whereby we reduce the logit value by `factor`*(expected_length - current_length)^decay,
42
+ `n_generated` is the current number of generated samples. `decay` anneals the penalty relative to the distance.
43
+
44
+ Good values for decay are between 0 and 1: with 0 the distance term is a constant 1 regardless of distance, while 1 scales the penalty linearly with the remaining distance.
45
+ Setting factor = 0 disables the penalty; increasing factor increases the penalty.
46
+ """
47
+ if n_generated > estimated_gen_length: return logits
48
+ penalty = max(estimated_gen_length - n_generated, 1)
49
+
50
+ bigger = logits[:, eos_index] > 0
51
+
52
+ modifier = factor*(penalty ** decay)
53
+ # logits[bigger, eos_index] /= modifier
54
+ # logits[~bigger, eos_index] *= modifier
55
+ logits[:, eos_index] -= modifier
56
+ return logits
57
+
58
+
59
+ # Credit to https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py#L1145 /
60
+ # https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
61
+ def top_k_top_p_filtering( logits: Tensor, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens=1 ) -> Tensor:
62
+ """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
63
+ Args:
64
+ logits: logits distribution shape (batch size, vocabulary size)
65
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
66
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
67
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
68
+ Make sure we keep at least min_tokens per batch example in the output
69
+ """
70
+ if top_k > 0:
71
+ top_k = min(max(top_k, min_tokens), logits.size(-1)) # Safety check
72
+ # Remove all tokens with a probability less than the last token of the top-k
73
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
74
+ logits[indices_to_remove] = filter_value
75
+
76
+ if top_p < 1.0:
77
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
78
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
79
+
80
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
81
+ sorted_indices_to_remove = cumulative_probs > top_p
82
+ if min_tokens > 1:
83
+ # Keep at least min_tokens (set to min_tokens-1 because we add the first one below)
84
+ sorted_indices_to_remove[..., :min_tokens] = 0
85
+ # Shift the indices to the right to keep also the first token above the threshold
86
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
87
+ sorted_indices_to_remove[..., 0] = 0
88
+
89
+ # scatter sorted tensors to original indexing
90
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
91
+ logits[indices_to_remove] = filter_value
92
+
93
+ return logits
94
+
95
+
96
+ def apply_typical_p(logprobs: Tensor, mass: float) -> Tensor:
97
+ """ Warp categorical logprobs associated with `x` to be in line with `mass`. Last dimension is the bin dimension.
98
+ `mass` corresponds to `tau` in the paper.
99
+ """
100
+ if mass > 0.999: return logprobs
101
+ # see: https://arxiv.org/abs/2202.00666
102
+ # calculate entropy
103
+ # normalized = logprobs #torch.nn.functional.log_softmax(scores, dim=-1)
104
+ normalized = torch.nn.functional.log_softmax(logprobs, dim=-1)
105
+ p = torch.exp(normalized)
106
+ ent = -(normalized * p).nansum(-1, keepdim=True)
107
+
108
+ # shift and sort
109
+ shifted_scores = torch.abs((-normalized) - ent)
110
+ sorted_scores, sorted_indices = torch.sort(shifted_scores, descending=False)
111
+ sorted_logits = logprobs.gather(-1, sorted_indices)
112
+ cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
113
+
114
+ # Remove tokens with cumulative mass above the threshold
115
+ last_ind = (cumulative_probs < mass).sum(dim=1)
116
+ last_ind[last_ind < 0] = 0
117
+ sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
118
+
119
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
120
+
121
+ scores = logprobs.masked_fill(indices_to_remove, -float('Inf'))
122
+ return scores
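
A hedged sketch of how the warpers above might be composed at decode time (temperature, repetition penalty, then top-k/top-p filtering and sampling); it assumes the functions defined above, and the parameter values are illustrative, not MARS5's defaults:

# Illustrative only: compose the warpers on a batch of logits and sample one token per row.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 1024)                # (bs, vocab_size)
previous = torch.randint(0, 1024, (2, 50))   # previously generated token ids
logits = freq_rep_penalty(logits, previous, alpha_frequency=0.2, alpha_presence=0.2)
logits = top_k_top_p_filtering(logits / 0.9, top_k=100, top_p=0.95)  # 0.9 = temperature
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # (bs, 1)
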
mars5/trim.py ADDED
@@ -0,0 +1,741 @@
1
+ """ Custom port of librosa trim code, to remove numba dependency.
2
+ This allows us to use the librosa trim effect without the librosa or numba dependency.
3
+
4
+ All code below is adapted from the librosa open-source GitHub:
5
+ """
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn.functional as F
10
+ import warnings
11
+
12
+
13
+ def amplitude_to_db(S, ref=1.0, amin=1e-5, top_db=80.0):
14
+ """Convert an amplitude spectrogram to dB-scaled spectrogram.
15
+
16
+ This is equivalent to ``power_to_db(S**2)``, but is provided for convenience.
17
+
18
+ Parameters
19
+ ----------
20
+ S : np.ndarray
21
+ input amplitude
22
+
23
+ ref : scalar or callable
24
+ If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
25
+ ``20 * log10(S / ref)``.
26
+ Zeros in the output correspond to positions where ``S == ref``.
27
+
28
+ If callable, the reference value is computed as ``ref(S)``.
29
+
30
+ amin : float > 0 [scalar]
31
+ minimum threshold for ``S`` and ``ref``
32
+
33
+ top_db : float >= 0 [scalar]
34
+ threshold the output at ``top_db`` below the peak:
35
+ ``max(20 * log10(S)) - top_db``
36
+
37
+
38
+ Returns
39
+ -------
40
+ S_db : np.ndarray
41
+ ``S`` measured in dB
42
+
43
+ See Also
44
+ --------
45
+ power_to_db, db_to_amplitude
46
+
47
+ Notes
48
+ -----
49
+ This function caches at level 30.
50
+ """
51
+
52
+ # S = np.asarray(S)
53
+ S = torch.asarray(S)
54
+
55
+
56
+ magnitude = S.abs()
57
+
58
+ if callable(ref):
59
+ # User supplied a function to calculate reference power
60
+ ref_value = ref(magnitude)
61
+ else:
62
+ ref_value = torch.abs(ref)
63
+
64
+ power = torch.square(magnitude, out=magnitude)
65
+
66
+ return power_to_db(power, ref=ref_value ** 2, amin=amin ** 2, top_db=top_db)
67
+
68
+
69
+ def _signal_to_frame_nonsilent(
70
+ y, frame_length=2048, hop_length=512, top_db=60, ref=torch.max
71
+ ):
72
+ """Frame-wise non-silent indicator for audio input.
73
+
74
+ This is a helper function for `trim` and `split`.
75
+
76
+ Parameters
77
+ ----------
78
+ y : np.ndarray, shape=(n,) or (2,n)
79
+ Audio signal, mono or stereo
80
+
81
+ frame_length : int > 0
82
+ The number of samples per frame
83
+
84
+ hop_length : int > 0
85
+ The number of samples between frames
86
+
87
+ top_db : number > 0
88
+ The threshold (in decibels) below reference to consider as
89
+ silence
90
+
91
+ ref : callable or float
92
+ The reference power
93
+
94
+ Returns
95
+ -------
96
+ non_silent : np.ndarray, shape=(m,), dtype=bool
97
+ Indicator of non-silent frames
98
+ """
99
+ # Convert to mono
100
+ if y.ndim > 1:
101
+ y_mono = torch.mean(y, dim=0)
102
+ else: y_mono = y
103
+
104
+ # Compute the MSE for the signal
105
+ mse = rms(y=y_mono, frame_length=frame_length, hop_length=hop_length) ** 2
106
+
107
+ return power_to_db(mse.squeeze(), ref=ref, top_db=None) > -top_db
108
+
109
+
110
+ def trim(y, top_db=60, ref=torch.max, frame_length=2048, hop_length=512):
111
+ """Trim leading and trailing silence from an audio signal.
112
+
113
+ Parameters
114
+ ----------
115
+ y : np.ndarray, shape=(n,) or (2,n)
116
+ Audio signal, can be mono or stereo
117
+
118
+ top_db : number > 0
119
+ The threshold (in decibels) below reference to consider as
120
+ silence
121
+
122
+ ref : number or callable
123
+ The reference power. By default, it uses `np.max` and compares
124
+ to the peak power in the signal.
125
+
126
+ frame_length : int > 0
127
+ The number of samples per analysis frame
128
+
129
+ hop_length : int > 0
130
+ The number of samples between analysis frames
131
+
132
+ Returns
133
+ -------
134
+ y_trimmed : np.ndarray, shape=(m,) or (2, m)
135
+ The trimmed signal
136
+
137
+ index : np.ndarray, shape=(2,)
138
+ the interval of ``y`` corresponding to the non-silent region:
139
+ ``y_trimmed = y[index[0]:index[1]]`` (for mono) or
140
+ ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo).
141
+
142
+
143
+ Examples
144
+ --------
145
+ >>> # Load some audio
146
+ >>> y, sr = librosa.load(librosa.ex('choice'))
147
+ >>> # Trim the beginning and ending silence
148
+ >>> yt, index = librosa.effects.trim(y)
149
+ >>> # Print the durations
150
+ >>> print(librosa.get_duration(y), librosa.get_duration(yt))
151
+ 25.025986394557822 25.007891156462584
152
+ """
153
+
154
+ non_silent = _signal_to_frame_nonsilent(
155
+ y, frame_length=frame_length, hop_length=hop_length, ref=ref, top_db=top_db
156
+ )
157
+
158
+ # nonzero = np.flatnonzero(non_silent)
159
+ nonzero = torch.nonzero(torch.ravel(non_silent)).squeeze()#[0]
160
+
161
+ if nonzero.numel() > 0:
162
+ # Compute the start and end positions
163
+ # End position goes one frame past the last non-zero
164
+ start = int(frames_to_samples(nonzero[0], hop_length))
165
+ end = min(y.shape[-1], int(frames_to_samples(nonzero[-1] + 1, hop_length)))
166
+ else:
167
+ # The signal only contains zeros
168
+ start, end = 0, 0
169
+
170
+ # Build the mono/stereo index
171
+ full_index = [slice(None)] * y.ndim
172
+ full_index[-1] = slice(start, end)
173
+
174
+ # print(non_silent)
175
+ # print(non_silent.shape, nonzero.shape)
176
+
177
+ return y[tuple(full_index)], torch.asarray([start, end])
178
+
179
+
180
+ def rms(
181
+ y=None, S=None, frame_length=2048, hop_length=512, center=True, pad_mode="reflect"
182
+ ):
183
+ """Compute root-mean-square (RMS) value for each frame, either from the
184
+ audio samples ``y`` or from a spectrogram ``S``.
185
+
186
+ Computing the RMS value from audio samples is faster as it doesn't require
187
+ a STFT calculation. However, using a spectrogram will give a more accurate
188
+ representation of energy over time because its frames can be windowed,
189
+ thus prefer using ``S`` if it's already available.
190
+
191
+
192
+ Parameters
193
+ ----------
194
+ y : np.ndarray [shape=(n,)] or None
195
+ (optional) audio time series. Required if ``S`` is not input.
196
+
197
+ S : np.ndarray [shape=(d, t)] or None
198
+ (optional) spectrogram magnitude. Required if ``y`` is not input.
199
+
200
+ frame_length : int > 0 [scalar]
201
+ length of analysis frame (in samples) for energy calculation
202
+
203
+ hop_length : int > 0 [scalar]
204
+ hop length for STFT. See `librosa.stft` for details.
205
+
206
+ center : bool
207
+ If `True` and operating on time-domain input (``y``), pad the signal
208
+ by ``frame_length//2`` on either side.
209
+
210
+ If operating on spectrogram input, this has no effect.
211
+
212
+ pad_mode : str
213
+ Padding mode for centered analysis. See `numpy.pad` for valid
214
+ values.
215
+
216
+ Returns
217
+ -------
218
+ rms : np.ndarray [shape=(1, t)]
219
+ RMS value for each frame
220
+
221
+
222
+ Examples
223
+ --------
224
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
225
+ >>> librosa.feature.rms(y=y)
226
+ array([[1.248e-01, 1.259e-01, ..., 1.845e-05, 1.796e-05]],
227
+ dtype=float32)
228
+
229
+ Or from spectrogram input
230
+
231
+ >>> S, phase = librosa.magphase(librosa.stft(y))
232
+ >>> rms = librosa.feature.rms(S=S)
233
+
234
+ >>> import matplotlib.pyplot as plt
235
+ >>> fig, ax = plt.subplots(nrows=2, sharex=True)
236
+ >>> times = librosa.times_like(rms)
237
+ >>> ax[0].semilogy(times, rms[0], label='RMS Energy')
238
+ >>> ax[0].set(xticks=[])
239
+ >>> ax[0].legend()
240
+ >>> ax[0].label_outer()
241
+ >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
242
+ ... y_axis='log', x_axis='time', ax=ax[1])
243
+ >>> ax[1].set(title='log Power spectrogram')
244
+
245
+ Use a STFT window of constant ones and no frame centering to get consistent
246
+ results with the RMS computed from the audio samples ``y``
247
+
248
+ >>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0]
249
+ >>> librosa.feature.rms(S=S)
250
+ >>> plt.show()
251
+
252
+ """
253
+ if y is not None:
254
+ if y.dim() > 1:
255
+ y = torch.mean(y, dim=0)
256
+
257
+ if center:
258
+ y = F.pad(y[None, None], (int(frame_length//2), int(frame_length//2)), mode=pad_mode)[0, 0]
259
+ # y = np.pad(y, int(frame_length // 2), mode=pad_mode)
260
+
261
+ x = frame(y, frame_length=frame_length, hop_length=hop_length)
263
+ # Calculate power
264
+ power = torch.mean(x.abs() ** 2, dim=0, keepdim=True)
265
+ elif S is not None:
266
+ # Check the frame length
267
+ if S.shape[0] != frame_length // 2 + 1:
268
+ raise AssertionError(
269
+ "Since S.shape[0] is {}, "
270
+ "frame_length is expected to be {} or {}; "
271
+ "found {}".format(
272
+ S.shape[0], S.shape[0] * 2 - 2, S.shape[0] * 2 - 1, frame_length
273
+ )
274
+ )
275
+
276
+ # power spectrogram
277
+ x = torch.abs(S) ** 2
278
+
279
+ # Adjust the DC and sr/2 component
280
+ x[0] *= 0.5
281
+ if frame_length % 2 == 0:
282
+ x[-1] *= 0.5
283
+
284
+ # Calculate power
285
+ power = 2 * torch.sum(x, dim=0, keepdim=True) / frame_length ** 2
286
+ else:
287
+ raise AssertionError("Either `y` or `S` must be input.")
288
+
289
+ return torch.sqrt(power)
290
+
291
+
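For intuition, a self-contained sketch of the time-domain branch (mono input, no centering), using ``unfold`` instead of the ``frame`` helper below; each frame's RMS is just the square root of its mean squared amplitude::

    import torch

    y = torch.randn(22050)                        # 1 s of dummy audio at 22.05 kHz
    frames = y.unfold(0, 2048, 512)               # (n_frames, frame_length)
    rms_vals = frames.pow(2).mean(dim=1).sqrt()   # one RMS value per frame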
292
+ def frame(x, frame_length, hop_length, axis=-1):
293
+ """Slice a data array into (overlapping) frames.
294
+
295
+ This implementation uses low-level stride manipulation to avoid
296
+ making a copy of the data. The resulting frame representation
297
+ is a new view of the same input data.
298
+
299
+ However, if the input data is not contiguous in memory, a warning
300
+ will be issued and the output will be a full copy, rather than
301
+ a view of the input data.
302
+
303
+ For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]``
304
+ can be framed with frame length 3 and hop length 2 in two ways.
305
+ The first (``axis=-1``) results in the array ``x_frames``::
306
+
307
+ [[0, 2, 4],
308
+ [1, 3, 5],
309
+ [2, 4, 6]]
310
+
311
+ where each column ``x_frames[:, i]`` contains a contiguous slice of
312
+ the input ``x[i * hop_length : i * hop_length + frame_length]``.
313
+
314
+ The second way (``axis=0``) results in the array ``x_frames``::
315
+
316
+ [[0, 1, 2],
317
+ [2, 3, 4],
318
+ [4, 5, 6]]
319
+
320
+ where each row ``x_frames[i]`` contains a contiguous slice of the input.
321
+
322
+ This generalizes to higher dimensional inputs, as shown in the examples below.
323
+ In general, the framing operation increases the number of dimensions by one,
324
+ adding a new "frame axis" either to the end of the array (``axis=-1``)
325
+ or the beginning of the array (``axis=0``).
326
+
327
+
328
+ Parameters
329
+ ----------
330
+ x : np.ndarray
331
+ Array to frame
332
+
333
+ frame_length : int > 0 [scalar]
334
+ Length of the frame
335
+
336
+ hop_length : int > 0 [scalar]
337
+ Number of steps to advance between frames
338
+
339
+ axis : 0 or -1
340
+ The axis along which to frame.
341
+
342
+ If ``axis=-1`` (the default), then ``x`` is framed along its last dimension.
343
+ ``x`` must be "F-contiguous" in this case.
344
+
345
+ If ``axis=0``, then ``x`` is framed along its first dimension.
346
+ ``x`` must be "C-contiguous" in this case.
347
+
348
+ Returns
349
+ -------
350
+ x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES) or (N_FRAMES, frame_length, ...)]
351
+ A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension)::
352
+
353
+ x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length]
354
+
355
+ If ``axis=0`` (framing on the first dimension), then::
356
+
357
+ x_frames[j] = x[j * hop_length : j * hop_length + frame_length]
358
+
359
+ Raises
360
+ ------
361
+ AssertionError
362
+
364
+ If ``x.shape[axis] < frame_length``, there is not enough data to fill one frame.
365
+
366
+ If ``hop_length < 1``, frames cannot advance.
367
+
368
+ If ``axis`` is not 0 or -1. Framing is only supported along the first or last axis.
369
+
370
+
371
+ See Also
372
+ --------
373
+ numpy.asfortranarray : Convert data to F-contiguous representation
374
+ numpy.ascontiguousarray : Convert data to C-contiguous representation
375
+ numpy.ndarray.flags : information about the memory layout of a numpy `ndarray`.
376
+
377
+ Examples
378
+ --------
379
+ Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame
380
+
381
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
382
+ >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
383
+ >>> frames
384
+ array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06],
385
+ [-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05],
386
+ ...,
387
+ [ 7.960e-02, -2.335e-01, ..., -6.815e-06, 1.266e-05],
388
+ [ 9.568e-02, -1.252e-01, ..., 7.397e-06, -1.921e-05]],
389
+ dtype=float32)
390
+ >>> y.shape
391
+ (117601,)
392
+
393
+ >>> frames.shape
394
+ (2048, 1806)
395
+
396
+ Or frame along the first axis instead of the last:
397
+
398
+ >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0)
399
+ >>> frames.shape
400
+ (1806, 2048)
401
+
402
+ Frame a stereo signal:
403
+
404
+ >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), mono=False)
405
+ >>> y.shape
406
+ (2, 117601)
407
+ >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
408
+ >>> frames.shape
+ (2, 2048, 1806)
409
+
410
+ Carve an STFT into fixed-length patches of 32 frames with 50% overlap
411
+
412
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
413
+ >>> S = np.abs(librosa.stft(y))
414
+ >>> S.shape
415
+ (1025, 230)
416
+ >>> S_patch = librosa.util.frame(S, frame_length=32, hop_length=16)
417
+ >>> S_patch.shape
418
+ (1025, 32, 13)
419
+ >>> # The first patch contains the first 32 frames of S
420
+ >>> np.allclose(S_patch[:, :, 0], S[:, :32])
421
+ True
422
+ >>> # The second patch contains frames 16 to 16+32=48, and so on
423
+ >>> np.allclose(S_patch[:, :, 1], S[:, 16:48])
424
+ True
425
+ """
426
+
427
+ # if not isinstance(x, np.ndarray):
428
+ # raise AssertionError(
429
+ # "Input must be of type numpy.ndarray, " "given type(x)={}".format(type(x))
430
+ # )
431
+ x: torch.Tensor = x
432
+
433
+ if x.shape[axis] < frame_length:
434
+ raise AssertionError(
435
+ "Input is too short (n={:d})"
436
+ " for frame_length={:d}".format(x.shape[axis], frame_length)
437
+ )
438
+
439
+ if hop_length < 1:
440
+ raise AssertionError("Invalid hop_length: {:d}".format(hop_length))
441
+
442
+ if axis == -1 and not x.is_contiguous():
443
+ warnings.warn(
444
+ "librosa.util.frame called with axis={} "
445
+ "on a non-contiguous input. This will result in a copy.".format(axis)
446
+ )
447
+ x = x.contiguous()
448
+ elif axis == 0 and not x.is_contiguous():
449
+ warnings.warn(
450
+ "librosa.util.frame called with axis={} "
451
+ "on a non-contiguous input. This will result in a copy.".format(axis)
452
+ )
453
+ x = x.contiguous()
454
+
455
+ n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
456
+ strides = torch.asarray(x.numpy().strides)
458
+ new_stride = torch.prod(strides[strides > 0] // x.itemsize) * x.itemsize
459
+
460
+ if axis == -1:
461
+ shape = list(x.shape)[:-1] + [frame_length, n_frames]
462
+ strides = list(strides) + [hop_length * new_stride]
463
+
464
+ elif axis == 0:
465
+ shape = [n_frames, frame_length] + list(x.shape)[1:]
466
+ strides = [hop_length * new_stride] + list(strides)
467
+
468
+ else:
469
+ raise AssertionError("Frame axis={} must be either 0 or -1".format(axis))
470
+
471
+ return torch.from_numpy(as_strided(x, shape=shape, strides=strides))
472
+ # return x.as_strided(size=shape, stride=strides)
473
+
474
+
475
+
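For 1-D input, the same framing can be reproduced with ``torch.Tensor.unfold`` and a transpose, which matches the docstring example above::

    import torch

    x = torch.arange(7)
    frames = x.unfold(0, 3, 2).T    # frame_length=3, hop_length=2, frames as columns
    # tensor([[0, 2, 4],
    #         [1, 3, 5],
    #         [2, 4, 6]])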
476
+ class DummyArray:
477
+ """Dummy object that just exists to hang __array_interface__ dictionaries
478
+ and possibly keep alive a reference to a base array.
479
+ """
480
+
481
+ def __init__(self, interface, base=None):
482
+ self.__array_interface__ = interface
483
+ self.base = base
484
+
485
+
486
+
487
+ def as_strided(x, shape=None, strides=None, subok=False, writeable=True):
488
+ """
489
+ Create a view into the array with the given shape and strides.
490
+
491
+ .. warning:: This function has to be used with extreme care, see notes.
492
+
493
+ Parameters
494
+ ----------
495
+ x : ndarray
496
+ Array from which to create the new view.
497
+ shape : sequence of int, optional
498
+ The shape of the new array. Defaults to ``x.shape``.
499
+ strides : sequence of int, optional
500
+ The strides of the new array. Defaults to ``x.strides``.
501
+ subok : bool, optional
502
+ .. versionadded:: 1.10
503
+
504
+ If True, subclasses are preserved.
505
+ writeable : bool, optional
506
+ .. versionadded:: 1.12
507
+
508
+ If set to False, the returned array will always be readonly.
509
+ Otherwise it will be writable if the original array was. It
510
+ is advisable to set this to False if possible (see Notes).
511
+
512
+ Returns
513
+ -------
514
+ view : ndarray
515
+
516
+ See also
517
+ --------
518
+ broadcast_to : broadcast an array to a given shape.
519
+ reshape : reshape an array.
520
+ lib.stride_tricks.sliding_window_view :
521
+ user-friendly and safe function for the creation of sliding window views.
522
+
523
+ Notes
524
+ -----
525
+ ``as_strided`` creates a view into the array given the exact strides
526
+ and shape. This means it manipulates the internal data structure of
527
+ ndarray and, if done incorrectly, the array elements can point to
528
+ invalid memory and can corrupt results or crash your program.
529
+ It is advisable to always use the original ``x.strides`` when
530
+ calculating new strides to avoid reliance on a contiguous memory
531
+ layout.
532
+
533
+ Furthermore, arrays created with this function often contain
534
+ self-overlapping memory, so that two elements are identical.
535
+ Vectorized write operations on such arrays will typically be
536
+ unpredictable. They may even give different results for small, large,
537
+ or transposed arrays.
538
+ Since writing to these arrays has to be tested and done with great
539
+ care, you may want to use ``writeable=False`` to avoid accidental write
540
+ operations.
541
+
542
+ For these reasons it is advisable to avoid ``as_strided`` when
543
+ possible.
544
+ """
545
+ # first convert input to array, possibly keeping subclass
546
+ x = np.array(x, copy=False, subok=subok)
547
+ interface = dict(x.__array_interface__)
548
+ if shape is not None:
549
+ interface['shape'] = tuple(shape)
550
+ if strides is not None:
551
+ interface['strides'] = tuple(strides)
552
+
553
+ array = np.asarray(DummyArray(interface, base=x))
554
+ # The route via `__interface__` does not preserve structured
555
+ # dtypes. Since dtype should remain unchanged, we set it explicitly.
556
+ array.dtype = x.dtype
557
+
558
+ view = _maybe_view_as_subclass(x, array)
559
+
560
+ if view.flags.writeable and not writeable:
561
+ view.flags.writeable = False
562
+
563
+ return view
564
+
565
+
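NumPy's own ``numpy.lib.stride_tricks.as_strided`` behaves the same way; a small sliding-window illustration of how shape and byte strides define the view::

    import numpy as np

    x = np.arange(7)
    step = x.strides[-1]                           # bytes per element
    view = np.lib.stride_tricks.as_strided(
        x, shape=(3, 3), strides=(2 * step, step)) # hop of 2 elements per row
    # array([[0, 1, 2],
    #        [2, 3, 4],
    #        [4, 5, 6]])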
566
+ def _maybe_view_as_subclass(original_array, new_array):
567
+ if type(original_array) is not type(new_array):
568
+ # if input was an ndarray subclass and subclasses were OK,
569
+ # then view the result as that subclass.
570
+ new_array = new_array.view(type=type(original_array))
571
+ # Since we have done something akin to a view from original_array, we
572
+ # should let the subclass finalize (if it has it implemented, i.e., is
573
+ # not None).
574
+ if new_array.__array_finalize__:
575
+ new_array.__array_finalize__(original_array)
576
+ return new_array
577
+
578
+
579
+ def power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
580
+ """Convert a power spectrogram (amplitude squared) to decibel (dB) units
581
+
582
+ This computes the scaling ``10 * log10(S / ref)`` in a numerically
583
+ stable way.
584
+
585
+ Parameters
586
+ ----------
587
+ S : np.ndarray
588
+ input power
589
+
590
+ ref : scalar or callable
591
+ If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::
592
+
593
+ 10 * log10(S / ref)
594
+
595
+ Zeros in the output correspond to positions where ``S == ref``.
596
+
597
+ If callable, the reference value is computed as ``ref(S)``.
598
+
599
+ amin : float > 0 [scalar]
600
+ minimum threshold for ``abs(S)`` and ``ref``
601
+
602
+ top_db : float >= 0 [scalar]
603
+ threshold the output at ``top_db`` below the peak:
604
+ ``max(10 * log10(S)) - top_db``
605
+
606
+ Returns
607
+ -------
608
+ S_db : np.ndarray
609
+ ``S_db ~= 10 * log10(S) - 10 * log10(ref)``
610
+
611
+ See Also
612
+ --------
613
+ perceptual_weighting
614
+ db_to_power
615
+ amplitude_to_db
616
+ db_to_amplitude
617
+
618
+ Notes
619
+ -----
620
+ This function caches at level 30.
621
+
622
+
623
+ Examples
624
+ --------
625
+ Get a power spectrogram from a waveform ``y``
626
+
627
+ >>> y, sr = librosa.load(librosa.ex('trumpet'))
628
+ >>> S = np.abs(librosa.stft(y))
629
+ >>> librosa.power_to_db(S**2)
630
+ array([[-41.809, -41.809, ..., -41.809, -41.809],
631
+ [-41.809, -41.809, ..., -41.809, -41.809],
632
+ ...,
633
+ [-41.809, -41.809, ..., -41.809, -41.809],
634
+ [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)
635
+
636
+ Compute dB relative to peak power
637
+
638
+ >>> librosa.power_to_db(S**2, ref=np.max)
639
+ array([[-80., -80., ..., -80., -80.],
640
+ [-80., -80., ..., -80., -80.],
641
+ ...,
642
+ [-80., -80., ..., -80., -80.],
643
+ [-80., -80., ..., -80., -80.]], dtype=float32)
644
+
645
+ Or compare to median power
646
+
647
+ >>> librosa.power_to_db(S**2, ref=np.median)
648
+ array([[16.578, 16.578, ..., 16.578, 16.578],
649
+ [16.578, 16.578, ..., 16.578, 16.578],
650
+ ...,
651
+ [16.578, 16.578, ..., 16.578, 16.578],
652
+ [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)
653
+
654
+
655
+ And plot the results
656
+
657
+ >>> import matplotlib.pyplot as plt
658
+ >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
659
+ >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
660
+ ... ax=ax[0])
661
+ >>> ax[0].set(title='Power spectrogram')
662
+ >>> ax[0].label_outer()
663
+ >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
664
+ ... sr=sr, y_axis='log', x_axis='time', ax=ax[1])
665
+ >>> ax[1].set(title='Log-Power spectrogram')
666
+ >>> fig.colorbar(imgpow, ax=ax[0])
667
+ >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
668
+ """
669
+
670
+ S = torch.asarray(S)
671
+
672
+ if amin <= 0:
673
+ raise AssertionError("amin must be strictly positive")
674
+
675
+ # if np.issubdtype(S.dtype, np.complexfloating):
676
+ # warnings.warn(
677
+ # "power_to_db was called on complex input so phase "
678
+ # "information will be discarded. To suppress this warning, "
679
+ # "call power_to_db(np.abs(D)**2) instead."
680
+ # )
681
+ # magnitude = np.abs(S)
682
+ # else:
683
+ magnitude = S
684
+
685
+ if callable(ref):
686
+ # User supplied a function to calculate reference power
687
+ ref_value = ref(magnitude)
688
+ else:
689
+ ref_value = torch.abs(torch.as_tensor(ref))  # accept Python scalars as well as tensors
690
+
691
+ log_spec = 10.0 * torch.log10(torch.maximum(torch.tensor(amin), magnitude))
692
+ log_spec -= 10.0 * torch.log10(torch.maximum(torch.tensor(amin), ref_value))
693
+
694
+ if top_db is not None:
695
+ if top_db < 0:
696
+ raise AssertionError("top_db must be non-negative")
697
+ log_spec = torch.maximum(log_spec, log_spec.max() - top_db)
698
+
699
+ return log_spec
700
+
701
+
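A hand-worked check of the scaling (independent of this function): with ``ref=1`` the conversion is plain ``10 * log10(S)``, and ``top_db`` floors the result at ``top_db`` below the peak::

    import torch

    S = torch.tensor([1.0, 0.1, 1e-4])
    db = 10.0 * torch.log10(S)                     # tensor([  0., -10., -40.])
    db_floored = torch.maximum(db, db.max() - 30)  # top_db=30 -> tensor([  0., -10., -30.])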
702
+ def frames_to_samples(frames, hop_length=512, n_fft=None):
703
+ """Converts frame indices to audio sample indices.
704
+
705
+ Parameters
706
+ ----------
707
+ frames : number or np.ndarray [shape=(n,)]
708
+ frame index or vector of frame indices
709
+
710
+ hop_length : int > 0 [scalar]
711
+ number of samples between successive frames
712
+
713
+ n_fft : None or int > 0 [scalar]
714
+ Optional: length of the FFT window.
715
+ If given, time conversion will include an offset of ``n_fft // 2``
716
+ to counteract windowing effects when using a non-centered STFT.
717
+
718
+ Returns
719
+ -------
720
+ times : number or np.ndarray
721
+ time (in samples) of each given frame number::
722
+
723
+ times[i] = frames[i] * hop_length
724
+
725
+ See Also
726
+ --------
727
+ frames_to_time : convert frame indices to time values
728
+ samples_to_frames : convert sample indices to frame indices
729
+
730
+ Examples
731
+ --------
732
+ >>> y, sr = librosa.load(librosa.ex('choice'))
733
+ >>> tempo, beats = librosa.beat.beat_track(y, sr=sr)
734
+ >>> beat_samples = librosa.frames_to_samples(beats)
735
+ """
736
+
737
+ offset = 0
738
+ if n_fft is not None:
739
+ offset = int(n_fft // 2)
740
+
741
+ return (torch.asarray(frames) * hop_length + offset).to(torch.int)
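The conversion is simply ``frames * hop_length``, plus ``n_fft // 2`` when ``n_fft`` is given; for example::

    import torch

    frames = torch.tensor([0, 1, 2])
    frames * 512               # tensor([   0,  512, 1024])
    frames * 512 + 2048 // 2   # with n_fft=2048 -> tensor([1024, 1536, 2048])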
mars5/utils.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import logging
3
+
4
+ def length_to_mask(length, offsets, max_len=None):
5
+ """
6
+ Convert tensor of lengths into a mask.
7
+
8
+ Args:
9
+ length (Tensor): a tensor of lengths, shape = (batch_size,)
10
+ offsets (Tensor): a tensor of offsets, shape = (batch_size,)
11
+ max_len (int, optional): maximum length to be considered
12
+
13
+ Returns:
14
+ mask (Tensor): a mask tensor, shape = (batch_size, max_len),
15
+ True in masked positions, False otherwise.
16
+ """
20
+ # if maximum length is not provided, then compute it from the 'length' tensor.
21
+ if max_len is None:
22
+ max_len = length.max().item()
26
+
27
+ # Create a tensor with consecutive numbers.
28
+ range_tensor = torch.arange(max_len, device=length.device)
29
+
30
+ # Expand the dim of 'length' tensor and 'offset' tensor to make it `(batch_size, max_len)`.
31
+ # The added dimension will be used for broadcasting.
32
+ length_exp = length.unsqueeze(-1)
33
+ offsets_exp = offsets.unsqueeze(-1)
34
+
35
+ # Create a boolean mask where `False` represents valid positions and `True` represents padding.
36
+ mask = (range_tensor < offsets_exp) | (~(range_tensor < length_exp))
37
+
38
+ return mask
39
+
40
+
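A quick check of the expected output, re-deriving the same broadcast logic inline (``True`` marks positions before ``offsets`` or at/after ``length``)::

    import torch

    length = torch.tensor([3, 5])
    offsets = torch.tensor([1, 0])
    rng = torch.arange(int(length.max()))
    mask = (rng < offsets[:, None]) | ~(rng < length[:, None])
    # tensor([[ True, False, False,  True,  True],
    #         [False, False, False, False, False]])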
41
+ def construct_padding_mask(input_tensor, pad_token):
42
+ # mask everything from the first pad token onward in each sequence
+ return (input_tensor == pad_token).cumsum(dim=1) > 0
43
+
44
+
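The cumulative sum turns the mask ``True`` from the first pad token onward in each row; for example, with ``pad_token=0``::

    import torch

    tokens = torch.tensor([[5, 2, 0, 7, 0]])
    (tokens == 0).cumsum(dim=1) > 0
    # tensor([[False, False,  True,  True,  True]])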
45
+ def nuke_weight_norm(module):
46
+ """
47
+ Recursively remove weight normalization from a module and its children.
48
+
49
+ Args:
50
+ module (torch.nn.Module): The module from which to remove weight normalization.
51
+ """
52
+ # Remove weight norm from current module if it exists
53
+ try:
54
+ torch.nn.utils.remove_weight_norm(module)
55
+ logging.debug(f"Removed weight norm from {module.__class__.__name__}")
56
+ except ValueError:
57
+ # Ignore if the module does not have weight norm applied.
58
+ pass
59
+
60
+ # Recursively call the function on children modules
61
+ for child in module.children():
62
+ nuke_weight_norm(child)
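A minimal usage sketch: the recursion amounts to calling ``remove_weight_norm`` on every submodule and ignoring those that never had weight norm applied::

    import torch

    conv = torch.nn.utils.weight_norm(torch.nn.Conv1d(4, 4, 3))
    model = torch.nn.Sequential(conv)

    for m in model.modules():               # same effect as nuke_weight_norm(model)
        try:
            torch.nn.utils.remove_weight_norm(m)
        except ValueError:
            pass

    assert not hasattr(conv, "weight_g")    # parametrisation gone, plain .weight restored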
mars5_demo.ipynb ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "!pip install --upgrade vocos encodec librosa"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 16,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pprint\n",
19
+ "import IPython.display as ipd\n",
20
+ "import torch\n",
21
+ "import librosa"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "# load model\n",
31
+ "mars5, config_class = torch.hub.load('Camb-ai/mars5-tts', 'mars5_english', trust_repo=True)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "metadata": {},
37
+ "source": [
38
+ "Now that the model is loaded, pick a reference audio to clone from. If you want to use deep clone, also specify its transcript. "
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# download example ref audio\n",
48
+ "!wget -O example.wav https://github.com/Camb-ai/mars5-tts/raw/master/docs/assets/example_ref.wav "
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "wav, sr = librosa.load('./example.wav', \n",
58
+ " sr=mars5.sr, mono=True)\n",
59
+ "wav = torch.from_numpy(wav)\n",
60
+ "ref_transcript = \"We actually haven't managed to meet demand.\"\n",
61
+ "print(\"Reference audio:\")\n",
62
+ "ipd.display(ipd.Audio(wav.numpy(), rate=mars5.sr))\n",
63
+ "print(f\"Reference transcript: {ref_transcript}\")"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "deep_clone = True # set to False if you don't know prompt transcript or want fast inference.\n",
73
+ "# Below you can tune other inference settings, like top_k, temperature, top_p, etc...\n",
74
+ "cfg = config_class(deep_clone=deep_clone, rep_penalty_window=100,\n",
75
+ " top_k=100, temperature=0.7, freq_penalty=3)\n",
76
+ "\n",
77
+ "ar_codes, wav_out = mars5.tts(\"The quick brown rat.\", wav, \n",
78
+ " ref_transcript,\n",
79
+ " cfg=cfg)\n",
80
+ "\n",
81
+ "print('Synthesized output audio:')\n",
82
+ "ipd.Audio(wav_out.numpy(), rate=mars5.sr)"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "metadata": {},
88
+ "source": [
89
+ "You can see all the inference settings available to tune in the inference config here:"
90
+ ]
91
+ },
92
+ {
93
+ "cell_type": "code",
94
+ "execution_count": null,
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "pprint.pprint(config_class())"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "metadata": {},
104
+ "source": [
105
+ "You can also listen to the vocoded raw coarse codes, for debugging purposes:"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "ar_wav = mars5.vocode(ar_codes.cpu()[:, None])\n",
115
+ "ipd.Audio(ar_wav.numpy(), rate=mars5.sr)"
116
+ ]
117
+ }
118
+ ],
119
+ "metadata": {
120
+ "kernelspec": {
121
+ "display_name": "matt-py311",
122
+ "language": "python",
123
+ "name": "python3"
124
+ },
125
+ "language_info": {
126
+ "codemirror_mode": {
127
+ "name": "ipython",
128
+ "version": 3
129
+ },
130
+ "file_extension": ".py",
131
+ "mimetype": "text/x-python",
132
+ "name": "python",
133
+ "nbconvert_exporter": "python",
134
+ "pygments_lexer": "ipython3",
135
+ "version": "3.11.9"
136
+ }
137
+ },
138
+ "nbformat": 4,
139
+ "nbformat_minor": 2
140
+ }
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ torchaudio
4
+ numpy
5
+ regex
6
+ librosa
7
+ vocos
8
+ encodec