Spaces:

Fabrice-TIERCELIN
/

Text-to-Music

Running

App Files Files Community

adefossez committed on Jun 13, 2023

Commit

6ec60d5

1 Parent(s): 6a458f2

support both torch and xformers + merge apps

Browse files

Files changed (5) hide show

app.py +188 -86
app_batched.py +0 -222
audiocraft/modules/transformer.py +67 -27
tests/modules/test_rope.py +9 -1
tests/modules/test_transformer.py +40 -34

app.py CHANGED Viewed

@@ -1,70 +1,125 @@
-"""
-Copyright (c) Meta Platforms, Inc. and affiliates.
-All rights reserved.
-This source code is licensed under the license found in the
-LICENSE file in the root directory of this source tree.
-"""
-from tempfile import NamedTemporaryFile
 import argparse
 import torch
 import gradio as gr
-import os
-from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
-MODEL = None
-IS_SHARED_SPACE = "musicgen/MusicGen" in os.environ.get('SPACE_ID', '')
-def load_model(version):
-    print("Loading model", version)
-    return MusicGen.get_pretrained(version)
-def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     global MODEL
-    topk = int(topk)
-    if MODEL is None or MODEL.name != model:
-        MODEL = load_model(model)
-    if duration > MODEL.lm.cfg.dataset.segment_duration:
-        raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
-    MODEL.set_generation_params(
-        use_sampling=True,
-        top_k=topk,
-        top_p=topp,
-        temperature=temperature,
-        cfg_coef=cfg_coef,
-        duration=duration,
-    )
-    if melody:
-        sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t().unsqueeze(0)
-        print(melody.shape)
-        if melody.dim() == 2:
-            melody = melody[None]
-        melody = melody[..., :int(sr * MODEL.lm.cfg.dataset.segment_duration)]
-        output = MODEL.generate_with_chroma(
-            descriptions=[text],
-            melody_wavs=melody,
-            melody_sample_rate=sr,
             progress=False
         )
     else:
-        output = MODEL.generate(descriptions=[text], progress=False)
-    output = output.detach().cpu().float()[0]
-    with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-        audio_write(
-            file.name, output, MODEL.sample_rate, strategy="loudness",
-            loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-        waveform_video = gr.make_waveform(file.name)
-    return waveform_video
-def ui(**kwargs):
     with gr.Blocks() as interface:
         gr.Markdown(
             """
@@ -73,14 +128,6 @@ def ui(**kwargs):
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
             """
         )
-        if IS_SHARED_SPACE:
-            gr.Markdown("""
-                ⚠ This Space doesn't work in this shared UI ⚠
-                <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-                <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-                to use it privately, or use the <a href="https://huggingface.co/spaces/facebook/MusicGen">public demo</a>
-                """)
         with gr.Row():
             with gr.Column():
                 with gr.Row():
@@ -99,9 +146,9 @@ def ui(**kwargs):
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 output = gr.Video(label="Generated Music")
-        submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
         gr.Examples(
-            fn=predict,
             examples=[
                 [
                     "An 80s driving pop song with heavy drums and synth pads in the background",
@@ -154,35 +201,83 @@ def ui(**kwargs):
             """
         )
-        # Show the interface
-        launch_kwargs = {}
-        username = kwargs.get('username')
-        password = kwargs.get('password')
-        server_port = kwargs.get('server_port', 0)
-        inbrowser = kwargs.get('inbrowser', False)
-        share = kwargs.get('share', False)
-        server_name = kwargs.get('listen')
-        launch_kwargs['server_name'] = server_name
-        if username and password:
-            launch_kwargs['auth'] = (username, password)
-        if server_port > 0:
-            launch_kwargs['server_port'] = server_port
-        if inbrowser:
-            launch_kwargs['inbrowser'] = inbrowser
-        if share:
-            launch_kwargs['share'] = share
         interface.queue().launch(**launch_kwargs, max_threads=1)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--listen',
         type=str,
-        default='0.0.0.0',
         help='IP to listen on for connections to Gradio',
     )
     parser.add_argument(
@@ -206,11 +301,18 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    ui(
-        username=args.username,
-        password=args.password,
-        inbrowser=args.inbrowser,
-        server_port=args.server_port,
-        share=args.share,
-        listen=args.listen
-    )

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
 import argparse
+from concurrent.futures import ProcessPoolExecutor
+import os
+import subprocess as sp
+from tempfile import NamedTemporaryFile
+import time
+import warnings
 import torch
 import gradio as gr
+from audiocraft.data.audio_utils import convert_audio
 from audiocraft.data.audio import audio_write
+from audiocraft.models import MusicGen
+# Most recently loaded MusicGen model; stays None until load_model() assigns it.
+MODEL = None  # Last used model
+# True when the SPACE_ID environment variable contains "facebook/MusicGen";
+# the entry point uses it to pick the batched UI over the full one.
+IS_BATCHED = "facebook/MusicGen" in os.environ.get('SPACE_ID', '')
+# Passed as max_batch_size to the batched UI's click handler.
+MAX_BATCH_SIZE = 12
+# Duration handed to _do_predictions for every batched request
+# (presumably in seconds -- confirm against set_generation_params).
+BATCHED_DURATION = 15
+# We have to wrap subprocess.call to quiet the logs when using gr.make_waveform.
+_old_call = sp.call
+def _call_nostderr(*args, **kwargs):
+    """Run the original ``subprocess.call`` with stdout and stderr discarded.
+
+    Any ``stdout``/``stderr`` keyword supplied by the caller is overridden
+    with ``subprocess.DEVNULL``.
+
+    Returns:
+        The value returned by the original ``subprocess.call`` (the child's
+        exit code), so callers that check it still see failures.
+    """
+    # Avoid ffmpeg flooding the logs.
+    kwargs['stderr'] = sp.DEVNULL
+    kwargs['stdout'] = sp.DEVNULL
+    return _old_call(*args, **kwargs)
+sp.call = _call_nostderr
+# Preallocating the pool of processes.
+# Three workers shared by every request; _do_predictions submits its
+# make_waveform jobs here.
+pool = ProcessPoolExecutor(3)
+# Entered by hand instead of through a `with` block. No matching __exit__ call
+# is visible in this view, so the pool presumably lives until the process exits.
+pool.__enter__()
+def make_waveform(*args, **kwargs):
+    """Call ``gr.make_waveform`` with warnings silenced and log the time taken.
+
+    All positional and keyword arguments are forwarded unchanged, and whatever
+    ``gr.make_waveform`` returns is returned as is.
+    """
+    started = time.time()
+    with warnings.catch_warnings():
+        # Further remove some warnings.
+        warnings.simplefilter('ignore')
+        video = gr.make_waveform(*args, **kwargs)
+        elapsed = time.time() - started
+        print("Make a video took", elapsed)
+    return video
+def load_model(version='melody'):
+    """Make the global ``MODEL`` hold the pretrained model named ``version``.
+
+    ``MusicGen.get_pretrained`` is only called when no model is loaded yet or
+    the loaded one has a different ``name``. The log line is printed on every
+    call, whether or not anything is actually loaded.
+    """
     global MODEL
+    print("Loading model", version)
+    already_loaded = MODEL is not None and MODEL.name == version
+    if not already_loaded:
+        MODEL = MusicGen.get_pretrained(version)
+def _do_predictions(texts, melodies, duration, **gen_kwargs):
+    """Generate one waveform video per text prompt with the global ``MODEL``.
+
+    Args:
+        texts: List of text descriptions, one per sample.
+        melodies: List parallel to ``texts``; each entry is ``None`` or a
+            ``(sample_rate, numpy_array)`` pair (presumably what the Gradio
+            audio input delivers -- confirm against the UI).
+        duration: Value passed to ``set_generation_params``; each melody is
+            also cropped to ``sample_rate * duration`` samples.
+        **gen_kwargs: Extra keyword arguments for ``set_generation_params``.
+
+    Returns:
+        A list holding the ``make_waveform`` result for each generated sample.
+    """
+    MODEL.set_generation_params(duration=duration, **gen_kwargs)
+    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
+    be = time.time()
+    processed_melodies = []
+    target_sr = 32000
+    target_ac = 1
+    for melody in melodies:
+        if melody is None:
+            processed_melodies.append(None)
+        else:
+            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
+            if melody.dim() == 1:
+                # One-dimensional audio gets a leading dimension added.
+                melody = melody[None]
+            melody = melody[..., :int(sr * duration)]
+            melody = convert_audio(melody, sr, target_sr, target_ac)
+            processed_melodies.append(melody)
+    # processed_melodies is a plain list, which has no .any(); take the melody
+    # path as soon as at least one entry is an actual melody. The identity
+    # check avoids evaluating the truth value of a tensor.
+    if any(m is not None for m in processed_melodies):
+        outputs = MODEL.generate_with_chroma(
+            descriptions=texts,
+            melody_wavs=processed_melodies,
+            melody_sample_rate=target_sr,
             progress=False
         )
     else:
+        outputs = MODEL.generate(texts, progress=False)
+    outputs = outputs.detach().cpu().float()
+    out_files = []
+    for output in outputs:
+        # delete=False leaves the .wav on disk after the handle is closed,
+        # presumably so the pool worker can still read it.
+        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+            audio_write(
+                file.name, output, MODEL.sample_rate, strategy="loudness",
+                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
+            out_files.append(pool.submit(make_waveform, file.name))
+    # Wait for every submitted make_waveform job before returning.
+    res = [out_file.result() for out_file in out_files]
+    print("batch finished", len(texts), time.time() - be)
+    return res
+def predict_batched(texts, melodies):
+    """Serve one batch of requests from the batched UI with the ``melody`` model.
+
+    Every prompt is cut to its first 512 characters before generation. The
+    list of results is wrapped in an outer list before being returned.
+    """
+    char_limit = 512
+    clipped = []
+    for prompt in texts:
+        clipped.append(prompt[:char_limit])
+    load_model('melody')
+    return [_do_predictions(clipped, melodies, BATCHED_DURATION)]
+def predict_full(model, text, melody, duration, topk, topp, temperature, cfg_coef):
+    """Generate a single sample for the full (non-batched) UI.
+
+    Args:
+        model: Model name handed to ``load_model``.
+        text: Text description of the music.
+        melody: Optional melody input, forwarded to ``_do_predictions``.
+        duration: Requested duration; must not exceed the loaded model's
+            ``lm.cfg.dataset.segment_duration``.
+        topk: Top-k sampling value; converted to ``int`` before use.
+        topp: Top-p sampling value.
+        temperature: Sampling temperature.
+        cfg_coef: Classifier free guidance coefficient.
+
+    Returns:
+        The first (and only) entry of the list returned by ``_do_predictions``.
+
+    Raises:
+        gr.Error: If ``duration`` exceeds the model's segment duration.
+    """
+    topk = int(topk)
+    load_model(model)
+    if duration > MODEL.lm.cfg.dataset.segment_duration:
+        raise gr.Error("MusicGen currently supports durations of up to 30 seconds!")
+    # These keywords are forwarded verbatim to MODEL.set_generation_params,
+    # which names this parameter `top_k` (not `topk`).
+    outs = _do_predictions(
+        [text], [melody], duration,
+        top_k=topk, top_p=topp, temperature=temperature, cfg_coef=cfg_coef)
+    return outs[0]
+def ui_full(launch_kwargs):
     with gr.Blocks() as interface:
         gr.Markdown(
             """
             presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284)
             """
         )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
                     cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
             with gr.Column():
                 output = gr.Video(label="Generated Music")
+        submit.click(predict_full, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
         gr.Examples(
+            fn=predict_full,
             examples=[
                 [
                     "An 80s driving pop song with heavy drums and synth pads in the background",
             """
         )
         interface.queue().launch(**launch_kwargs, max_threads=1)
+def ui_batched(launch_kwargs):
+    """Build and launch the batched demo UI (text prompt plus optional melody).
+
+    Args:
+        launch_kwargs: Keyword arguments forwarded to ``launch()`` on the
+            queued Blocks app.
+    """
+    with gr.Blocks() as demo:
+        gr.Markdown(
+            """
+            # MusicGen
+            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+            <br/>
+            <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+            for longer sequences, more control and no queue.</p>
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
+                    melody = gr.Audio(source="upload", type="numpy", label="Condition on a melody (optional)", interactive=True)
+                with gr.Row():
+                    submit = gr.Button("Generate")
+            with gr.Column():
+                output = gr.Video(label="Generated Music")
+        # batch=True: predict_batched receives lists of texts and melodies,
+        # at most MAX_BATCH_SIZE entries per call.
+        submit.click(predict_batched, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=MAX_BATCH_SIZE)
+        gr.Examples(
+            fn=predict_batched,
+            examples=[
+                [
+                    "An 80s driving pop song with heavy drums and synth pads in the background",
+                    "./assets/bach.mp3",
+                ],
+                [
+                    "A cheerful country song with acoustic guitars",
+                    "./assets/bolero_ravel.mp3",
+                ],
+                [
+                    "90s rock song with electric guitar and heavy drums",
+                    None,
+                ],
+                [
+                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
+                    "./assets/bach.mp3",
+                ],
+                [
+                    "lofi slow bpm electro chill with organic samples",
+                    None,
+                ],
+            ],
+            inputs=[text, melody],
+            outputs=[output]
+        )
+        # Interpolate BATCHED_DURATION so the displayed length cannot drift
+        # from the duration predict_batched actually requests.
+        gr.Markdown(f"""
+        ### More details
+        The model will generate {BATCHED_DURATION} seconds of audio based on the description you provided.
+        You can optionaly provide a reference audio from which a broad melody will be extracted.
+        The model will then try to follow both the description and melody provided.
+        All samples are generated with the `melody` model.
+        You can also use your own GPU or a Google Colab by following the instructions on our repo.
+        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+        for more details.
+        """)
+        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
         '--listen',
         type=str,
+        default='0.0.0.0' if 'SPACE_ID' in os.environ else '127.0.0.1',
         help='IP to listen on for connections to Gradio',
     )
     parser.add_argument(
     args = parser.parse_args()
+    launch_kwargs = {}
+    if args.username and args.password:
+        launch_kwargs['auth'] = (args.username, args.password)
+    if args.server_port:
+        launch_kwargs['server_port'] = args.server_port
+    if args.inbrowser:
+        launch_kwargs['inbrowser'] = args.inbrowser
+    if args.share:
+        launch_kwargs['share'] = args.share
+    # Show the interface
+    if IS_BATCHED:
+        ui_batched(launch_kwargs)
+    else:
+        ui_full(launch_kwargs)

app_batched.py DELETED Viewed

@@ -1,222 +0,0 @@
-"""
-Copyright (c) Meta Platforms, Inc. and affiliates.
-All rights reserved.
-This source code is licensed under the license found in the
-LICENSE file in the root directory of this source tree.
-"""
-import argparse
-from concurrent.futures import ProcessPoolExecutor
-import subprocess as sp
-from tempfile import NamedTemporaryFile
-import time
-import warnings
-import torch
-import gradio as gr
-from audiocraft.data.audio_utils import convert_audio
-from audiocraft.data.audio import audio_write
-from audiocraft.models import MusicGen
-MODEL = None
-_old_call = sp.call
-def _call_nostderr(*args, **kwargs):
-    # Avoid ffmpeg vomitting on the logs.
-    kwargs['stderr'] = sp.DEVNULL
-    kwargs['stdout'] = sp.DEVNULL
-    _old_call(*args, **kwargs)
-sp.call = _call_nostderr
-pool = ProcessPoolExecutor(3)
-pool.__enter__()
-def make_waveform(*args, **kwargs):
-    be = time.time()
-    with warnings.catch_warnings():
-        warnings.simplefilter('ignore')
-        out = gr.make_waveform(*args, **kwargs)
-        print("Make a video took", time.time() - be)
-        return out
-def load_model():
-    print("Loading model")
-    return MusicGen.get_pretrained("melody")
-def predict(texts, melodies):
-    global MODEL
-    if MODEL is None:
-        MODEL = load_model()
-    duration = 12
-    max_text_length = 512
-    texts = [text[:max_text_length] for text in texts]
-    MODEL.set_generation_params(duration=duration)
-    print("new batch", len(texts), texts, [None if m is None else (m[0], m[1].shape) for m in melodies])
-    be = time.time()
-    processed_melodies = []
-    target_sr = 32000
-    target_ac = 1
-    for melody in melodies:
-        if melody is None:
-            processed_melodies.append(None)
-        else:
-            sr, melody = melody[0], torch.from_numpy(melody[1]).to(MODEL.device).float().t()
-            if melody.dim() == 1:
-                melody = melody[None]
-            melody = melody[..., :int(sr * duration)]
-            melody = convert_audio(melody, sr, target_sr, target_ac)
-            processed_melodies.append(melody)
-    outputs = MODEL.generate_with_chroma(
-        descriptions=texts,
-        melody_wavs=processed_melodies,
-        melody_sample_rate=target_sr,
-        progress=False
-    )
-    outputs = outputs.detach().cpu().float()
-    out_files = []
-    for output in outputs:
-        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-            audio_write(
-                file.name, output, MODEL.sample_rate, strategy="loudness",
-                loudness_headroom_db=16, loudness_compressor=True, add_suffix=False)
-            out_files.append(pool.submit(make_waveform, file.name))
-    res = [[out_file.result() for out_file in out_files]]
-    print("batch finished", len(texts), time.time() - be)
-    return res
-def ui(**kwargs):
-    with gr.Blocks() as demo:
-        gr.Markdown(
-            """
-            # MusicGen
-            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
-            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
-            <br/>
-            <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-            for longer sequences, more control and no queue.</p>
-            """
-        )
-        with gr.Row():
-            with gr.Column():
-                with gr.Row():
-                    text = gr.Text(label="Describe your music", lines=2, interactive=True)
-                    melody = gr.Audio(source="upload", type="numpy", label="Condition on a melody (optional)", interactive=True)
-                with gr.Row():
-                    submit = gr.Button("Generate")
-            with gr.Column():
-                output = gr.Video(label="Generated Music")
-        submit.click(predict, inputs=[text, melody], outputs=[output], batch=True, max_batch_size=8)
-        gr.Examples(
-            fn=predict,
-            examples=[
-                [
-                    "An 80s driving pop song with heavy drums and synth pads in the background",
-                    "./assets/bach.mp3",
-                ],
-                [
-                    "A cheerful country song with acoustic guitars",
-                    "./assets/bolero_ravel.mp3",
-                ],
-                [
-                    "90s rock song with electric guitar and heavy drums",
-                    None,
-                ],
-                [
-                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions bpm: 130",
-                    "./assets/bach.mp3",
-                ],
-                [
-                    "lofi slow bpm electro chill with organic samples",
-                    None,
-                ],
-            ],
-            inputs=[text, melody],
-            outputs=[output]
-        )
-        gr.Markdown("""
-        ### More details
-        The model will generate 12 seconds of audio based on the description you provided.
-        You can optionaly provide a reference audio from which a broad melody will be extracted.
-        The model will then try to follow both the description and melody provided.
-        All samples are generated with the `melody` model.
-        You can also use your own GPU or a Google Colab by following the instructions on our repo.
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
-        """)
-        # Show the interface
-        launch_kwargs = {}
-        username = kwargs.get('username')
-        password = kwargs.get('password')
-        server_port = kwargs.get('server_port', 0)
-        inbrowser = kwargs.get('inbrowser', False)
-        share = kwargs.get('share', False)
-        server_name = kwargs.get('listen')
-        launch_kwargs['server_name'] = server_name
-        if username and password:
-            launch_kwargs['auth'] = (username, password)
-        if server_port > 0:
-            launch_kwargs['server_port'] = server_port
-        if inbrowser:
-            launch_kwargs['inbrowser'] = inbrowser
-        if share:
-            launch_kwargs['share'] = share
-        demo.queue(max_size=8 * 4).launch(**launch_kwargs)
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--listen',
-        type=str,
-        default='0.0.0.0',
-        help='IP to listen on for connections to Gradio',
-    )
-    parser.add_argument(
-        '--username', type=str, default='', help='Username for authentication'
-    )
-    parser.add_argument(
-        '--password', type=str, default='', help='Password for authentication'
-    )
-    parser.add_argument(
-        '--server_port',
-        type=int,
-        default=0,
-        help='Port to run the server listener on',
-    )
-    parser.add_argument(
-        '--inbrowser', action='store_true', help='Open in browser'
-    )
-    parser.add_argument(
-        '--share', action='store_true', help='Share the gradio UI'
-    )
-    args = parser.parse_args()
-    ui(
-        username=args.username,
-        password=args.password,
-        inbrowser=args.inbrowser,
-        server_port=args.server_port,
-        share=args.share,
-        listen=args.listen
-    )

audiocraft/modules/transformer.py CHANGED Viewed

@@ -25,6 +25,22 @@ from xformers import ops
 from .rope import RotaryEmbedding
 from .streaming import StreamingModule
 def _is_profiled() -> bool:
     # Return true if we are currently running with a xformers profiler activated.
@@ -75,14 +91,22 @@ def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float =
 def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
-    bs, slen, n_kv_heads, head_dim = x.shape
     if n_rep == 1:
         return x
-    return (
-        x[:, :, :, None, :]
-        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
-        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
-    )
 class LayerScale(nn.Module):
@@ -210,6 +234,7 @@ class StreamingMultiheadAttention(StreamingModule):
         # Return a causal mask, accounting for potentially stored past keys/values
         # We actually return a bias for the attention score, as this has the same
         # convention both in the builtin MHA in Pytorch, and Xformers functions.
         if self.memory_efficient:
             from xformers.ops import LowerTriangularMask
             if current_steps == 1:
@@ -222,7 +247,7 @@ class StreamingMultiheadAttention(StreamingModule):
                 return LowerTriangularMask()
         if self._streaming_state:
             past_keys = self._streaming_state['past_keys']
-            past_steps = past_keys.shape[1]
         else:
             past_steps = 0
@@ -239,6 +264,7 @@ class StreamingMultiheadAttention(StreamingModule):
             torch.full([], float('-inf'), device=device, dtype=dtype))
     def _complete_kv(self, k, v):
         if self.cross_attention:
             # With cross attention we assume all keys and values
             # are already available, and streaming is with respect
@@ -247,20 +273,20 @@ class StreamingMultiheadAttention(StreamingModule):
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
             pk = self._streaming_state['past_keys']
-            nk = torch.cat([pk, k], dim=2)
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
-                nv = torch.cat([pv, v], dim=2)
         else:
             nk = k
             nv = v
-        assert nk.shape[2] == nv.shape[2]
         offset = 0
         if self.past_context is not None:
-            offset = max(0, nk.shape[2] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
@@ -271,8 +297,9 @@ class StreamingMultiheadAttention(StreamingModule):
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
     def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
         # Apply rope embeddings to query and key tensors.
         assert self.rope is not None
         if 'past_keys' in self._streaming_state:
@@ -293,6 +320,11 @@ class StreamingMultiheadAttention(StreamingModule):
         assert not is_causal, ("new param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
         dtype = query.dtype
         if self._is_streaming:
             assert self.causal or self.cross_attention, \
@@ -325,8 +357,7 @@ class StreamingMultiheadAttention(StreamingModule):
                 if self.qk_layer_norm is True:
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
-                # q, k, v = [rearrange(x, "b t (h d) -> (b h) t d", h=self.num_heads) for x in [q, k, v]]
-                q, k, v = [rearrange(x, "b t (h d) -> b h t d", h=self.num_heads) for x in [q, k, v]]
             else:
                 if not _is_profiled():
                     # profiling breaks that property somehow.
@@ -334,7 +365,11 @@ class StreamingMultiheadAttention(StreamingModule):
                     assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
-                    packed = rearrange(projected, "b t (p h d) -> b h p t d", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
                 else:
                     embed_dim = self.embed_dim
@@ -345,18 +380,17 @@ class StreamingMultiheadAttention(StreamingModule):
                     end = start + per_head_dim * kv_heads
                     k = projected[:, :, start: end]
                     v = projected[:, :, end:]
-                    q = rearrange(q, "b t (h d) -> b t h d", h=self.num_heads)
-                    k = rearrange(k, "b t (h d) -> b t h d", h=kv_heads)
-                    v = rearrange(v, "b t (h d) -> b t h d", h=kv_heads)
                 if self.qk_layer_norm is True:
                     assert self.kv_repeat == 1
-                    q, k = [rearrange(x, "b t h d -> b t (h d)") for x in [q, k]]
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
-                    q, k = [rearrange(x, "b t (h d) -> b t h d", h=self.num_heads) for x in [q, k]]
                 if self.rope:
-                    assert False, "Not supported for now"
                     q, k = self._apply_rope(q, k)
                 k, v = self._complete_kv(k, v)
                 if self.kv_repeat > 1:
@@ -366,8 +400,11 @@ class StreamingMultiheadAttention(StreamingModule):
                 q, k, v = [x.float() for x in [q, k, v]]
             if self.memory_efficient:
                 p = self.dropout if self.training else 0
-                x = torch.nn.functional.scaled_dot_product_attention(
-                    q, k, v, is_causal=attn_mask is not None, dropout_p=p)
             else:
                 # We include the dot product as float32, for consistency
                 # with the other implementations that include that step
@@ -377,18 +414,21 @@ class StreamingMultiheadAttention(StreamingModule):
                 # extend a bit the range of operations done in float32,
                 # although this should make no difference.
                 q = q / q.shape[-1] ** 0.5
                 if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
                     with torch.autocast(device_type=q.device.type, dtype=torch.float32):
-                        pre_w = torch.einsum("bqhc,bkhc->bhqk", q, k)
                 else:
-                    pre_w = torch.einsum("bqhc,bkhc->bhqk", q, k)
                 if attn_mask is not None:
                     pre_w = pre_w + attn_mask
                 w = torch.softmax(pre_w, dim=-1)
                 w = F.dropout(w, self.dropout, training=self.training).to(v)
-                x = torch.einsum("bhqk,bkhc->bqhc", w, v)
             x = x.to(dtype)
-            x = rearrange(x, "b h t d -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
         else:
             key, value = self._complete_kv(key, value)

 from .rope import RotaryEmbedding
 from .streaming import StreamingModule
+_efficient_attention_backend: str = 'torch'
+def set_efficient_attention_backend(backend: str = 'torch'):
+    """Select the implementation used for memory efficient attention.
+
+    The choice is stored in the module-level `_efficient_attention_backend`,
+    which the helpers of this module read to decide on the tensor layout.
+
+    Args:
+        backend (str): either 'torch' or 'xformers'.
+    Raises:
+        AssertionError: if `backend` is not a supported name. The current
+            selection is left untouched in that case.
+    """
+    # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
+    global _efficient_attention_backend
+    # Validate the requested value, not the one that is currently stored.
+    assert backend in ['xformers', 'torch'], f"Unsupported attention backend: {backend!r}"
+    _efficient_attention_backend = backend
+def _get_attention_time_dimension() -> int:
+    """Return the index of the time axis of q/k/v for the active backend.
+
+    With the 'torch' backend the time axis is 2, for any other value of
+    `_efficient_attention_backend` it is 1.
+    """
+    return 2 if _efficient_attention_backend == 'torch' else 1
 def _is_profiled() -> bool:
     # Return true if we are currently running with a xformers profiler activated.
 def expand_repeated_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     """torch.repeat_interleave(x, dim=2, repeats=n_rep) from xlformers"""
+    # NOTE(review): the `dim=2` in the docstring only matches the non-torch
+    # branch below; in the 'torch' branch the heads are unpacked from axis 1.
     if n_rep == 1:
         return x
+    # Each head is repeated `n_rep` times; the axis holding the heads depends
+    # on the active efficient attention backend.
+    if _efficient_attention_backend == 'torch':
+        # Shape is unpacked as (batch, kv_heads, time, head_dim).
+        bs, n_kv_heads, slen, head_dim = x.shape
+        return (
+            x[:, :, None, :, :]
+            .expand(bs, n_kv_heads, n_rep, slen, head_dim)
+            .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
+        )
+    else:
+        # Shape is unpacked as (batch, time, kv_heads, head_dim).
+        bs, slen, n_kv_heads, head_dim = x.shape
+        return (
+            x[:, :, :, None, :]
+            .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+            .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+        )
 class LayerScale(nn.Module):
         # Return a causal mask, accounting for potentially stored past keys/values
         # We actually return a bias for the attention score, as this has the same
         # convention both in the builtin MHA in Pytorch, and Xformers functions.
+        time_dim = _get_attention_time_dimension()
         if self.memory_efficient:
             from xformers.ops import LowerTriangularMask
             if current_steps == 1:
                 return LowerTriangularMask()
         if self._streaming_state:
             past_keys = self._streaming_state['past_keys']
+            past_steps = past_keys.shape[time_dim]
         else:
             past_steps = 0
             torch.full([], float('-inf'), device=device, dtype=dtype))
     def _complete_kv(self, k, v):
+        time_dim = _get_attention_time_dimension()
         if self.cross_attention:
             # With cross attention we assume all keys and values
             # are already available, and streaming is with respect
         # Complete the key/value pair using the streaming state.
         if self._streaming_state:
             pk = self._streaming_state['past_keys']
+            nk = torch.cat([pk, k], dim=time_dim)
             if v is k:
                 nv = nk
             else:
                 pv = self._streaming_state['past_values']
+                nv = torch.cat([pv, v], dim=time_dim)
         else:
             nk = k
             nv = v
+        assert nk.shape[time_dim] == nv.shape[time_dim]
         offset = 0
         if self.past_context is not None:
+            offset = max(0, nk.shape[time_dim] - self.past_context)
         if self._is_streaming:
             self._streaming_state['past_keys'] = nk[:, offset:]
             if v is not k:
                 self._streaming_state['offset'] = torch.tensor(0)
         return nk, nv
     def _apply_rope(self, query: torch.Tensor, key: torch.Tensor):
+        # TODO: fix and verify layout.
+        assert _efficient_attention_backend == 'xformers', 'Rope not supported with torch attn.'
         # Apply rope embeddings to query and key tensors.
         assert self.rope is not None
         if 'past_keys' in self._streaming_state:
         assert not is_causal, ("new param added in torch 2.0.1 not supported, "
                                "use the causal args in the constructor.")
+        time_dim = _get_attention_time_dimension()
+        if time_dim == 2:
+            layout = "b h t d"
+        else:
+            layout = "b t h d"
         dtype = query.dtype
         if self._is_streaming:
             assert self.causal or self.cross_attention, \
                 if self.qk_layer_norm is True:
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
+                q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
             else:
                 if not _is_profiled():
                     # profiling breaks that property somehow.
                     assert value is key, "specialized implementation"
                 projected = nn.functional.linear(query, self.in_proj_weight, self.in_proj_bias)
                 if self.kv_repeat == 1:
+                    if time_dim == 2:
+                        bound_layout = "b h p t d"
+                    else:
+                        bound_layout = "b t p h d"
+                    packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
                     q, k, v = ops.unbind(packed, dim=2)
                 else:
                     embed_dim = self.embed_dim
                     end = start + per_head_dim * kv_heads
                     k = projected[:, :, start: end]
                     v = projected[:, :, end:]
+                    q = rearrange(q, f"b t (h d) -> {layout}", h=self.num_heads)
+                    k = rearrange(k, f"b t (h d) -> {layout}", h=kv_heads)
+                    v = rearrange(v, f"b t (h d) -> {layout}", h=kv_heads)
                 if self.qk_layer_norm is True:
                     assert self.kv_repeat == 1
+                    q, k = [rearrange(x, f"{layout} -> b t (h d)") for x in [q, k]]
                     q = self.q_layer_norm(q)
                     k = self.k_layer_norm(k)
+                    q, k = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k]]
                 if self.rope:
                     q, k = self._apply_rope(q, k)
                 k, v = self._complete_kv(k, v)
                 if self.kv_repeat > 1:
                 q, k, v = [x.float() for x in [q, k, v]]
             if self.memory_efficient:
                 p = self.dropout if self.training else 0
+                if _efficient_attention_backend == 'torch':
+                    x = torch.nn.functional.scaled_dot_product_attention(
+                        q, k, v, is_causal=attn_mask is not None, dropout_p=p)
+                else:
+                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
             else:
                 # We include the dot product as float32, for consistency
                 # with the other implementations that include that step
                 # extend a bit the range of operations done in float32,
                 # although this should make no difference.
                 q = q / q.shape[-1] ** 0.5
+                key_layout = layout.replace('t', 'k')
+                query_layout = layout
                 if self._is_streaming and self.safe_streaming and q.device.type == 'cuda':
                     with torch.autocast(device_type=q.device.type, dtype=torch.float32):
+                        pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
                 else:
+                    pre_w = torch.einsum(f"{query_layout},{key_layout}-> b h t k", q, k)
                 if attn_mask is not None:
                     pre_w = pre_w + attn_mask
                 w = torch.softmax(pre_w, dim=-1)
                 w = F.dropout(w, self.dropout, training=self.training).to(v)
+                # Key and value have the same format.
+                x = torch.einsum(f"b h t k, {key_layout} -> {layout}", w, v)
             x = x.to(dtype)
+            x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
             x = self.out_proj(x)
         else:
             key, value = self._complete_kv(key, value)

tests/modules/test_rope.py CHANGED Viewed

@@ -7,10 +7,11 @@
 import torch
 from audiocraft.modules.rope import RotaryEmbedding
-from audiocraft.modules.transformer import StreamingTransformer
 def test_rope():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C)
@@ -23,6 +24,7 @@ def test_rope():
 def test_rope_io_dtypes():
     B, T, H, C = 8, 75, 16, 128
     rope_32 = RotaryEmbedding(dim=C, dtype=torch.float32)
@@ -46,6 +48,7 @@ def test_rope_io_dtypes():
 def test_transformer_with_rope():
     torch.manual_seed(1234)
     for pos in ['rope', 'sin_rope']:
         tr = StreamingTransformer(
@@ -61,6 +64,7 @@ def test_transformer_with_rope():
 @torch.no_grad()
 def test_rope_streaming():
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, causal=True, dropout=0.,
@@ -88,6 +92,7 @@ def test_rope_streaming():
 @torch.no_grad()
 def test_rope_streaming_past_context():
     torch.manual_seed(1234)
     for context in [None, 10]:
@@ -117,6 +122,7 @@ def test_rope_streaming_past_context():
 def test_rope_memory_efficient():
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, custom=True, dropout=0., layer_scale=0.1,
@@ -137,6 +143,7 @@ def test_rope_memory_efficient():
 def test_rope_with_xpos():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True)
@@ -149,6 +156,7 @@ def test_rope_with_xpos():
 def test_positional_scale():
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True, scale=0.0)

 import torch
 from audiocraft.modules.rope import RotaryEmbedding
+from audiocraft.modules.transformer import StreamingTransformer, set_efficient_attention_backend
 def test_rope():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C)
 def test_rope_io_dtypes():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope_32 = RotaryEmbedding(dim=C, dtype=torch.float32)
 def test_transformer_with_rope():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     for pos in ['rope', 'sin_rope']:
         tr = StreamingTransformer(
 @torch.no_grad()
 def test_rope_streaming():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, causal=True, dropout=0.,
 @torch.no_grad()
 def test_rope_streaming_past_context():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     for context in [None, 10]:
 def test_rope_memory_efficient():
+    set_efficient_attention_backend('xformers')
     torch.manual_seed(1234)
     tr = StreamingTransformer(
         16, 4, 2, custom=True, dropout=0., layer_scale=0.1,
 def test_rope_with_xpos():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True)
 def test_positional_scale():
+    set_efficient_attention_backend('xformers')
     B, T, H, C = 8, 75, 16, 128
     rope = RotaryEmbedding(dim=C, xpos=True, scale=0.0)

tests/modules/test_transformer.py CHANGED Viewed

@@ -9,7 +9,8 @@ from itertools import product
 import pytest
 import torch
-from audiocraft.modules.transformer import StreamingMultiheadAttention, StreamingTransformer
 def test_transformer_causal_streaming():
@@ -86,19 +87,22 @@ def test_streaming_api():
 def test_memory_efficient():
     torch.manual_seed(1234)
-    tr = StreamingTransformer(
-        16, 4, 2, custom=True, dropout=0., layer_scale=0.1)
-    tr_mem_efficient = StreamingTransformer(
-        16, 4, 2, dropout=0., memory_efficient=True, layer_scale=0.1)
-    tr_mem_efficient.load_state_dict(tr.state_dict())
-    tr.eval()
-    steps = 12
-    x = torch.randn(3, steps, 16)
-    with torch.no_grad():
-        y = tr(x)
-        y2 = tr_mem_efficient(x)
-        assert torch.allclose(y, y2), (y - y2).norm()
 def test_attention_as_float32():
@@ -129,30 +133,32 @@ def test_attention_as_float32():
 @torch.no_grad()
 def test_streaming_memory_efficient():
     torch.manual_seed(1234)
-    tr = StreamingTransformer(16, 4, 2, causal=True, dropout=0., custom=True)
-    tr_mem_efficient = StreamingTransformer(
-        16, 4, 2, dropout=0., memory_efficient=True, causal=True)
-    tr.load_state_dict(tr_mem_efficient.state_dict())
-    tr.eval()
-    tr_mem_efficient.eval()
-    steps = 12
-    x = torch.randn(3, steps, 16)
-    ref = tr(x)
-    with tr_mem_efficient.streaming():
-        outs = []
-        # frame_sizes = [2] + [1] * (steps - 2)
-        frame_sizes = [1] * steps
-        for frame_size in frame_sizes:
-            frame = x[:, :frame_size]
-            x = x[:, frame_size:]
-            outs.append(tr_mem_efficient(frame))
-    out = torch.cat(outs, dim=1)
-    delta = torch.norm(out - ref) / torch.norm(out)
-    assert delta < 1e-6, delta
 def test_cross_attention():
@@ -204,7 +210,7 @@ def test_cross_attention_compat():
     y = cross_attn(queries, keys, values)[0]
     y_ref = ref_attn(queries, keys, values)[0]
-    assert torch.allclose(y, y_ref, atol=1e-7)
     # Now let's check that streaming is working properly.
     with cross_attn.streaming():

 import pytest
 import torch
+from audiocraft.modules.transformer import (
+    StreamingMultiheadAttention, StreamingTransformer, set_efficient_attention_backend)
 def test_transformer_causal_streaming():
 def test_memory_efficient():
     torch.manual_seed(1234)
+    # Compare the custom attention path against the memory efficient one,
+    # once per efficient attention backend.
+    # NOTE(review): the backend is a module-level setting and is not restored
+    # after the loop, so it keeps the last value for whatever runs next.
+    for backend in ['torch', 'xformers']:
+        set_efficient_attention_backend(backend)
+        tr = StreamingTransformer(
+            16, 4, 2, custom=True, dropout=0., layer_scale=0.1)
+        tr_mem_efficient = StreamingTransformer(
+            16, 4, 2, dropout=0., memory_efficient=True, layer_scale=0.1)
+        # Share the weights so that both models are expected to agree.
+        tr_mem_efficient.load_state_dict(tr.state_dict())
+        tr.eval()
+        steps = 12
+        x = torch.randn(3, steps, 16)
+        with torch.no_grad():
+            y = tr(x)
+            y2 = tr_mem_efficient(x)
+            # On failure, report the norm of the difference and the backend.
+            assert torch.allclose(y, y2), ((y - y2).norm(), backend)
 def test_attention_as_float32():
 @torch.no_grad()
 def test_streaming_memory_efficient():
     torch.manual_seed(1234)
+    # For each backend, feed the input one step at a time to the memory
+    # efficient model in streaming mode, and compare the concatenated result
+    # with the output of the custom model on the full sequence.
+    for backend in ['torch', 'xformers']:
+        set_efficient_attention_backend(backend)
+        tr = StreamingTransformer(16, 4, 2, causal=True, dropout=0., custom=True)
+        tr_mem_efficient = StreamingTransformer(
+            16, 4, 2, dropout=0., memory_efficient=True, causal=True)
+        # Share the weights so that both models are expected to agree.
+        tr.load_state_dict(tr_mem_efficient.state_dict())
+        tr.eval()
+        tr_mem_efficient.eval()
+        steps = 12
+        x = torch.randn(3, steps, 16)
+        ref = tr(x)
+        with tr_mem_efficient.streaming():
+            outs = []
+            # frame_sizes = [2] + [1] * (steps - 2)
+            frame_sizes = [1] * steps
+            for frame_size in frame_sizes:
+                # Take the next frame and drop it from the remaining input.
+                frame = x[:, :frame_size]
+                x = x[:, frame_size:]
+                outs.append(tr_mem_efficient(frame))
+        out = torch.cat(outs, dim=1)
+        # Relative error between the streamed output and the reference.
+        delta = torch.norm(out - ref) / torch.norm(out)
+        assert delta < 1e-6, delta
 def test_cross_attention():
     y = cross_attn(queries, keys, values)[0]
     y_ref = ref_attn(queries, keys, values)[0]
+    assert torch.allclose(y, y_ref, atol=1e-7), (y - y_ref).norm() / y_ref.norm()
     # Now let's check that streaming is working properly.
     with cross_attn.streaming():