Spaces:

Fabrice-TIERCELIN
/

Text-to-Music

Running

App Files Files Community

Ashley Kleynhans commited on Jun 11, 2023

Commit

8e10a53

unverified ·

1 Parent(s): 10b60b3

Allow gradio command line arguments to be specified (#50)

Browse files

Files changed (1) hide show

app.py +143 -83

app.py CHANGED Viewed

@@ -7,10 +7,10 @@ LICENSE file in the root directory of this source tree.
 """
 from tempfile import NamedTemporaryFile
 import torch
 import gradio as gr
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
@@ -61,90 +61,150 @@ def predict(model, text, melody, duration, topk, topp, temperature, cfg_coef):
     return waveform_video
-with gr.Blocks() as demo:
-    gr.Markdown(
-        """
-        # MusicGen
-        This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
-        presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
-        <br/>
-        <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
-        <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
-        for longer sequences, more control and no queue.</p>
-        """
     )
-    with gr.Row():
-        with gr.Column():
-            with gr.Row():
-                text = gr.Text(label="Input Text", interactive=True)
-                melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
-            with gr.Row():
-                submit = gr.Button("Submit")
-            with gr.Row():
-                model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
-            with gr.Row():
-                duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True)
-            with gr.Row():
-                topk = gr.Number(label="Top-k", value=250, interactive=True)
-                topp = gr.Number(label="Top-p", value=0, interactive=True)
-                temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
-                cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
-        with gr.Column():
-            output = gr.Video(label="Generated Music")
-    submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
-    gr.Examples(
-        fn=predict,
-        examples=[
-            [
-                "An 80s driving pop song with heavy drums and synth pads in the background",
-                "./assets/bach.mp3",
-                "melody"
-            ],
-            [
-                "A cheerful country song with acoustic guitars",
-                "./assets/bolero_ravel.mp3",
-                "melody"
-            ],
-            [
-                "90s rock song with electric guitar and heavy drums",
-                None,
-                "medium"
-            ],
-            [
-                "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
-                "./assets/bach.mp3",
-                "melody"
-            ],
-            [
-                "lofi slow bpm electro chill with organic samples",
-                None,
-                "medium",
-            ],
-        ],
-        inputs=[text, melody, model],
-        outputs=[output]
     )
-    gr.Markdown(
-        """
-        ### More details
-        The model will generate a short music extract based on the description you provided.
-        You can generate up to 30 seconds of audio.
-        We present 4 model variations:
-        1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
-        2. Small -- a 300M transformer decoder conditioned on text only.
-        3. Medium -- a 1.5B transformer decoder conditioned on text only.
-        4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
-        When using `melody`, ou can optionaly provide a reference audio from
-        which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
-        You can also use your own GPU or a Google Colab by following the instructions on our repo.
-        See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
-        for more details.
-        """
     )
-demo.launch()

 """
 from tempfile import NamedTemporaryFile
+import argparse
 import torch
 import gradio as gr
 from audiocraft.models import MusicGen
 from audiocraft.data.audio import audio_write
     return waveform_video
+def ui(**kwargs):
+    with gr.Blocks() as interface:
+        gr.Markdown(
+            """
+            # MusicGen
+            This is the demo for [MusicGen](https://github.com/facebookresearch/audiocraft), a simple and controllable model for music generation
+            presented at: ["Simple and Controllable Music Generation"](https://huggingface.co/papers/2306.05284).
+            <br/>
+            <a href="https://huggingface.co/spaces/musicgen/MusicGen?duplicate=true" style="display: inline-block;margin-top: .5em;margin-right: .25em;" target="_blank">
+            <img style="margin-bottom: 0em;display: inline;margin-top: -.25em;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+            for longer sequences, more control and no queue.</p>
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    text = gr.Text(label="Input Text", interactive=True)
+                    melody = gr.Audio(source="upload", type="numpy", label="Melody Condition (optional)", interactive=True)
+                with gr.Row():
+                    submit = gr.Button("Submit")
+                with gr.Row():
+                    model = gr.Radio(["melody", "medium", "small", "large"], label="Model", value="melody", interactive=True)
+                with gr.Row():
+                    duration = gr.Slider(minimum=1, maximum=30, value=10, label="Duration", interactive=True)
+                with gr.Row():
+                    topk = gr.Number(label="Top-k", value=250, interactive=True)
+                    topp = gr.Number(label="Top-p", value=0, interactive=True)
+                    temperature = gr.Number(label="Temperature", value=1.0, interactive=True)
+                    cfg_coef = gr.Number(label="Classifier Free Guidance", value=3.0, interactive=True)
+            with gr.Column():
+                output = gr.Video(label="Generated Music")
+        submit.click(predict, inputs=[model, text, melody, duration, topk, topp, temperature, cfg_coef], outputs=[output])
+        gr.Examples(
+            fn=predict,
+            examples=[
+                [
+                    "An 80s driving pop song with heavy drums and synth pads in the background",
+                    "./assets/bach.mp3",
+                    "melody"
+                ],
+                [
+                    "A cheerful country song with acoustic guitars",
+                    "./assets/bolero_ravel.mp3",
+                    "melody"
+                ],
+                [
+                    "90s rock song with electric guitar and heavy drums",
+                    None,
+                    "medium"
+                ],
+                [
+                    "a light and cheerly EDM track, with syncopated drums, aery pads, and strong emotions",
+                    "./assets/bach.mp3",
+                    "melody"
+                ],
+                [
+                    "lofi slow bpm electro chill with organic samples",
+                    None,
+                    "medium",
+                ],
+            ],
+            inputs=[text, melody, model],
+            outputs=[output]
+        )
+        gr.Markdown(
+            """
+            ### More details
+            The model will generate a short music extract based on the description you provided.
+            You can generate up to 30 seconds of audio.
+            We present 4 model variations:
+            1. Melody -- a music generation model capable of generating music condition on text and melody inputs. **Note**, you can also use text only.
+            2. Small -- a 300M transformer decoder conditioned on text only.
+            3. Medium -- a 1.5B transformer decoder conditioned on text only.
+            4. Large -- a 3.3B transformer decoder conditioned on text only (might OOM for the longest sequences.)
+            When using `melody`, ou can optionaly provide a reference audio from
+            which a broad melody will be extracted. The model will then try to follow both the description and melody provided.
+            You can also use your own GPU or a Google Colab by following the instructions on our repo.
+            See [github.com/facebookresearch/audiocraft](https://github.com/facebookresearch/audiocraft)
+            for more details.
+            """
+        )
+        # Show the interface
+        launch_kwargs = {}
+        username = kwargs.get('username')
+        password = kwargs.get('password')
+        server_port = kwargs.get('server_port', 0)
+        inbrowser = kwargs.get('inbrowser', False)
+        share = kwargs.get('share', False)
+        server_name = kwargs.get('listen')
+        launch_kwargs['server_name'] = server_name
+        if username and password:
+            launch_kwargs['auth'] = (username, password)
+        if server_port > 0:
+            launch_kwargs['server_port'] = server_port
+        if inbrowser:
+            launch_kwargs['inbrowser'] = inbrowser
+        if share:
+            launch_kwargs['share'] = share
+        interface.launch(**launch_kwargs)
+if __name__ == "__main__":
+    # torch.cuda.set_per_process_memory_fraction(0.48)
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--listen',
+        type=str,
+        default='127.0.0.1',
+        help='IP to listen on for connections to Gradio',
     )
+    parser.add_argument(
+        '--username', type=str, default='', help='Username for authentication'
+    )
+    parser.add_argument(
+        '--password', type=str, default='', help='Password for authentication'
     )
+    parser.add_argument(
+        '--server_port',
+        type=int,
+        default=0,
+        help='Port to run the server listener on',
     )
+    parser.add_argument(
+        '--inbrowser', action='store_true', help='Open in browser'
+    )
+    parser.add_argument(
+        '--share', action='store_true', help='Share the gradio UI'
+    )
+    args = parser.parse_args()
+    ui(
+        username=args.username,
+        password=args.password,
+        inbrowser=args.inbrowser,
+        server_port=args.server_port,
+        share=args.share,
+        listen=args.listen
+    )