File size: 2,785 Bytes
6b28a91 c1af806 6b28a91 a9c020a 91f1a24 6b28a91 ca1a401 6b28a91 9eb21f5 6b28a91 3aaf62a 6b28a91 8bfcdbb c298807 6b28a91 39a196c 72c85ef 39a196c 72c85ef 39a196c 72c85ef 8bbfb83 72c85ef 39a196c 72c85ef 9eb21f5 cad8f1b 39a196c 72c85ef 39a196c 6b28a91 97a0727 3aaf62a c298807 0cb7b61 4fc6482 39a196c 4fc6482 6b28a91 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import gradio as gr
import os
import shutil
import spaces
import sys
# we will clone the repo and install the dependencies
# NOTE: Still fixing bugs; not released yet, so do not try it :) !
# os.system('pip install -r qa_mdt/requirements.txt')
# os.system('pip install xformers==0.0.26.post1')
# os.system('pip install torchlibrosa==0.0.9 librosa==0.9.2')
# os.system('pip install -q pytorch_lightning==2.1.3 torchlibrosa==0.0.9 librosa==0.9.2 ftfy==6.1.1 braceexpand')
# os.system('pip install torch==2.3.0+cu121 torchvision==0.18.0+cu121 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121')
# Import qa_mdt only after the (currently commented-out) dependency
# installation steps above, then build one module-level pipeline instance
# that generate_waveform calls.
from qa_mdt.pipeline import MOSDiffusionPipeline
pipe = MOSDiffusionPipeline()
# Runs the pipeline with the user's input; the pipeline saves its output as
# 'awesome.wav' in the working directory.
@spaces.GPU(duration=120)
def generate_waveform(description):
    """Generate music for ``description`` and return it with a waveform video.

    Args:
        description: Free-text description of the music to generate.
            "high quality " is prepended before it is passed to the pipeline.

    Returns:
        A ``(waveform_video, audio_path)`` tuple, one value for each of the
        two outputs (Video, Audio) declared on the interface.

    Raises:
        gr.Error: If no audio file exists after the pipeline has run.
    """
    generated_file_path = "./awesome.wav"

    # Remove any file left over from an earlier request. Otherwise the
    # existence check below would pass even when this run produced nothing,
    # and the previous result would be returned as if it were new.
    try:
        os.remove(generated_file_path)
    except FileNotFoundError:
        pass

    pipe("high quality " + description)

    if not os.path.exists(generated_file_path):
        # Returning a single string here would not match the two declared
        # outputs; raising gr.Error surfaces the message in the UI instead.
        raise gr.Error("Failed to generate the waveform.")

    waveform_video = gr.make_waveform(
        audio=generated_file_path,
        bg_color="#000000",
        bars_color="#00FF00",
        bar_count=100,
        bar_width=1.5,
        animate=True,
    )
    return waveform_video, generated_file_path
# Markdown text passed to the Gradio interface as its `description`.
intro = """
# 🎶 OpenMusic: Diffusion That Plays Music 🎧 🎹
Welcome to **OpenMusic**, a next-gen diffusion model designed to generate high-quality music audio from text descriptions!
Simply enter a few words describing the vibe, and watch as the model generates a unique track for your input.
Powered by the QA-MDT model, based on the new research paper linked below.
- [GitHub Repo](https://github.com/ivcylc/qa-mdt) by [@changli](https://github.com/ivcylc) 🎓.
- [Paper](https://arxiv.org/pdf/2405.15863)
- [HuggingFace](https://huggingface.co/jadechoghari/qa_mdt) [@jadechoghari](https://github.com/jadechoghari) 🤗.
Note: The music generation process will take 1-2 minutes 🎶
---
"""
# Gradio UI: a single text prompt in; a waveform video and the audio file out.
iface = gr.Interface(
    fn=generate_waveform,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter a music description here...",
    ),
    outputs=[
        gr.Video(label="Watch the Waveform 🎼"),
        gr.Audio(label="Download the Music 🎶"),
    ],
    description=intro,
    examples=[
        ["A modern synthesizer creating futuristic soundscapes."],
        ["Acoustic ballad with heartfelt lyrics and soft piano."],
    ],
    cache_examples=True,
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|