"""Music To Image demo.

Pipeline: an uploaded audio file is truncated, captioned by the
LP-Music-Caps Space, turned into an image description by a Llama 2 API
Space, and finally rendered with Stable Diffusion XL.
"""

import os

import gradio as gr
import torch
from diffusers import DiffusionPipeline
from gradio_client import Client
from pydub import AudioSegment

# Token read from the environment and handed to the Llama API client below.
hf_token = os.environ.get('HF_TOKEN')

# Remote Space that produces a text caption for a piece of music.
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")

# Remote Space that exposes the Llama 2 text endpoint used in `infer`.
client = Client("https://fffiloni-test-llama-api.hf.space/", api_key=hf_token)

# Stable Diffusion XL, loaded once at import time in half precision.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
)
pipe.to("cuda")
#pipe.enable_model_cpu_offload()

# if using torch < 2.0
# pipe.enable_xformers_memory_efficient_attention()


def cut_audio(input_path: str, output_path: str, max_duration: int = 30000) -> str:
    """Export *input_path* to *output_path* as MP3, truncated if too long.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the (possibly truncated) MP3 is written to.
        max_duration: Maximum length to keep, in pydub length units
            (milliseconds); the default keeps the first 30 seconds.

    Returns:
        ``output_path``, so the call can be used inline.
    """
    audio = AudioSegment.from_file(input_path)

    if len(audio) > max_duration:
        audio = audio[:max_duration]

    audio.export(output_path, format="mp3")

    return output_path


def solo_xd(prompt: str):
    """Generate a single image from *prompt* with the SDXL pipeline.

    Used by the "Try again ?" button to re-roll an image from the
    image description already produced by `infer`.
    """
    image = pipe(prompt=prompt).images[0]
    return image


def infer(audio_file: str):
    """Run the full audio -> caption -> image description -> image chain.

    Args:
        audio_file: Filesystem path of the uploaded audio (the Audio
            component is configured with ``type="filepath"``).

    Returns:
        A 3-tuple matching the click handler's outputs: the generated
        image, the Llama 2 image description, and a ``gr.update`` that
        makes the "Try again ?" button visible.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Without this guard a click with no upload passes None to pydub and
    # fails with an opaque traceback instead of a message in the UI.
    if not audio_file:
        raise gr.Error("Please upload an audio file first.")

    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")

    cap_result = lpmc_client(
        truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
        api_name="predict"
    )
    print(cap_result)

    #summarize_q = f"""
    #I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance.
    #Do not processs each segment, but provide a summary for the whole instead.
    #Here's the list:
    #{cap_result}
    #"""
    #summary_result = client.predict(
    #    summarize_q,  # str in 'Message' Textbox component
    #    api_name="/chat_1"
    #)
    #print(f"SUMMARY: {summary_result}")

    # NOTE(review): the two `<>` markers look like the positions where the
    # Llama 2 chat template would carry `<<SYS>>` / `<</SYS>>` tags; the
    # text is reproduced as found -- confirm against the intended prompt.
    llama_q = (
        " [INST] <>\n I'll give you music description, then i want you to provide an "
        "illustrative image description that would fit well with the music. "
        "Answer with only one image description. Never do lists. "
        "Do not processs each segment, but provide a summary for the whole instead. \n"
        "Here's the music description : \n<>\n\n{} [/INST] "
    )

    prompt = llama_q.format(cap_result)

    result = client.predict(
        prompt,  # str in 'Message' Textbox component
        api_name="/predict"
    )
    print(f"Llama2 result: {result}")

    image = pipe(prompt=result).images[0]
    print("Finished")

    #return cap_result, result, images
    return image, result, gr.update(visible=True)


css = " #col-container {max-width: 510px; margin-left: auto; margin-right: auto;} "

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""

Music To Image

Sends an audio into LP-Music-Caps to generate a audio caption which is then translated to an illustrative image description with Llama2, and finally run through Stable Diffusion XL to generate an image from the audio !

Note: Only the first 30 seconds of your audio will be used for inference.

""")
        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        infer_btn = gr.Button("Generate Image from Music")
        #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
        # Hidden textbox: holds the Llama description so "Try again ?" can reuse it.
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        img_result = gr.Image(label="Image Result")
        # Revealed by `infer` once a first image has been produced.
        tryagain_btn = gr.Button("Try again ?", visible=False)

    #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[img_result, llama_trans_cap, tryagain_btn])
    tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])

demo.queue(max_size=20).launch()