# external imports import gc import logging as log import time import uuid import gradio as gr # local imports from blip_image_caption_large import Blip_Image_Caption_Large from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct from musicgen_small import Musicgen_Small import config log.basicConfig(level=log.INFO) class Image_To_Music: def __init__(self, use_local_caption=False, use_local_llm=False, use_local_musicgen=False): self.use_local_llm = use_local_llm self.use_local_caption = use_local_caption self.use_local_musicgen = use_local_musicgen self.image_path = None self.generated_caption = None self.generated_description = None self.audio_path = config.AUDIO_DIR + str(uuid.uuid4()) + ".wav" self.caption_generation_duration = -1 self.description_generation_duration = -1 self.music_generation_duration = -1 def caption_image(self, image_path): log.info("Captioning Image...") caption_start_time = time.time() # load model self.image_caption_model = Blip_Image_Caption_Large() self.image_path = image_path self.generated_caption = self.image_caption_model.caption_image(self.image_path, self.use_local_caption) # delete model to free up ram del self.image_caption_model gc.collect() self.caption_generation_duration = time.time() - caption_start_time log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption} - used local model: {self.use_local_caption}") return self.generated_caption def generate_description(self): log.info("Generating Music Description...") description_start_time = time.time() # load model self.text_generation_model = Phi3_Mini_4k_Instruct() messages = [ {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."}, {"role": "user", "content": self.generated_caption}, ] self.generated_description = self.text_generation_model.generate_text(messages, self.use_local_llm) # delete model to free up ram del self.text_generation_model gc.collect() self.description_generation_duration = time.time() - description_start_time log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description} - used local model: {self.use_local_llm}") return self.generated_description def generate_music(self): log.info("Generating Music...") music_start_time = time.time() # load model self.music_generation_model = Musicgen_Small() self.music_generation_model.generate_music(self.generated_description, self.audio_path, self.use_local_musicgen) # delete model to free up ram del self.music_generation_model gc.collect() self.music_generation_duration = time.time() - music_start_time log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path} - used local model: {self.use_local_musicgen}") return self.audio_path def get_durations(self): return f"Caption Generation Time: {self.caption_generation_duration:.2f} seconds\nDescription Generation Time: {self.description_generation_duration:.2f} seconds\nMusic Generation Time: {self.music_generation_duration:.2f} seconds\nTotal Time: {self.caption_generation_duration + self.description_generation_duration + self.music_generation_duration:.2f} seconds" def run_yield(self, image_path): self.caption_image(image_path) yield [self.generated_caption, None, None, None] self.generate_description() yield [self.generated_caption, self.generated_description, None, None] self.generate_music() yield [self.generated_caption, self.generated_description, self.audio_path, None] return [self.generated_caption, self.generated_description, self.audio_path,self.get_durations()] def run(self, image_path): self.caption_image(image_path) self.generate_description() self.generate_music() return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()] def run_image_to_music(image_path, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, use_local_caption, use_local_llm, use_local_musicgen): config.LLM_MAX_NEW_TOKENS = llm_max_new_tokens config.LLM_TEMPERATURE = llm_temperature config.LLM_TOP_P = llm_top_p config.MUSICGEN_MAX_NEW_TOKENS = musicgen_max_seconds * 51 itm = Image_To_Music(use_local_caption=use_local_caption, use_local_llm=use_local_llm, use_local_musicgen=use_local_musicgen) return itm.run(image_path) # Gradio UI def gradio(): # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface) with gr.Blocks() as demo: gr.Markdown("
# EDITS: added interactive flags
# Advanced options with sliders
with gr.Accordion("Advanced Options", open=False):
gr.Markdown("LLM Settings
")
llm_max_new_tokens = gr.Slider(1, 200, value=50, step=1, label="LLM Max Tokens", interactive=True)
llm_temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.01, label="LLM Temperature", interactive=True)
llm_top_p = gr.Slider(0.01, 0.99, value=0.95, step=0.01, label="LLM Top P", interactive=True)
gr.Markdown("Music Generation Settings
")
musicgen_max_seconds = gr.Slider(1, 30, value=5, step=1, label="MusicGen Duration in Seconds (local model only)", interactive=True)
# -----ATTRIBUTION-END-----
with gr.Row():
caption_output = gr.Textbox(label="Image Caption")
music_description_output = gr.Textbox(label="Music Description")
durations = gr.Textbox(label="Processing Times", interactive=False, placeholder="Time statistics will appear here")
music_output = gr.Audio(label="Generated Music")
# Button to trigger the process
generate_button = gr.Button("Generate Music")
generate_button.click(fn=run_image_to_music, inputs=[image_input, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, local_captioning, local_llm, local_music_gen], outputs=[caption_output, music_description_output, music_output, durations])
# Launch Gradio app
demo.launch()
gradio()