import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, MusicgenForConditionalGeneration

from ram import get_transform, inference_tag2text
from ram.models import tag2text

title = "Musicalization System of Painting Demo"
description = "Pui Ching Middle School: Musicalization System of Painting Demo"

image_size = 384
device = "cuda" if torch.cuda.is_available() else "cpu"

# Disable autograd globally: this demo only runs inference. A bare
# `torch.no_grad()` statement is a no-op (it must be used as a context
# manager or decorator), so use set_grad_enabled instead.
torch.set_grad_enabled(False)

# Tag2Text image-tagging/captioning model (weights from the
# recognize-anything repo).
transform = get_transform(image_size=image_size)
tag2text_model = (
    tag2text(pretrained="tag2text_swin_14m.pth", image_size=image_size, vit="swin_b")
    .eval()
    .to(device)
)

# MusicGen text-to-music model. Load once at startup rather than inside the
# handler, so each request only pays for inference.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")


def generate_music(raw_image, audio_length):
    # Caption the painting with Tag2Text.
    raw_image = Image.fromarray(raw_image)
    image = transform(raw_image).unsqueeze(0).to(device)
    res = inference_tag2text(image, tag2text_model)
    # Tag2Text separates tags with double spaces; join them with commas.
    # (Currently unused; the caption alone drives the music prompt.)
    tags = res[0].strip(" ").replace("  ", ", ")
    caption = res[2]
    print(caption)

    # Use the caption as the text prompt for MusicGen.
    inputs = processor(
        text=[caption],
        padding=True,
        return_tensors="pt",
    )

    # MusicGen emits `frame_rate` tokens per second of audio, so the requested
    # duration in seconds maps directly to max_new_tokens.
    sampling_rate = model.audio_encoder.config.sampling_rate
    frame_rate = model.audio_encoder.config.frame_rate
    max_new_tokens = int(frame_rate * audio_length)
    audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Convert the float waveform in [-1, 1] to 16-bit PCM for gr.Audio.
    target_dtype = np.int16
    max_range = np.iinfo(target_dtype).max
    audio_values = audio_values[0, 0].cpu().numpy()
    return sampling_rate, (audio_values * max_range).astype(target_dtype)


iface = gr.Interface(
    fn=generate_music,
    title=title,
    description=description,
    inputs=[
        gr.Image(label="Painting"),
        gr.Slider(5, 30, value=15, step=1, label="Audio length (sec)"),
    ],
    outputs=gr.Audio(label="Generated Music"),
)
iface.launch()