import gradio as gr
import numpy as np
from transformers import AutoProcessor, MusicgenForConditionalGeneration

title = "Musicalization System of Painting Demo"
description = "Pui Ching Middle School: Musicalization System of Painting Demo"

# Load the MusicGen processor and model once at startup rather than on every
# request; reloading them inside generate_music would add several seconds of
# latency to each call.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")


def generate_music(text):
    # NOTE: The original code attempted an image-tagging step with Tag2Text
    # here, but it referenced undefined names (tag2text, inference_tag2text,
    # image_size, device, image, tagging_model, specified_tags) and never used
    # the result. It is kept below, commented out, for reference; wiring it up
    # requires the Tag2Text / recognize-anything project, its checkpoint, and
    # an image input.
    #
    # tag2text_checkpoint = "./tag2text_swin_14m.pth"
    # tag2text_model = tag2text(pretrained=tag2text_checkpoint,
    #                           image_size=image_size, vit='swin_b').eval().to(device)
    # res = inference_tag2text(image, tagging_model, specified_tags)

    # Tokenize the text prompt for MusicGen.
    inputs = processor(
        text=[text],
        padding=True,
        return_tensors="pt",
    )

    # Generate roughly 5 seconds of audio (256 new tokens at MusicGen's
    # ~50 Hz frame rate).
    audio_values = model.generate(**inputs, max_new_tokens=256)
    sampling_rate = model.audio_encoder.config.sampling_rate

    # Convert the float waveform in [-1, 1] to 16-bit PCM, the format
    # gr.Audio expects when given a (sample_rate, ndarray) tuple.
    target_dtype = np.int16
    max_range = np.iinfo(target_dtype).max
    audio_values = audio_values[0, 0].numpy()
    return sampling_rate, (audio_values * max_range).astype(target_dtype)


iface = gr.Interface(
    fn=generate_music,
    title=title,
    description=description,
    inputs=gr.Text(label="Content"),
    outputs=gr.Audio(label="Generated Music"),
)

iface.launch()
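
# A minimal sketch of exercising generate_music directly, without the Gradio
# UI (assumes scipy is installed; the prompt string is just an example).
# Run this instead of iface.launch() above.
#
# from scipy.io import wavfile
#
# rate, audio = generate_music("calm piano over a rainy landscape painting")
# wavfile.write("generated.wav", rate, audio)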