import gradio as gr
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import numpy as np
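
# Note: MusicGen needs a PyTorch backend and a recent transformers release
# (4.31 or newer, where MusicGen support was introduced).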

title = "Musicalization System of Painting Demo"
description = "Pui Ching Middle School: Musicalization System of Painting Demo"

# Load the MusicGen processor and model once at import time so each request
# does not reload the checkpoint.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")


def generate_music(text):
    # An earlier draft built a Tag2Text image-tagging model here to turn a
    # painting into tags before generating music, but it referenced names that
    # are never defined (image_size, device, image, tagging_model,
    # specified_tags) and its result was unused, so the tagging stage is
    # disabled in this text-only demo:
    #   tag2text_checkpoint = "./tag2text_swin_14m.pth"
    #   tag2text_model = tag2text(pretrained=tag2text_checkpoint, image_size=image_size, vit='swin_b').eval().to(device)
    #   res = inference_tag2text(image, tagging_model, specified_tags)
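
    # A possible wiring for the full painting-to-music pipeline, kept as a
    # sketch only: it assumes the Tag2Text API from the recognize-anything
    # repo, and the image size, the "None" tag argument, and the `painting`
    # input are illustrative assumptions, not tested here.
    #
    #   from ram.models import tag2text
    #   from ram import inference_tag2text
    #   tagging_model = tag2text(pretrained="./tag2text_swin_14m.pth",
    #                            image_size=384, vit="swin_b").eval()
    #   tags, _, caption = inference_tag2text(painting, tagging_model, "None")
    #   text = f"{caption}, {tags}"  # would replace the user-typed prompt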

    # Tokenize the text prompt for MusicGen.
    inputs = processor(
        text=[text],
        padding=True,
        return_tensors="pt",
    )

    # 256 new tokens is roughly 5 seconds of audio at MusicGen's ~50
    # tokens-per-second frame rate.
    audio_values = model.generate(**inputs, max_new_tokens=256)
    sampling_rate = model.audio_encoder.config.sampling_rate

    # Gradio's Audio component accepts a (sampling_rate, waveform) tuple;
    # scale the float waveform in [-1, 1] to the 16-bit integer range.
    target_dtype = np.int16
    max_range = np.iinfo(target_dtype).max
    audio_values = audio_values[0, 0].cpu().numpy()
    return sampling_rate, (audio_values * max_range).astype(target_dtype)


iface = gr.Interface(
    fn=generate_music,
    title=title,
    description=description,
    inputs=gr.Text(label="Content"),
    outputs=gr.Audio(label="Generated Music"),
)
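
# launch() blocks and serves the demo locally (on port 7860 by default);
# passing share=True would also create a temporary public link.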
iface.launch()