File size: 4,504 Bytes
606a45c b28e9a1 606a45c b28e9a1 606a45c b28e9a1 d7b70fe b28e9a1 606a45c b28e9a1 606a45c 9200bb1 606a45c 9200bb1 606a45c b28e9a1 606a45c b28e9a1 606a45c b28e9a1 606a45c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
import os

import gradio as gr
from gradio_client import Client
from pydub import AudioSegment

# Access token passed to the Hugging Face Space clients below; None when
# the HF_TOKEN environment variable is unset.
hf_token = os.environ.get('HF_TOKEN')

# LP-Music-Caps Space, loaded as a callable Gradio app (used for captioning).
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")

# Llama API Space: turns the music caption into an image description.
client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)

# Zeroscope Space: generates a video from a text prompt.
zrscp_client = Client("https://fffiloni-zeroscope--76p9h.hf.space/", hf_token=hf_token)
def cut_audio(input_path, output_path, max_duration=30000):
    """Export the audio at ``input_path`` as an MP3 of bounded length.

    Args:
        input_path: Audio file to read; handed to ``AudioSegment.from_file``.
        output_path: Where the MP3 export is written.
        max_duration: Longest length to keep, compared against
            ``len(segment)`` (milliseconds for pydub segments, so the
            default keeps the first 30 seconds).

    Returns:
        ``output_path``, unchanged.
    """
    segment = AudioSegment.from_file(input_path)
    exceeds_limit = len(segment) > max_duration
    # Only slice when the clip is actually longer than the limit.
    clipped = segment[:max_duration] if exceeds_limit else segment
    clipped.export(output_path, format="mp3")
    return output_path
def solo_zrscp(prompt):
    """Run only the Zeroscope step for an existing text prompt.

    Args:
        prompt: Text sent to the ``/zrscp`` endpoint of ``zrscp_client``.

    Returns:
        Whatever ``zrscp_client.predict`` returns for that prompt.
    """
    return zrscp_client.predict(prompt, api_name="/zrscp")
def infer(audio_file):
    """Turn a music file into a Zeroscope video.

    Pipeline: truncate the audio with ``cut_audio``, caption it through
    ``lpmc_client``, ask the Llama2 ``client`` for one image description
    of that caption, then send the description to ``zrscp_client``.

    Args:
        audio_file: Path of the audio file to process.

    Returns:
        A 3-tuple ``(res_vid, result, update)``: the Zeroscope result, the
        Llama2 image description, and ``gr.update(visible=True)`` for the
        third output component.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Function-scope import: the only name this block needs beyond the
    # file's existing imports.
    import tempfile

    if not audio_file:
        # Without this guard a missing upload fails deep inside pydub with
        # an error that means nothing to the user.
        raise gr.Error("Please upload an audio file first.")

    # Write the truncated clip into a private temporary directory. A fixed
    # path in the working directory would be shared by every call (so
    # overlapping calls could overwrite each other's audio) and requires
    # the working directory to be writable.
    with tempfile.TemporaryDirectory() as work_dir:
        truncated_audio = cut_audio(
            audio_file, os.path.join(work_dir, "trunc_audio.mp3")
        )
        cap_result = lpmc_client(
            truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
            api_name="predict"
        )
    print(cap_result)

    # The prompt text is runtime input to the model and is kept verbatim.
    llama_q = f"""
I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
Do not processs each segment or song, but provide a summary for the whole instead.
Answer with only one image description. Never do lists. Maximum 77 tokens.
Here's the music description :
{cap_result}
"""
    result = client.predict(
        llama_q,  # str in 'Message' Textbox component
        api_name="/predict"
    )
    print(f"Llama2 result: {result}")

    res_vid = zrscp_client.predict(
        result,
        api_name="/zrscp"
    )
    print("Finished")

    return res_vid, result, gr.update(visible=True)
# Custom CSS passed to gr.Blocks: caps the element with id "col-container"
# at 510px wide and centres it horizontally.
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""
# Page layout and event wiring for the demo.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Header markup is kept verbatim.
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
Music To Zeroscope Video
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Sends an audio into <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
to generate a audio caption which is then translated to an illustrative image description with Llama2, and finally run through
Zeroscope to generate a 3s video from the audio ! <br /><br />
Note: Only the first 30 seconds of your audio will be used for inference.
</p>
</div>""")
        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        # The pipeline outputs a video, so the labels say "Video", not "Image".
        infer_btn = gr.Button("Generate Video from Music")
        # Hidden textbox holding the Llama2 description; it is the input of
        # the "Try again ?" button below.
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        vid_result = gr.Video(label="Video Result")
        # Starts hidden; infer() returns gr.update(visible=True) for it.
        tryagain_btn = gr.Button("Try again ?", visible=False)

        gr.Examples(
            examples=[
                ["./examples/electronic.mp3"],
                ["./examples/folk.wav"],
                ["./examples/orchestra.wav"],
            ],
            fn=infer,
            inputs=[audio_input],
            outputs=[vid_result, llama_trans_cap, tryagain_btn],
            cache_examples=True,
        )

    # Full pipeline: audio -> caption -> image description -> video.
    infer_btn.click(
        fn=infer,
        inputs=[audio_input],
        outputs=[vid_result, llama_trans_cap, tryagain_btn],
    )
    # Re-run only the video step with the stored description.
    tryagain_btn.click(fn=solo_zrscp, inputs=[llama_trans_cap], outputs=[vid_result])

demo.queue(max_size=20).launch()