import os

import gradio as gr
from gradio_client import Client
from pydub import AudioSegment

hf_token = os.environ.get('HF_TOKEN')

# Remote Spaces used by the pipeline: LP-Music-Caps for audio captioning,
# a Llama 2 Space for prompt rewriting, and Zeroscope for text-to-video.
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
zrscp_client = Client("https://fffiloni-zeroscope--76p9h.hf.space/", hf_token=hf_token)

def cut_audio(input_path, output_path, max_duration=30000):
    """Truncate the audio to at most max_duration milliseconds and export it as mp3."""
    audio = AudioSegment.from_file(input_path)

    if len(audio) > max_duration:
        audio = audio[:max_duration]

    audio.export(output_path, format="mp3")

    return output_path
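
# Example (illustrative filenames): cut_audio("song.wav", "trunc_audio.mp3")
# returns "trunc_audio.mp3" containing at most the first 30 seconds of song.wav.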

def solo_zrscp(prompt):
    """Re-run only the Zeroscope text-to-video step with an existing prompt."""
    res_vid = zrscp_client.predict(
        prompt,
        api_name="/zrscp"
    )
    return res_vid
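
# solo_zrscp is wired to the "Try again?" button below, letting the user sample
# a new video from the same Llama 2 prompt without re-captioning the audio.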

def infer(audio_file):
    # Keep only the first 30 seconds of the upload, as noted in the UI.
    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")

    # Caption the truncated audio with LP-Music-Caps.
    cap_result = lpmc_client(
        truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
        api_name="predict"
    )
    print(cap_result)

    # Optional summarization pass, currently disabled:
    #summarize_q = f"""
    #I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance.
    #Do not process each segment, but provide a summary for the whole instead.

    #Here's the list:

    #{cap_result}
    #"""

    #summary_result = client.predict(
    #    summarize_q,  # str in 'Message' Textbox component
    #    api_name="/chat_1"
    #)

    #print(f"SUMMARY: {summary_result}")

    # Ask Llama 2 to turn the music caption into a single image description.
    llama_q = f"""
    I'll give you a music description, from which I want you to provide an illustrative image description that would fit well with the music.
    Do not process each segment or song, but provide a summary for the whole instead.
    Answer with only one image description. Never do lists. Maximum 77 tokens.

    Here's the music description:

    {cap_result}
    """

    result = client.predict(
        llama_q,  # str in 'Message' Textbox component
        api_name="/predict"
    )

    print(f"Llama2 result: {result}")

    # Generate a short video from the Llama 2 prompt with Zeroscope.
    res_vid = zrscp_client.predict(
        result,
        api_name="/zrscp"
    )

    print("Finished")

    #return cap_result, result, images
    return res_vid, result, gr.update(visible=True)
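
# A minimal sketch of exercising the full pipeline without the UI (assumes
# HF_TOKEN is set and the example file exists; names are illustrative):
#   video_path, prompt, _ = infer("./examples/electronic.mp3")
#   print(video_path, prompt)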

css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
                <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
                >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Music To Zeroscope Video
                </h1>
                </div>
                <p style="margin-bottom: 10px; font-size: 94%">
                Sends an audio into <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
                to generate a audio caption which is then translated to an illustrative image description with Llama2, and finally run through 
                Zeroscope to generate a 3s video from the audio ! <br /><br />
                Note: Only the first 30 seconds of your audio will be used for inference.
                </p>
            </div>""")
        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        infer_btn = gr.Button("Generate Video from Music")
        #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        vid_result = gr.Video(label="Video Result")
        tryagain_btn = gr.Button("Try again?", visible=False)
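
        # Note on the Examples block below: cache_examples=True runs infer once
        # per example at build time and serves the cached outputs afterwards.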

        gr.Examples(examples=[["./examples/electronic.mp3"],["./examples/folk.wav"], ["./examples/orchestra.wav"]],
                    fn=infer,
                    inputs=[audio_input],
                    outputs=[vid_result, llama_trans_cap, tryagain_btn],
                    cache_examples=True
                   )

    #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[vid_result, llama_trans_cap, tryagain_btn])
    tryagain_btn.click(fn=solo_zrscp, inputs=[llama_trans_cap], outputs=[vid_result])

# Bound the request queue at 20 pending jobs, then start the app.
demo.queue(max_size=20).launch()