Update README.md
Browse files
README.md
CHANGED
@@ -7,82 +7,4 @@ import numpy as np
|
|
7 |
import pandas as pd
|
8 |
from huggingface_hub import hf_hub_download
|
9 |
from model.bart import BartCaptionModel
|
10 |
-
from utils.audio_utils import load_audio, STR_CH_FIRST
|
11 |
-
|
12 |
-
# ---------------------------------------------------------------------------
# One-time setup: fetch the checkpoint and example clips, then load the model.
# ---------------------------------------------------------------------------

# Example clips offered in the demo UI (paths relative to the working dir).
example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']

_ASSET_BASE_URL = 'https://huggingface.co/seungheondoh/lp-music-caps/resolve/main'

# Check every asset on its own instead of gating all downloads on the
# checkpoint: a missing example clip is re-fetched even when transfer.pth is
# already present, and files that already exist are never downloaded again.
for _asset in ['transfer.pth', *example_list]:
    if not os.path.isfile(_asset):
        torch.hub.download_url_to_file(f'{_ASSET_BASE_URL}/{_asset}', _asset)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = BartCaptionModel(max_length=128)
# NOTE(security): torch.load unpickles the checkpoint, which can execute
# arbitrary code. Only load checkpoints from a trusted source.
# Load onto CPU first so the checkpoint can be read on machines without a GPU.
pretrained_object = torch.load('./transfer.pth', map_location='cpu')
state_dict = pretrained_object['state_dict']
model.load_state_dict(state_dict)
if torch.cuda.is_available():
    torch.cuda.set_device(device)
    model = model.cuda(device)
# Switch to inference mode regardless of the device in use.
model.eval()
|
29 |
-
|
30 |
-
def get_audio(audio_path, duration=10, target_sr=16000):
    """Load an audio file and cut it into equal, fixed-length chunks.

    The file is read through ``load_audio`` at ``target_sr`` with a mono
    downmix requested. A signal shorter than one chunk is zero-padded up to
    exactly one chunk; any trailing partial chunk of a longer signal is
    discarded.

    Args:
        audio_path: Path of the audio file to load.
        duration: Chunk length in seconds.
        target_sr: Sample rate handed to ``load_audio``.

    Returns:
        A float32 ``torch.Tensor`` holding the stacked chunks, each
        ``int(duration * target_sr)`` samples long.
    """
    chunk_len = int(duration * target_sr)

    # The sample rate returned by the loader is not used.
    waveform, _ = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )

    # A 2-D result is averaged over its first axis to get a mono signal.
    if len(waveform.shape) == 2:
        waveform = waveform.mean(0, False)

    # Zero-pad short clips so there is always at least one full chunk.
    n_available = waveform.shape[-1]
    if n_available < chunk_len:
        padded = np.zeros(chunk_len)
        padded[:n_available] = waveform
        waveform = padded

    # Number of whole chunks that fit; the remainder is dropped.
    n_chunks = int(waveform.shape[-1] // chunk_len)
    pieces = np.split(waveform[:n_chunks * chunk_len], n_chunks)
    stacked = np.stack(pieces).astype('float32')
    return torch.from_numpy(stacked)
|
48 |
-
|
49 |
-
def captioning(audio_path):
    """Caption every chunk of an audio file with the module-level model.

    Args:
        audio_path: Path of the audio file to caption.

    Returns:
        One string containing an entry per chunk: a bracketed label derived
        from the chunk index, a newline, then the generated text.
    """
    chunks = get_audio(audio_path=audio_path)
    if torch.cuda.is_available():
        chunks = chunks.to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        captions = model.generate(samples=chunks, num_beams=5)

    # Pair each chunk index with its caption; zip stops at the shorter of the
    # two, so no more entries than chunks are emitted.
    entries = []
    for idx, text in zip(range(chunks.shape[0]), captions):
        label = f"[{idx * 10}:00-{(idx + 1) * 10}:00]"
        entries.append(f"{label}\n{text} \n \n")
    return "".join(entries)
|
64 |
-
|
65 |
-
# ---------------------------------------------------------------------------
# Gradio UI: static page text, the interface definition, and app launch.
# ---------------------------------------------------------------------------

# Heading shown at the top of the demo page.
title = "Interactive demo: Music Captioning 🤖🎵"

# HTML shown under the title (paper info, links, usage hint).
description = """
<p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
<p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p>
<p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Codes</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
<p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p>
<p style='text-align: center'> If you have any error, plz check this code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
"""

# HTML footer with author links.
article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"

# The audio input is passed to `captioning` as a file path.
_audio_input = gr.Audio(type="filepath")
_caption_output = gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model")

demo = gr.Interface(
    fn=captioning,
    inputs=_audio_input,
    outputs=[_caption_output],
    examples=example_list,
    title=title,
    description=description,
    article=article,
    cache_examples=False,
)
demo.launch()
|
|
|
7 |
import pandas as pd
|
8 |
from huggingface_hub import hf_hub_download
|
9 |
from model.bart import BartCaptionModel
|
10 |
+
from utils.audio_utils import load_audio, STR_CH_FIRST
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|