MiguelZoo committed
Commit 381df91
1 Parent(s): d741afe

Update README.md

Files changed (1)
  1. README.md +1 -79
README.md CHANGED
@@ -7,82 +7,4 @@ import numpy as np
  import pandas as pd
  from huggingface_hub import hf_hub_download
  from model.bart import BartCaptionModel
- from utils.audio_utils import load_audio, STR_CH_FIRST
-
- if os.path.isfile("transfer.pth") == False:
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth', 'transfer.pth')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/folk.wav', 'folk.wav')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/electronic.mp3', 'electronic.mp3')
-     torch.hub.download_url_to_file('https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/orchestra.wav', 'orchestra.wav')
-
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- example_list = ['folk.wav', 'electronic.mp3', 'orchestra.wav']
- model = BartCaptionModel(max_length = 128)
- pretrained_object = torch.load('./transfer.pth', map_location='cpu')
- state_dict = pretrained_object['state_dict']
- model.load_state_dict(state_dict)
- if torch.cuda.is_available():
-     torch.cuda.set_device(device)
-     model = model.cuda(device)
- model.eval()
-
- def get_audio(audio_path, duration=10, target_sr=16000):
-     n_samples = int(duration * target_sr)
-     audio, sr = load_audio(
-         path= audio_path,
-         ch_format= STR_CH_FIRST,
-         sample_rate= target_sr,
-         downmix_to_mono= True,
-     )
-     if len(audio.shape) == 2:
-         audio = audio.mean(0, False)  # to mono
-     input_size = int(n_samples)
-     if audio.shape[-1] < input_size:  # pad sequence
-         pad = np.zeros(input_size)
-         pad[: audio.shape[-1]] = audio
-         audio = pad
-     ceil = int(audio.shape[-1] // n_samples)
-     audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
-     return audio
-
- def captioning(audio_path):
-     audio_tensor = get_audio(audio_path = audio_path)
-     if torch.cuda.is_available():
-         audio_tensor = audio_tensor.to(device)
-     with torch.no_grad():
-         output = model.generate(
-             samples=audio_tensor,
-             num_beams=5,
-         )
-     inference = ""
-     number_of_chunks = range(audio_tensor.shape[0])
-     for chunk, text in zip(number_of_chunks, output):
-         time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
-         inference += f"{time}\n{text} \n \n"
-     return inference
-
- title = "Interactive demo: Music Captioning 🤖🎵"
- description = """
- <p style='text-align: center'> LP-MusicCaps: LLM-Based Pseudo Music Captioning</p>
- <p style='text-align: center'> SeungHeon Doh, Keunwoo Choi, Jongpil Lee, Juhan Nam, ISMIR 2023</p>
- <p style='text-align: center'> <a href='https://arxiv.org/abs/2307.16372' target='_blank'>ArXiv</a> | <a href='https://github.com/seungheondoh/lp-music-caps' target='_blank'>Codes</a> | <a href='https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC' target='_blank'>Dataset</a> </p>
- <p style='text-align: center'> To use it, simply upload your audio and click 'submit', or click one of the examples to load them. Read more at the links below. </p>
- <p style='text-align: center'> If you have any error, plz check this code: <a href='https://github.com/seungheondoh/lp-music-caps/blob/main/demo/app.py' target='_blank'>Demo</a>. </p>
- """
-
- article = "<p style='text-align: center'><a href='https://seungheondoh.github.io/' target='_blank'>Author Info</a> | <a href='https://github.com/seungheondoh' target='_blank'>Github</a></p>"
-
-
- demo = gr.Interface(fn=captioning,
-                     inputs=gr.Audio(type="filepath"),
-                     outputs=[
-                         gr.Textbox(label="Caption generated by LP-MusicCaps Transfer Model"),
-                     ],
-                     examples=example_list,
-                     title=title,
-                     description=description,
-                     article=article,
-                     cache_examples=False
-                     )
- demo.launch()
+ from utils.audio_utils import load_audio, STR_CH_FIRST
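
For readers who still want to run the captioning pipeline that this commit drops from the README, below is a condensed, script-only sketch reconstructed from the removed lines. It is not the author's maintained demo: it assumes you run it from the lp-music-caps `demo/` directory so that `model.bart` and `utils.audio_utils` resolve, that the `transfer.pth` checkpoint URL from the removed lines is still live, and that `BartCaptionModel(max_length=128)` and `model.generate(samples=..., num_beams=...)` behave as shown in the removed code (the Gradio UI is omitted).

```python
# Condensed sketch of the removed README demo (assumptions noted above).
import os

import numpy as np
import torch

from model.bart import BartCaptionModel            # from the lp-music-caps repo
from utils.audio_utils import load_audio, STR_CH_FIRST

CKPT_URL = "https://huggingface.co/seungheondoh/lp-music-caps/resolve/main/transfer.pth"

# Download the transfer checkpoint once, as the removed lines did.
if not os.path.isfile("transfer.pth"):
    torch.hub.download_url_to_file(CKPT_URL, "transfer.pth")

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = BartCaptionModel(max_length=128)
state_dict = torch.load("transfer.pth", map_location="cpu")["state_dict"]
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()

def get_audio(audio_path, duration=10, target_sr=16000):
    """Load audio, downmix to mono, and split it into fixed 10-second chunks."""
    n_samples = int(duration * target_sr)
    audio, _ = load_audio(
        path=audio_path,
        ch_format=STR_CH_FIRST,
        sample_rate=target_sr,
        downmix_to_mono=True,
    )
    if audio.ndim == 2:                    # safety net: force mono
        audio = audio.mean(0)
    if audio.shape[-1] < n_samples:        # pad short clips up to one chunk
        pad = np.zeros(n_samples)
        pad[: audio.shape[-1]] = audio
        audio = pad
    n_chunks = int(audio.shape[-1] // n_samples)
    chunks = np.split(audio[: n_chunks * n_samples], n_chunks)
    return torch.from_numpy(np.stack(chunks).astype("float32"))

def captioning(audio_path):
    """Return one caption per 10-second chunk, prefixed with its time range."""
    audio_tensor = get_audio(audio_path).to(device)
    with torch.no_grad():
        captions = model.generate(samples=audio_tensor, num_beams=5)
    lines = [
        f"[{i * 10}s-{(i + 1) * 10}s] {text}"
        for i, text in enumerate(captions)
    ]
    return "\n".join(lines)

if __name__ == "__main__":
    print(captioning("folk.wav"))  # replace with any local .wav/.mp3 path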