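"""Extract CLAP audio embeddings for a directory tree of wav files.

One '.npy' file is written per input wav, mirroring the input directory
structure under --save_path. Example invocation (the script name and all
paths are illustrative, not taken from the original):

    python extract_clap_embeddings.py \
        -d data/wavs \
        -s data/clap_embeddings \
        -c checkpoints/clap.pt
"""
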
import argparse
from pathlib import Path

import librosa
import numpy as np
import torch
from laion_clap import CLAP_Module
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_path",
        "-d",
        required=True,
        type=str,
        help="Path of the original wav files",
    )
    parser.add_argument(
        "--save_path",
        "-s",
        required=True,
        type=str,
        help="Path to save the clap audio embedding '.npy' files",
    )
    parser.add_argument(
        "--clap_ckpt",
        "-c",
        required=True,
        type=str,
        help="Path of the pretrained clap checkpoint",
    )
    parser.add_argument(
        "--enable_fusion",
        "-e",
        default=True,
        # type=bool is a classic argparse pitfall: bool("False") is True, so
        # any value passed on the command line would enable fusion.
        # BooleanOptionalAction (Python 3.9+) yields proper --enable_fusion /
        # --no-enable_fusion flags instead.
        action=argparse.BooleanOptionalAction,
        help="Whether to enable the feature fusion of the CLAP model. Depends on the CLAP checkpoint you are using",
    )
    parser.add_argument(
        "--audio_encoder",
        "-a",
        default="HTSAT-tiny",
        type=str,
        help="Audio encoder of the clap model. Depends on the clap checkpoint you are using",
    )
    args = parser.parse_args()

    # `amodel` is CLAP_Module's keyword for the audio encoder name; the
    # original `aencoder` keyword does not exist and raises a TypeError.
    model = CLAP_Module(enable_fusion=args.enable_fusion, amodel=args.audio_encoder)
    model.load_ckpt(args.clap_ckpt)
    model.eval()
    data_path = Path(args.data_path)
    save_path = Path(args.save_path)

    with torch.no_grad():
        for wav_path in tqdm(
            data_path.glob("**/*.wav"), dynamic_ncols=True, colour="yellow"
        ):
            # CLAP audio models operate on 48 kHz audio.
            wav, _ = librosa.load(wav_path, sr=48000)

            # Batch of one in, (1, embedding_dim) numpy array out;
            # squeeze back to a flat vector before saving.
            clap_embedding = model.get_audio_embedding_from_data(
                x=wav[np.newaxis], use_tensor=False
            )
            clap_embedding = clap_embedding.squeeze(axis=0)

            # Mirror the input directory layout under save_path. parents=True
            # is needed because the recursive glob can yield nested folders.
            out_path = save_path / wav_path.with_suffix(".npy").relative_to(data_path)
            out_path.parent.mkdir(parents=True, exist_ok=True)
            np.save(out_path, clap_embedding)