haoheliu committed on
Commit
bdab1da
•
1 Parent(s): a5d109b

first commit and add large model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +0 -0
  2. app.py +10 -4
  3. audioldm/__init__.py +3 -0
  4. audioldm/audio/__init__.py +0 -0
  5. audioldm/audio/audio_processing.py +100 -0
  6. audioldm/audio/stft.py +180 -0
  7. audioldm/audio/tools.py +33 -0
  8. audioldm/clap/__init__.py +0 -0
  9. audioldm/clap/encoders.py +169 -0
  10. audioldm/clap/open_clip/__init__.py +25 -0
  11. audioldm/clap/open_clip/bert.py +40 -0
  12. audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz +3 -0
  13. audioldm/clap/open_clip/factory.py +277 -0
  14. audioldm/clap/open_clip/feature_fusion.py +192 -0
  15. audioldm/clap/open_clip/htsat.py +1308 -0
  16. audioldm/clap/open_clip/linear_probe.py +66 -0
  17. audioldm/clap/open_clip/loss.py +398 -0
  18. audioldm/clap/open_clip/model.py +934 -0
  19. audioldm/clap/open_clip/model_configs/HTSAT-base.json +23 -0
  20. audioldm/clap/open_clip/model_configs/HTSAT-large.json +23 -0
  21. audioldm/clap/open_clip/model_configs/HTSAT-tiny-win-1536.json +23 -0
  22. audioldm/clap/open_clip/model_configs/HTSAT-tiny.json +23 -0
  23. audioldm/clap/open_clip/model_configs/PANN-10.json +23 -0
  24. audioldm/clap/open_clip/model_configs/PANN-14-fmax-18k.json +23 -0
  25. audioldm/clap/open_clip/model_configs/PANN-14-fmax-8k-20s.json +23 -0
  26. audioldm/clap/open_clip/model_configs/PANN-14-tiny-transformer.json +23 -0
  27. audioldm/clap/open_clip/model_configs/PANN-14-win-1536.json +23 -0
  28. audioldm/clap/open_clip/model_configs/PANN-14.json +23 -0
  29. audioldm/clap/open_clip/model_configs/PANN-6.json +23 -0
  30. audioldm/clap/open_clip/model_configs/RN101-quickgelu.json +22 -0
  31. audioldm/clap/open_clip/model_configs/RN101.json +21 -0
  32. audioldm/clap/open_clip/model_configs/RN50-quickgelu.json +22 -0
  33. audioldm/clap/open_clip/model_configs/RN50.json +21 -0
  34. audioldm/clap/open_clip/model_configs/RN50x16.json +21 -0
  35. audioldm/clap/open_clip/model_configs/RN50x4.json +21 -0
  36. audioldm/clap/open_clip/model_configs/ViT-B-16.json +16 -0
  37. audioldm/clap/open_clip/model_configs/ViT-B-32-quickgelu.json +17 -0
  38. audioldm/clap/open_clip/model_configs/ViT-B-32.json +16 -0
  39. audioldm/clap/open_clip/model_configs/ViT-L-14.json +16 -0
  40. audioldm/clap/open_clip/openai.py +156 -0
  41. audioldm/clap/open_clip/pann_model.py +703 -0
  42. audioldm/clap/open_clip/pretrained.py +167 -0
  43. audioldm/clap/open_clip/timm_model.py +112 -0
  44. audioldm/clap/open_clip/tokenizer.py +197 -0
  45. audioldm/clap/open_clip/transform.py +45 -0
  46. audioldm/clap/open_clip/utils.py +361 -0
  47. audioldm/clap/open_clip/version.py +1 -0
  48. audioldm/clap/training/__init__.py +0 -0
  49. audioldm/clap/training/audioset_textmap.npy +3 -0
  50. audioldm/clap/training/data.py +977 -0
.gitignore ADDED
File without changes
app.py CHANGED
@@ -1,8 +1,14 @@
1
  import gradio as gr
2
  import numpy as np
 
3
 
4
- def greet(name):
5
- return [(16000, np.random.randn(16000)), (16000, np.random.randn(16000)), (16000, np.random.randn(16000))]
 
 
6
 
7
- iface = gr.Interface(fn=greet, inputs="text", outputs=["audio", "audio", "audio"])
8
- iface.launch()
 
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
+ from audioldm import text_to_audio
4
 
5
# Number of audio clips to generate; must equal the number of "audio"
# output components declared in the Gradio interface below.
N_CANDIDATES = 2


def greet(text):
    """Generate audio clips from a text prompt.

    Returns a list of (sample_rate, waveform) tuples, one per Gradio
    "audio" output component.
    """
    # text_to_audio is expected to return [bs, 1, samples]
    # (per the comment in the original code) — TODO confirm.
    waveform = text_to_audio(text, n_gen=N_CANDIDATES)
    # BUG FIX: the original requested n_gen=1 while the interface declared
    # two audio outputs, so Gradio would receive fewer return values than
    # output components. Generate exactly one waveform per output.
    return [(16000, wave[0]) for wave in waveform]


iface = gr.Interface(fn=greet, inputs="text", outputs=["audio"] * N_CANDIDATES)
iface.launch()

# if __name__ == "__main__":
#     greet("hello world")
audioldm/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .ldm import LatentDiffusion
2
+ from .utils import seed_everything
3
+ from .pipeline import *
audioldm/audio/__init__.py ADDED
File without changes
audioldm/audio/audio_processing.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import librosa.util as librosa_util
4
+ from scipy.signal import get_window
5
+
6
+
7
def window_sumsquare(
    window,
    n_frames,
    hop_length,
    win_length,
    n_fft,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function. By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : normalization mode, passed through to ``librosa.util.normalize``

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    # Total number of samples covered by n_frames hops of the analysis window.
    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # NOTE(review): librosa >= 0.10 made the size argument keyword-only
    # (pad_center(win_sq, size=n_fft)); this positional call assumes an
    # older librosa — confirm the pinned version.
    win_sq = librosa_util.pad_center(win_sq, n_fft)

    # Fill the envelope: accumulate the squared window at each hop offset,
    # truncating at the right edge of the output buffer.
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x
64
+
65
+
66
def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """Estimate a time-domain signal from spectrogram magnitudes.

    Classic Griffin-Lim: start from a random phase estimate and alternately
    apply the inverse and forward STFT, keeping the given magnitudes fixed
    while refining the phase.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """
    # Random initial phase in (-pi, pi], generated via a complex exponential.
    init = np.exp(2j * np.pi * np.random.rand(*magnitudes.size()))
    phase = torch.autograd.Variable(
        torch.from_numpy(np.angle(init).astype(np.float32))
    )
    signal = stft_fn.inverse(magnitudes, phase).squeeze(1)

    # Iteratively re-estimate the phase from the current signal.
    for _ in range(n_iters):
        _, phase = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, phase).squeeze(1)
    return signal
83
+
84
+
85
def dynamic_range_compression(x, normalize_fun=torch.log, C=1, clip_val=1e-5):
    """Log-compress magnitudes for numerically stable spectrogram modelling.

    PARAMS
    ------
    C: compression factor
    clip_val: floor applied before the log so log(0) can never occur
    """
    clipped = torch.clamp(x, min=clip_val)
    return normalize_fun(clipped * C)
92
+
93
+
94
def dynamic_range_decompression(x, C=1):
    """Undo the log compression applied by ``dynamic_range_compression``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    restored = torch.exp(x)
    return restored / C
audioldm/audio/stft.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import numpy as np
4
+ from scipy.signal import get_window
5
+ from librosa.util import pad_center, tiny
6
+ from librosa.filters import mel as librosa_mel_fn
7
+
8
+ from audioldm.audio.audio_processing import (
9
+ dynamic_range_compression,
10
+ dynamic_range_decompression,
11
+ window_sumsquare,
12
+ )
13
+
14
+
15
class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    Implements the short-time Fourier transform and its inverse as 1-D
    convolutions with a fixed Fourier basis, so the transform runs inside
    a torch graph.
    """

    def __init__(self, filter_length, hop_length, win_length, window="hann"):
        # filter_length: FFT size; hop_length: stride between frames;
        # win_length: analysis window length (<= filter_length);
        # window: scipy window name, or None for a rectangular window.
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        # DFT matrix: each row is one Fourier basis vector.
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # Keep only the non-redundant half of the spectrum and stack the
        # real and imaginary parts as separate convolution output channels.
        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack(
            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
        )

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        # Pseudo-inverse of the (scaled) forward basis gives the synthesis basis.
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
        )

        if window is not None:
            assert filter_length >= win_length
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # NOTE(review): librosa >= 0.10 requires pad_center(..., size=...);
            # this positional call assumes an older librosa — confirm version.
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        # Buffers (not parameters): move with the module, never trained.
        self.register_buffer("forward_basis", forward_basis.float())
        self.register_buffer("inverse_basis", inverse_basis.float())

    def transform(self, input_data):
        """Forward STFT: (B, T) waveform -> (magnitude, phase) tensors."""
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode="reflect",
        )
        input_data = input_data.squeeze(1)

        # One conv1d computes every frame's dot product with the full basis.
        # NOTE(review): .cpu() pins the result to the CPU even when the input
        # lives on GPU — presumably a memory workaround; confirm before
        # running this module on CUDA.
        forward_transform = F.conv1d(
            input_data,
            torch.autograd.Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        ).cpu()

        # First `cutoff` channels are real parts, the rest imaginary parts.
        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Inverse STFT: (magnitude, phase) -> (B, 1, T) waveform."""
        # Rebuild real/imag channels from the polar representation.
        recombine_magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            torch.autograd.Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )

        if self.window is not None:
            # Compensate for the overlap-add gain of the analysis window.
            window_sum = window_sumsquare(
                self.window,
                magnitude.size(-1),
                hop_length=self.hop_length,
                win_length=self.win_length,
                n_fft=self.filter_length,
                dtype=np.float32,
            )
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0]
            )
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False
            )
            # NOTE(review): this self-assignment is a no-op — in the upstream
            # tacotron code this line moved window_sum to CUDA; presumably
            # vestigial here.
            window_sum = window_sum
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
                approx_nonzero_indices
            ]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        # Trim the reflect padding added in transform().
        inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
        inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]

        return inverse_transform

    def forward(self, input_data):
        """Round-trip: analyse then resynthesise the input waveform."""
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
128
+
129
+
130
class TacotronSTFT(torch.nn.Module):
    """STFT + mel filterbank front end, as used in Tacotron2-style models."""

    def __init__(
        self,
        filter_length,
        hop_length,
        win_length,
        n_mel_channels,
        sampling_rate,
        mel_fmin,
        mel_fmax,
    ):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # NOTE(review): librosa >= 0.10 made these arguments keyword-only
        # (librosa.filters.mel(sr=..., n_fft=..., ...)); this positional call
        # assumes an older librosa — confirm the pinned version.
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
        )
        mel_basis = torch.from_numpy(mel_basis).float()
        # Buffer: moves with the module, never trained.
        self.register_buffer("mel_basis", mel_basis)

    def spectral_normalize(self, magnitudes, normalize_fun):
        # Log-compress magnitudes (dynamic range compression).
        output = dynamic_range_compression(magnitudes, normalize_fun)
        return output

    def spectral_de_normalize(self, magnitudes):
        # Invert the log compression.
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, normalize_fun=torch.log):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        log_magnitudes: log-compressed linear STFT magnitudes
        energy: per-frame L2 norm of the magnitude spectrum
        """
        # Inputs outside [-1, 1] indicate an unnormalised waveform upstream.
        assert torch.min(y.data) >= -1, torch.min(y.data)
        assert torch.max(y.data) <= 1, torch.max(y.data)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        # Project the linear spectrogram onto the mel filterbank.
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output, normalize_fun)
        energy = torch.norm(magnitudes, dim=1)

        log_magnitudes = self.spectral_normalize(magnitudes, normalize_fun)

        return mel_output, log_magnitudes, energy
audioldm/audio/tools.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
def get_mel_from_wav(audio, _stft):
    """Compute mel spectrogram, log-magnitude STFT and per-frame energy.

    PARAMS
    ------
    audio: 1-D waveform array-like; values are clipped to [-1, 1]
    _stft: TacotronSTFT-like object exposing ``mel_spectrogram``

    Returns three float32 numpy arrays: (mel, log_magnitudes_stft, energy).
    """
    batch = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    batch = torch.autograd.Variable(batch, requires_grad=False)
    mel, log_mags, energy = _stft.mel_spectrogram(batch)

    def _to_numpy(t):
        # Drop the batch dim and convert to float32 numpy.
        return torch.squeeze(t, 0).numpy().astype(np.float32)

    return _to_numpy(mel), _to_numpy(log_mags), _to_numpy(energy)
15
+
16
+
17
+ # def inv_mel_spec(mel, out_filename, _stft, griffin_iters=60):
18
+ # mel = torch.stack([mel])
19
+ # mel_decompress = _stft.spectral_de_normalize(mel)
20
+ # mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
21
+ # spec_from_mel_scaling = 1000
22
+ # spec_from_mel = torch.mm(mel_decompress[0], _stft.mel_basis)
23
+ # spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
24
+ # spec_from_mel = spec_from_mel * spec_from_mel_scaling
25
+
26
+ # audio = griffin_lim(
27
+ # torch.autograd.Variable(spec_from_mel[:, :, :-1]), _stft._stft_fn, griffin_iters
28
+ # )
29
+
30
+ # audio = audio.squeeze()
31
+ # audio = audio.cpu().numpy()
32
+ # audio_path = out_filename
33
+ # write(audio_path, _stft.sampling_rate, audio)
audioldm/clap/__init__.py ADDED
File without changes
audioldm/clap/encoders.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from audioldm.clap.open_clip import create_model
4
+ from audioldm.clap.training.data import get_audio_features
5
+ import torchaudio
6
+ from transformers import RobertaTokenizer
7
+ import torch.nn.functional as F
8
+
9
+
10
class CLAPAudioEmbeddingClassifierFreev2(nn.Module):
    """Frozen CLAP encoder used as a text/audio conditioner.

    Wraps a pretrained CLAP model and returns detached embeddings of shape
    [bs, 1, 512] for either waveforms (embed_mode="audio") or strings
    (embed_mode="text").  With probability ``unconditional_prob`` an
    embedding is replaced by the empty-string ("unconditional") embedding,
    i.e. classifier-free-guidance dropout.
    """

    def __init__(
        self,
        pretrained_path="",
        key="class",
        sampling_rate=16000,
        embed_mode="audio",
        unconditional_prob=0.1,
        random_mute=False,
        max_random_mute_portion=0.5,
        training_mode=True,
    ):
        super().__init__()

        self.key = key
        self.device = "cpu"
        self.precision = "fp32"
        self.amodel = "HTSAT-tiny"  # or 'PANN-14'
        self.tmodel = "roberta"  # the best text encoder in our training
        self.enable_fusion = False  # False if you do not want to use the fusion model
        self.fusion_type = "aff_2d"
        self.pretrained = pretrained_path
        self.embed_mode = embed_mode
        # Remember the initial mode; cos_similarity() toggles embed_mode.
        self.embed_mode_orig = embed_mode
        self.sampling_rate = sampling_rate
        self.unconditional_prob = unconditional_prob
        self.random_mute = random_mute
        # NOTE: self.tokenize is the raw HF tokenizer object; self.tokenizer()
        # (method below) is the wrapper that applies CLAP's 77-token padding.
        self.tokenize = RobertaTokenizer.from_pretrained("roberta-base")
        self.max_random_mute_portion = max_random_mute_portion
        self.training_mode = training_mode
        self.model, self.model_cfg = create_model(
            self.amodel,
            self.tmodel,
            self.pretrained,
            precision=self.precision,
            device=self.device,
            enable_fusion=self.enable_fusion,
            fusion_type=self.fusion_type,
        )
        # CLAP is used frozen: no gradients, always eval mode.
        for p in self.model.parameters():
            p.requires_grad = False

        self.model.eval()

    def get_unconditional_condition(self, batchsize):
        # Embedding of the empty string, repeated batchsize times -> [bs, 1, 512].
        self.unconditional_token = self.model.get_text_embedding(
            self.tokenizer(["", ""])
        )[0:1]
        return torch.cat([self.unconditional_token.unsqueeze(0)] * batchsize, dim=0)

    def batch_to_list(self, batch):
        # Split a batched tensor into a list of per-sample tensors.
        ret = []
        for i in range(batch.size(0)):
            ret.append(batch[i])
        return ret

    def make_decision(self, probability):
        # Bernoulli draw: True with the given probability.
        if float(torch.rand(1)) < probability:
            return True
        else:
            return False

    def random_uniform(self, start, end):
        # Uniform sample in [start, end).
        val = torch.rand(1).item()
        return start + (end - start) * val

    def _random_mute(self, waveform):
        # waveform: [bs, t-steps]
        # Augmentation: zero out one random contiguous span per sample.
        t_steps = waveform.size(-1)
        for i in range(waveform.size(0)):
            mute_size = int(
                self.random_uniform(0, end=int(t_steps * self.max_random_mute_portion))
            )
            mute_start = int(self.random_uniform(0, t_steps - mute_size))
            waveform[i, mute_start : mute_start + mute_size] = 0
        return waveform

    def cos_similarity(self, waveform, text):
        # waveform: [bs, t_steps]
        # NOTE(review): waveform.cuda() hard-codes GPU here although the model
        # is created on self.device ("cpu") — confirm on CPU-only setups.
        with torch.no_grad():
            self.embed_mode = "audio"
            audio_emb = self(waveform.cuda())
            self.embed_mode = "text"
            text_emb = self(text)
            similarity = F.cosine_similarity(audio_emb, text_emb, dim=2)
            return similarity.squeeze()

    def forward(self, batch, key=None):
        # If you want this conditioner to be unconditional, set self.unconditional_prob = 1.0
        # If you want this conditioner to be fully conditional, set self.unconditional_prob = 0.0
        if self.model.training == True and not self.training_mode:
            print(
                "The pretrained CLAP model should always be in eval mode. Reloading model just in case you change the parameters."
            )
            # NOTE(review): this reload hard-codes device="cuda" while __init__
            # used self.device ("cpu") — confirm this is intended.
            self.model, self.model_cfg = create_model(
                self.amodel,
                self.tmodel,
                self.pretrained,
                precision=self.precision,
                device="cuda",
                enable_fusion=self.enable_fusion,
                fusion_type=self.fusion_type,
            )
            for p in self.model.parameters():
                p.requires_grad = False
            self.model.eval()

        # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
        if self.embed_mode == "audio":
            with torch.no_grad():
                audio_dict_list = []
                assert (
                    self.sampling_rate == 16000
                ), "We only support 16000 sampling rate"
                if self.random_mute:
                    batch = self._random_mute(batch)
                # batch: [bs, 1, t-samples]
                # CLAP was trained on 48 kHz audio; resample before encoding.
                batch = torchaudio.functional.resample(
                    batch, orig_freq=self.sampling_rate, new_freq=48000
                )
                for waveform in self.batch_to_list(batch):
                    audio_dict = {}
                    audio_dict = get_audio_features(
                        audio_dict,
                        waveform,
                        480000,
                        data_truncating="fusion",
                        data_filling="repeatpad",
                        audio_cfg=self.model_cfg["audio_cfg"],
                    )
                    audio_dict_list.append(audio_dict)
                # [bs, 512]
                embed = self.model.get_audio_embedding(audio_dict_list)
        elif self.embed_mode == "text":
            with torch.no_grad():
                # the 'fusion' truncate mode can be changed to 'rand_trunc' if run in unfusion mode
                text_data = self.tokenizer(batch)
                embed = self.model.get_text_embedding(text_data)

        embed = embed.unsqueeze(1)
        self.unconditional_token = self.model.get_text_embedding(
            self.tokenizer(["", ""])
        )[0:1]

        # Classifier-free-guidance dropout: randomly swap in the
        # unconditional (empty-string) embedding per sample.
        for i in range(embed.size(0)):
            if self.make_decision(self.unconditional_prob):
                embed[i] = self.unconditional_token

        # [bs, 1, 512]
        return embed.detach()

    def tokenizer(self, text):
        # Tokenize to CLAP's fixed 77-token format; drop the leading batch
        # dim of each tensor (the model's text embedder re-batches them).
        result = self.tokenize(
            text,
            padding="max_length",
            truncation=True,
            max_length=77,
            return_tensors="pt",
        )
        return {k: v.squeeze(0) for k, v in result.items()}
audioldm/clap/open_clip/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .factory import (
2
+ list_models,
3
+ create_model,
4
+ create_model_and_transforms,
5
+ add_model_config,
6
+ )
7
+ from .loss import ClipLoss, gather_features, LPLoss, lp_gather_features, LPMetrics
8
+ from .model import (
9
+ CLAP,
10
+ CLAPTextCfg,
11
+ CLAPVisionCfg,
12
+ CLAPAudioCfp,
13
+ convert_weights_to_fp16,
14
+ trace_model,
15
+ )
16
+ from .openai import load_openai_model, list_openai_models
17
+ from .pretrained import (
18
+ list_pretrained,
19
+ list_pretrained_tag_models,
20
+ list_pretrained_model_tags,
21
+ get_pretrained_url,
22
+ download_pretrained,
23
+ )
24
+ from .tokenizer import SimpleTokenizer, tokenize
25
+ from .transform import image_transform
audioldm/clap/open_clip/bert.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Thin wrappers around HuggingFace encoders for text embeddings.

BUG FIX: the original module bound all three checkpoints to the same
globals ``tokenizer``/``model`` one after another, so after import every
function silently used the *last* pair loaded (BART) — bert_embeddings()
and Roberta_embeddings() never used their own encoders.  Each function now
has its own lazily-initialised (tokenizer, model) pair, which also avoids
downloading three checkpoints eagerly at import time.
"""

from transformers import BertTokenizer, BertModel
from transformers import RobertaTokenizer, RobertaModel
from transformers import BartTokenizer, BartModel

# Per-encoder (tokenizer, model) caches, loaded on first use.
_bert = None
_roberta = None
_bart = None


def bert_embeddings(text):
    """Return BERT (bert-base-uncased) outputs for *text*."""
    global _bert
    if _bert is None:
        _bert = (
            BertTokenizer.from_pretrained("bert-base-uncased"),
            BertModel.from_pretrained("bert-base-uncased"),
        )
    tokenizer, model = _bert
    encoded_input = tokenizer(text, return_tensors="pt")
    return model(**encoded_input)


def Roberta_embeddings(text):
    """Return RoBERTa (roberta-base) outputs for *text*."""
    global _roberta
    if _roberta is None:
        _roberta = (
            RobertaTokenizer.from_pretrained("roberta-base"),
            RobertaModel.from_pretrained("roberta-base"),
        )
    tokenizer, model = _roberta
    encoded_input = tokenizer(text, return_tensors="pt")
    return model(**encoded_input)


def bart_embeddings(text):
    """Return BART (facebook/bart-base) outputs for *text*."""
    global _bart
    if _bart is None:
        _bart = (
            BartTokenizer.from_pretrained("facebook/bart-base"),
            BartModel.from_pretrained("facebook/bart-base"),
        )
    tokenizer, model = _bart
    encoded_input = tokenizer(text, return_tensors="pt")
    return model(**encoded_input)
audioldm/clap/open_clip/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
audioldm/clap/open_clip/factory.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import pathlib
5
+ import re
6
+ from copy import deepcopy
7
+ from pathlib import Path
8
+
9
+ import torch
10
+
11
+ from .model import CLAP, convert_weights_to_fp16
12
+ from .openai import load_openai_model
13
+ from .pretrained import get_pretrained_url, download_pretrained
14
+ from .transform import image_transform
15
+
16
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
17
+ _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs
18
+
19
+
20
+ def _natural_key(string_):
21
+ return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())]
22
+
23
+
24
def _rescan_model_configs():
    """Scan _MODEL_CONFIG_PATHS for *.json model configs and (re)populate
    the module-level _MODEL_CONFIGS registry, natural-sorted by name."""
    global _MODEL_CONFIGS

    config_ext = (".json",)
    config_files = []
    # Registered paths may be individual config files or directories.
    for config_path in _MODEL_CONFIG_PATHS:
        if config_path.is_file() and config_path.suffix in config_ext:
            config_files.append(config_path)
        elif config_path.is_dir():
            for ext in config_ext:
                config_files.extend(config_path.glob(f"*{ext}"))

    for cf in config_files:
        if os.path.basename(cf)[0] == ".":
            continue  # Ignore hidden files

        with open(cf, "r") as f:
            model_cfg = json.load(f)
            # Only register configs that look like complete CLAP configs.
            if all(a in model_cfg for a in ("embed_dim", "audio_cfg", "text_cfg")):
                _MODEL_CONFIGS[cf.stem] = model_cfg

    # Natural sort so e.g. PANN-6 < PANN-10 < PANN-14.
    _MODEL_CONFIGS = {
        k: v
        for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))
    }
49
+
50
+
51
+ _rescan_model_configs() # initial populate of model config registry
52
+
53
+
54
def load_state_dict(checkpoint_path: str, map_location="cpu", skip_params=True):
    """Load a model state dict from *checkpoint_path*.

    Unwraps Lightning-style checkpoints ({"state_dict": ...}) and, when
    *skip_params* is set, strips the leading "module." prefix left by
    DataParallel/DDP wrappers (detected from the first key).
    """
    checkpoint = torch.load(checkpoint_path, map_location=map_location)
    state_dict = (
        checkpoint["state_dict"]
        if isinstance(checkpoint, dict) and "state_dict" in checkpoint
        else checkpoint
    )
    if skip_params and next(iter(state_dict.items()))[0].startswith("module"):
        state_dict = {key[7:]: value for key, value in state_dict.items()}
    # for k in state_dict:
    #     if k.startswith('transformer'):
    #         v = state_dict.pop(k)
    #         state_dict['text_branch.' + k[12:]] = v
    return state_dict
68
+
69
+
70
def create_model(
    amodel_name: str,
    tmodel_name: str,
    pretrained: str = "",
    precision: str = "fp32",
    device: torch.device = torch.device("cpu"),
    jit: bool = False,
    force_quick_gelu: bool = False,
    openai_model_cache_dir: str = os.path.expanduser("~/.cache/clip"),
    skip_params=True,
    pretrained_audio: str = "",
    pretrained_text: str = "",
    enable_fusion: bool = False,
    fusion_type: str = "None"
    # pretrained_image: bool = False,
):
    """Build a CLAP model (audio branch *amodel_name*, text branch
    *tmodel_name*) and optionally load pretrained weights.

    ``pretrained`` may be "openai", a known pretrained tag, or a local
    checkpoint path; ``pretrained_audio`` loads audio-branch-only weights
    from PANN/HTSAT checkpoints.

    Returns (model, model_cfg).

    Raises RuntimeError when the model config or pretrained weights cannot
    be found, and ValueError for unrecognised audio checkpoints.
    """
    amodel_name = amodel_name.replace(
        "/", "-"
    )  # for callers using old naming with / in ViT names
    pretrained_orig = pretrained
    pretrained = pretrained.lower()
    if pretrained == "openai":
        if amodel_name in _MODEL_CONFIGS:
            logging.info(f"Loading {amodel_name} model config.")
            model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
        else:
            logging.error(
                f"Model config for {amodel_name} not found; available models {list_models()}."
            )
            raise RuntimeError(f"Model config for {amodel_name} not found.")

        logging.info(f"Loading pretrained ViT-B-16 text encoder from OpenAI.")
        # Hard Code in model name
        model_cfg["text_cfg"]["model_type"] = tmodel_name
        model = load_openai_model(
            "ViT-B-16",
            model_cfg,
            device=device,
            jit=jit,
            cache_dir=openai_model_cache_dir,
            enable_fusion=enable_fusion,
            fusion_type=fusion_type,
        )
        # See https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372
        if precision == "amp" or precision == "fp32":
            model = model.float()
    else:
        if amodel_name in _MODEL_CONFIGS:
            logging.info(f"Loading {amodel_name} model config.")
            model_cfg = deepcopy(_MODEL_CONFIGS[amodel_name])
        else:
            logging.error(
                f"Model config for {amodel_name} not found; available models {list_models()}."
            )
            raise RuntimeError(f"Model config for {amodel_name} not found.")

        if force_quick_gelu:
            # override for use of QuickGELU on non-OpenAI transformer models
            model_cfg["quick_gelu"] = True

        # if pretrained_image:
        #     if 'timm_amodel_name' in model_cfg.get('vision_cfg', {}):
        #         # pretrained weight loading for timm models set via vision_cfg
        #         model_cfg['vision_cfg']['timm_model_pretrained'] = True
        #     else:
        #         assert False, 'pretrained image towers currently only supported for timm models'
        model_cfg["text_cfg"]["model_type"] = tmodel_name
        model_cfg["enable_fusion"] = enable_fusion
        model_cfg["fusion_type"] = fusion_type
        model = CLAP(**model_cfg)

        if pretrained:
            checkpoint_path = ""
            url = get_pretrained_url(amodel_name, pretrained)
            if url:
                checkpoint_path = download_pretrained(url, root=openai_model_cache_dir)
            elif os.path.exists(pretrained_orig):
                # Fall back to a local checkpoint path (original casing).
                checkpoint_path = pretrained_orig
            if checkpoint_path:
                logging.info(
                    f"Loading pretrained {amodel_name}-{tmodel_name} weights ({pretrained})."
                )
                ckpt = load_state_dict(checkpoint_path, skip_params=True)
                model.load_state_dict(ckpt)
                param_names = [n for n, p in model.named_parameters()]
                # for n in param_names:
                #     print(n, "\t", "Loaded" if n in ckpt else "Unloaded")
            else:
                logging.warning(
                    f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
                )
                raise RuntimeError(
                    f"Pretrained weights ({pretrained}) not found for model {amodel_name}."
                )

        if pretrained_audio:
            # Audio-branch-only weights: rewrite checkpoint keys to the
            # "audio_branch." namespace used by CLAP.
            if amodel_name.startswith("PANN"):
                if "Cnn14_mAP" in pretrained_audio:  # official checkpoint
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["model"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if (
                            "spectrogram_extractor" not in key
                            and "logmel_extractor" not in key
                        ):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "PANN"
                ):  # checkpoint trained via HTSAT codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model"):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "finetuned"
                ):  # checkpoint trained via linear probe codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                else:
                    raise ValueError("Unknown audio checkpoint")
            elif amodel_name.startswith("HTSAT"):
                if "HTSAT_AudioSet_Saved" in pretrained_audio:  # official checkpoint
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model") and (
                            "spectrogram_extractor" not in key
                            and "logmel_extractor" not in key
                        ):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "HTSAT"
                ):  # checkpoint trained via HTSAT codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                    audio_ckpt = audio_ckpt["state_dict"]
                    keys = list(audio_ckpt.keys())
                    for key in keys:
                        if key.startswith("sed_model"):
                            v = audio_ckpt.pop(key)
                            audio_ckpt["audio_branch." + key[10:]] = v
                elif os.path.basename(pretrained_audio).startswith(
                    "finetuned"
                ):  # checkpoint trained via linear probe codebase
                    audio_ckpt = torch.load(pretrained_audio, map_location="cpu")
                else:
                    raise ValueError("Unknown audio checkpoint")
            else:
                # BUG FIX: the original raised a plain f-string, which itself
                # raises "TypeError: exceptions must derive from BaseException"
                # and masks the real problem. Raise a proper exception.
                raise ValueError(
                    f"Loading pretrained audio checkpoints for {amodel_name} is not supported"
                )

            # strict=False: only the audio branch is present in audio_ckpt.
            model.load_state_dict(audio_ckpt, strict=False)
            logging.info(
                f"Loading pretrained {amodel_name} weights ({pretrained_audio})."
            )
            param_names = [n for n, p in model.named_parameters()]
            for n in param_names:
                print(n, "\t", "Loaded" if n in audio_ckpt else "Unloaded")

        model.to(device=device)
        if precision == "fp16":
            assert device.type != "cpu"
            convert_weights_to_fp16(model)

        if jit:
            model = torch.jit.script(model)

    return model, model_cfg
242
+
243
+
244
def create_model_and_transforms(
    model_name: str,
    pretrained: str = "",
    precision: str = "fp32",
    device: torch.device = torch.device("cpu"),
    jit: bool = False,
    force_quick_gelu: bool = False,
    # pretrained_image: bool = False,
):
    """Create a model plus train/val image preprocessing transforms.

    Returns (model, preprocess_train, preprocess_val).

    NOTE(review): this helper still uses the upstream open_clip calling
    convention — the positional arguments line up with create_model()'s
    (amodel_name, tmodel_name, pretrained, ...) signature only by accident,
    so *pretrained* lands in ``tmodel_name``. Confirm against callers
    before relying on this function.
    """
    # BUG FIX: create_model() returns a (model, model_cfg) tuple; the
    # original bound the tuple itself to ``model`` and then crashed on
    # ``model.visual``. Unpack the tuple instead.
    model, _model_cfg = create_model(
        model_name,
        pretrained,
        precision,
        device,
        jit,
        force_quick_gelu=force_quick_gelu,
        # pretrained_image=pretrained_image
    )
    preprocess_train = image_transform(model.visual.image_size, is_train=True)
    preprocess_val = image_transform(model.visual.image_size, is_train=False)
    return model, preprocess_train, preprocess_val
265
+
266
+
267
def list_models():
    """Enumerate available model architectures based on config files."""
    return [name for name in _MODEL_CONFIGS]
270
+
271
+
272
def add_model_config(path):
    """Register an extra model-config path (file or directory) and rescan."""
    candidate = path if isinstance(path, Path) else Path(path)
    _MODEL_CONFIG_PATHS.append(candidate)
    _rescan_model_configs()
audioldm/clap/open_clip/feature_fusion.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Feature Fusion for Variable-Length Data Processing
3
+ AFF/iAFF are adapted from https://github.com/YimianDai/open-aff/blob/master/aff_pytorch/aff_net/fusion.py
4
+ According to the paper: Yimian Dai et al, Attentional Feature Fusion, IEEE Winter Conference on Applications of Computer Vision, WACV 2021
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
class DAF(nn.Module):
    """DirectAddFuse: fuse two feature maps by plain element-wise addition."""

    def __init__(self):
        super(DAF, self).__init__()

    def forward(self, x, residual):
        """Return the element-wise sum of ``x`` and ``residual``."""
        return residual + x
21
+
22
+
23
class iAFF(nn.Module):
    """Iterative attentional feature fusion (iAFF).

    Fuses two same-shape feature maps via two rounds of combined local
    (per-position) and global (pooled) channel attention, following
    Dai et al., "Attentional Feature Fusion", WACV 2021.
    """

    def __init__(self, channels=64, r=4, type="2D"):
        """
        Args:
            channels: Number of input channels.
            r: Channel-reduction ratio of the attention bottleneck.
            type: ``"1D"`` for (B, C, T) inputs, ``"2D"`` for (B, C, H, W).

        Raises:
            ValueError: If ``type`` is neither ``"1D"`` nor ``"2D"``.
        """
        super(iAFF, self).__init__()
        inter_channels = int(channels // r)

        if type == "1D":
            conv, bn, pool = nn.Conv1d, nn.BatchNorm1d, nn.AdaptiveAvgPool1d
        elif type == "2D":
            conv, bn, pool = nn.Conv2d, nn.BatchNorm2d, nn.AdaptiveAvgPool2d
        else:
            # Fix: the original did `raise f"..."`, which raises TypeError
            # (exceptions must derive from BaseException), masking the real
            # error. Message text kept identical.
            raise ValueError("the type is not supported")

        def bottleneck_layers():
            # Pointwise squeeze-excite style bottleneck: C -> C/r -> C.
            return [
                conv(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                bn(inter_channels),
                nn.ReLU(inplace=True),
                conv(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                bn(channels),
            ]

        # Submodule names must match the original exactly so that pretrained
        # state dicts keep loading.
        # First-pass local (per-position) attention.
        self.local_att = nn.Sequential(*bottleneck_layers())
        # First-pass global attention (features pooled to a single position).
        self.global_att = nn.Sequential(pool(1), *bottleneck_layers())
        # Second-pass local attention.
        self.local_att2 = nn.Sequential(*bottleneck_layers())
        # Second-pass global attention.
        self.global_att2 = nn.Sequential(pool(1), *bottleneck_layers())

        self.sigmoid = nn.Sigmoid()

    def forward(self, x, residual):
        """Fuse ``x`` and ``residual``; output has the same shape as ``x``."""
        flag = False
        xa = x + residual
        # BatchNorm cannot compute batch statistics from a single sample in
        # training mode; duplicate the sample and drop the copy afterwards.
        if xa.size(0) == 1:
            xa = torch.cat([xa, xa], dim=0)
            flag = True
        xl = self.local_att(xa)
        xg = self.global_att(xa)
        xlg = xl + xg
        wei = self.sigmoid(xlg)
        xi = x * wei + residual * (1 - wei)

        xl2 = self.local_att2(xi)
        # NOTE(review): this reuses self.global_att; self.global_att2 is
        # constructed above but never used here. Looks like an upstream bug,
        # kept as-is to remain behavior/checkpoint compatible — confirm
        # against the reference implementation before changing.
        xg2 = self.global_att(xi)
        xlg2 = xl2 + xg2
        wei2 = self.sigmoid(xlg2)
        xo = x * wei2 + residual * (1 - wei2)
        if flag:
            xo = xo[0].unsqueeze(0)
        return xo
131
+
132
+
133
class AFF(nn.Module):
    """Attentional feature fusion (AFF), single-round variant.

    Fuses two same-shape feature maps via one round of combined local
    (per-position) and global (pooled) channel attention, following
    Dai et al., "Attentional Feature Fusion", WACV 2021.
    """

    def __init__(self, channels=64, r=4, type="2D"):
        """
        Args:
            channels: Number of input channels.
            r: Channel-reduction ratio of the attention bottleneck.
            type: ``"1D"`` for (B, C, T) inputs, ``"2D"`` for (B, C, H, W).

        Raises:
            ValueError: If ``type`` is neither ``"1D"`` nor ``"2D"``.
        """
        super(AFF, self).__init__()
        inter_channels = int(channels // r)

        if type == "1D":
            conv, bn, pool = nn.Conv1d, nn.BatchNorm1d, nn.AdaptiveAvgPool1d
        elif type == "2D":
            conv, bn, pool = nn.Conv2d, nn.BatchNorm2d, nn.AdaptiveAvgPool2d
        else:
            # Fix: the original did `raise f"..."`, which raises TypeError
            # (exceptions must derive from BaseException), masking the real
            # error. Message text kept identical.
            raise ValueError("the type is not supported.")

        def bottleneck_layers():
            # Pointwise squeeze-excite style bottleneck: C -> C/r -> C.
            return [
                conv(channels, inter_channels, kernel_size=1, stride=1, padding=0),
                bn(inter_channels),
                nn.ReLU(inplace=True),
                conv(inter_channels, channels, kernel_size=1, stride=1, padding=0),
                bn(channels),
            ]

        # Submodule names must match the original exactly so that pretrained
        # state dicts keep loading.
        # Local (per-position) attention.
        self.local_att = nn.Sequential(*bottleneck_layers())
        # Global attention (features pooled to a single position).
        self.global_att = nn.Sequential(pool(1), *bottleneck_layers())

        self.sigmoid = nn.Sigmoid()

    def forward(self, x, residual):
        """Fuse ``x`` and ``residual``; output has the same shape as ``x``."""
        flag = False
        xa = x + residual
        # BatchNorm cannot compute batch statistics from a single sample in
        # training mode; duplicate the sample and drop the copy afterwards.
        if xa.size(0) == 1:
            xa = torch.cat([xa, xa], dim=0)
            flag = True
        xl = self.local_att(xa)
        xg = self.global_att(xa)
        xlg = xl + xg
        wei = self.sigmoid(xlg)
        # The factor of 2 keeps the fused magnitude comparable to x + residual.
        xo = 2 * x * wei + 2 * residual * (1 - wei)
        if flag:
            xo = xo[0].unsqueeze(0)
        return xo
audioldm/clap/open_clip/htsat.py ADDED
@@ -0,0 +1,1308 @@