Dionyssos committed on
Commit
e70ad00
1 Parent(s): 3dfbf53

distortion on barging DEBUG

Browse files
Files changed (4) hide show
  1. audiocraft/lm.py +4 -6
  2. audiocraft/seanet.py +1 -0
  3. audiocraft/transformer.py +7 -36
  4. demo.py +21 -41
audiocraft/lm.py CHANGED
@@ -1,12 +1,10 @@
1
- from dataclasses import dataclass, field
2
- from itertools import chain
3
  import logging
4
  import math
5
- import re
6
  import typing as tp
7
  import torch
8
  import torch.nn.functional as F
9
- from audiocraft.transformer import StreamingTransformer, create_norm_fn
10
  from dataclasses import dataclass
11
  from functools import partial
12
  from torch import nn
@@ -173,7 +171,7 @@ class LMModel(nn.Module):
173
  super().__init__()
174
  self.cfg_coef = cfg_coef
175
 
176
- self.n_draw = 1
177
  self.condition_provider = condition_provider
178
  self.fuser = fuser
179
  self.card = card # 2048 ?
@@ -207,7 +205,7 @@ class LMModel(nn.Module):
207
  norm_first=norm_first, **kwargs)
208
  self.out_norm: tp.Optional[nn.Module] = None
209
  if norm_first:
210
- self.out_norm = create_norm_fn(norm, dim)
211
  self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
212
  self._init_weights(weight_init, depthwise_init, zero_bias_init)
213
  self._fsdp: tp.Optional[nn.Module]
 
1
+ from dataclasses import dataclass
 
2
  import logging
3
  import math
 
4
  import typing as tp
5
  import torch
6
  import torch.nn.functional as F
7
+ from audiocraft.transformer import StreamingTransformer
8
  from dataclasses import dataclass
9
  from functools import partial
10
  from torch import nn
 
171
  super().__init__()
172
  self.cfg_coef = cfg_coef
173
 
174
+ self.n_draw = 3
175
  self.condition_provider = condition_provider
176
  self.fuser = fuser
177
  self.card = card # 2048 ?
 
205
  norm_first=norm_first, **kwargs)
206
  self.out_norm: tp.Optional[nn.Module] = None
207
  if norm_first:
208
+ self.out_norm = nn.LayerNorm(dim, eps=1e-5)
209
  self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=bias_proj) for _ in range(n_q)])
210
  self._init_weights(weight_init, depthwise_init, zero_bias_init)
211
  self._fsdp: tp.Optional[nn.Module]
audiocraft/seanet.py CHANGED
@@ -102,6 +102,7 @@ class SEANetDecoder(nn.Module):
102
  ]
103
 
104
  if lstm:
 
105
  model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
106
 
107
  # Upsample to raw audio scale
 
102
  ]
103
 
104
  if lstm:
105
+ print('\n\n\n\nLSTM IN SEANET\n\n\n\n')
106
  model += [StreamableLSTM(mult * n_filters, num_layers=lstm)]
107
 
108
  # Upsample to raw audio scale
audiocraft/transformer.py CHANGED
@@ -21,24 +21,6 @@ def _get_attention_time_dimension(memory_efficient: bool) -> int:
21
 
22
 
23
 
24
-
25
-
26
- def create_norm_fn(norm_type: str, dim: int, **kwargs) -> nn.Module:
27
- """Create normalization module for transformer encoder layer.
28
-
29
- Args:
30
- norm_type (str): Normalization method.
31
- dim (int): Dimension of the normalized layer.
32
- **kwargs (dict): Additional parameters for normalization layer.
33
- Returns:
34
- nn.Module: Normalization module.
35
- """
36
- if norm_type == 'layer_norm':
37
- return nn.LayerNorm(dim, eps=1e-5, **kwargs)
38
- else:
39
- raise ValueError(f"Unknown norm type: {norm_type}")
40
-
41
-
42
  def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
43
  dtype: torch.dtype = torch.float32) -> torch.Tensor:
44
  """Create sinusoidal positional embedding, with shape `[B, T, C]`.
@@ -105,7 +87,7 @@ class StreamingMultiheadAttention(nn.Module):
105
  self.v_history = None # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
106
 
107
  self.memory_efficient = memory_efficient
108
- self.attention_as_float32 = attention_as_float32
109
 
110
  self.cross_attention = cross_attention
111
 
@@ -227,20 +209,9 @@ class StreamingMultiheadAttention(nn.Module):
227
 
228
 
229
  # KV COMPLETION ONLY ON SELF ATTENTION
230
- #======================================================
231
-
232
- # so the previous layer passes you here the k,v having concatenated all previous
233
- #
234
- # also return those 2 for the next transformer layer
235
- #
236
- # also clean up after ending the transformer? NOOOOOOOOOOOOO is goes along tokens
237
- #
238
- # also why completekv does not grow longer during the 47 transformers but changes sum
239
-
240
- # k, v = self._complete_kv(k, v)
241
- # print(k.sum(), v.sum(), k.shape, v.shape,'ATTNext')
242
 
243
- print(f'{self.attention_as_float32=}')
244
  if self.memory_efficient:
245
  # print('EVER IN MEMORY EFFICIENT A')
246
 
@@ -319,14 +290,14 @@ class StreamingTransformerLayer(nn.Module): #nn.TransformerEncoderLayer):
319
  self.dropout_cross = nn.Dropout(dropout)
320
 
321
  self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
322
- self.norm1 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
323
- self.norm2 = create_norm_fn(norm, d_model, **factory_kwargs) # type: ignore
324
 
325
 
326
  def forward(self,
327
  src,
328
  cross_attention_src=None): # txtcond
329
- '''T layer'''
330
 
331
  x = src
332
 
@@ -412,7 +383,7 @@ class StreamingTransformer(nn.Module):
412
 
413
 
414
  for j, lay in enumerate(self.layers):
415
- print(f'_________________________{j}___________________')
416
  x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # txt cond
417
  # each layer (mha) keeps history of its own k,v for all tokens
418
  return x
 
21
 
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def create_sin_embedding(positions: torch.Tensor, dim: int, max_period: float = 10000,
25
  dtype: torch.dtype = torch.float32) -> torch.Tensor:
26
  """Create sinusoidal positional embedding, with shape `[B, T, C]`.
 
87
  self.v_history = None # clean up IN LM after finishing GENERATION - Each 1...47 mha has different kv history
88
 
89
  self.memory_efficient = memory_efficient
90
+
91
 
92
  self.cross_attention = cross_attention
93
 
 
209
 
210
 
211
  # KV COMPLETION ONLY ON SELF ATTENTION
212
+
 
 
 
 
 
 
 
 
 
 
 
213
 
214
+
215
  if self.memory_efficient:
216
  # print('EVER IN MEMORY EFFICIENT A')
217
 
 
290
  self.dropout_cross = nn.Dropout(dropout)
291
 
292
  self.norm_cross = nn.LayerNorm(d_model, eps=1e-5, **factory_kwargs)
293
+ self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
294
+ self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
295
 
296
 
297
  def forward(self,
298
  src,
299
  cross_attention_src=None): # txtcond
300
+ '''T is saved float16 weights - should we cast src to float16'''
301
 
302
  x = src
303
 
 
383
 
384
 
385
  for j, lay in enumerate(self.layers):
386
+ # print(f'_________________________{j}___________________')
387
  x = lay(x, cross_attention_src=kwargs["cross_attention_src"]) # txt cond
388
  # each layer (mha) keeps history of its own k,v for all tokens
389
  return x
demo.py CHANGED
@@ -1,10 +1,7 @@
1
  import audiofile
2
  import numpy as np
3
- import typing as tp
4
  import torch
5
-
6
  from audiocraft.loaders import load_compression_model, load_lm_model
7
- from audiocraft.lm import LMModel
8
  from audiocraft.conditioners import ConditioningAttributes
9
 
10
 
@@ -15,57 +12,40 @@ class AudioGen():
15
  def __init__(self,
16
  compression_model=None,
17
  lm=None,
18
- duration=.04,
19
- top_k=249):
20
 
21
  self.compression_model = compression_model
22
  self.lm = lm
23
- self.top_k = top_k
24
- self.compression_model.eval()
25
- self.lm.eval()
26
  self.duration = duration
27
- self.device = next(iter(lm.parameters())).device
28
 
29
  @property
30
- def frame_rate(self) -> float:
31
- """Roughly the number of AR steps per seconds."""
32
  return self.compression_model.frame_rate
33
-
34
- @property
35
- def sample_rate(self) -> int:
36
- """Sample rate of the generated audio."""
37
- return self.compression_model.sample_rate
38
-
39
- def generate(self, descriptions):
40
- attributes = [
41
- ConditioningAttributes(text={'description': d}) for d in descriptions]
42
- tokens = self._generate_tokens(attributes)
43
- print(f'\n{tokens.shape=}\n{tokens=} FINAL 5 AUD')
44
- return self.generate_audio(tokens)
45
-
46
- def _generate_tokens(self, attributes):
47
- total_gen_len = int(self.duration * self.frame_rate)
48
- gen_tokens = self.lm.generate(conditions=attributes,
49
- max_gen_len=total_gen_len)
50
- gen_tokens = gen_tokens.transpose(0, 1).reshape(4, -1)[None, :, :]
51
- return gen_tokens
52
-
53
- def generate_audio(self, gen_tokens: torch.Tensor) -> torch.Tensor:
54
- """Generate Audio from tokens."""
55
- assert gen_tokens.dim() == 3
56
  with torch.no_grad():
57
- gen_audio = self.compression_model.decode(gen_tokens, None)
58
- return gen_audio
 
 
 
 
 
 
 
 
 
 
59
 
60
  device = 'cuda:0'
61
  # https://huggingface.co/facebook/audiogen-medium
62
 
63
 
64
  sound_generator = AudioGen(
65
- compression_model=load_compression_model('facebook/audiogen-medium', device=device),
66
- lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float),
67
- duration=.04,
68
- top_k=1)
69
 
70
 
71
 
@@ -79,7 +59,7 @@ print('\n\n\n\n___________________')
79
 
80
  txt = 'dogs barging in the street'
81
 
82
- x = sound_generator.generate([txt])[0].detach().cpu().numpy()[0, :]
83
  x /= np.abs(x).max() + 1e-7
84
 
85
  audiofile.write('del_seane.wav', x, 16000)
 
1
  import audiofile
2
  import numpy as np
 
3
  import torch
 
4
  from audiocraft.loaders import load_compression_model, load_lm_model
 
5
  from audiocraft.conditioners import ConditioningAttributes
6
 
7
 
 
12
  def __init__(self,
13
  compression_model=None,
14
  lm=None,
15
+ duration=.74):
 
16
 
17
  self.compression_model = compression_model
18
  self.lm = lm
 
 
 
19
  self.duration = duration
 
20
 
21
  @property
22
+ def frame_rate(self):
 
23
  return self.compression_model.frame_rate
24
+
25
+ def generate(self,
26
+ descriptions):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  with torch.no_grad():
28
+ attributes = [
29
+ ConditioningAttributes(text={'description': d}) for d in descriptions]
30
+ gen_tokens = self.lm.generate(
31
+ conditions=attributes,
32
+ max_gen_len=int(self.duration * self.frame_rate)) #[n_draw, 4, 37]
33
+ x = self.compression_model.decode(gen_tokens, None) #[n_draw, 1, 11840]
34
+ n_draw, _, n_time_samples = x.shape
35
+ x = x.reshape(1, n_draw * n_time_samples) # linearise n_draw
36
+ return x
37
+
38
+
39
+
40
 
41
  device = 'cuda:0'
42
  # https://huggingface.co/facebook/audiogen-medium
43
 
44
 
45
  sound_generator = AudioGen(
46
+ compression_model=load_compression_model('facebook/audiogen-medium', device=device).eval(),
47
+ lm=load_lm_model('facebook/audiogen-medium', device=device).to(torch.float).eval(),
48
+ duration=.74)
 
49
 
50
 
51
 
 
59
 
60
  txt = 'dogs barging in the street'
61
 
62
+ x = sound_generator.generate([txt])[0].detach().cpu().numpy()
63
  x /= np.abs(x).max() + 1e-7
64
 
65
  audiofile.write('del_seane.wav', x, 16000)