teticio committed on
Commit
c1e3d89
1 Parent(s): 5bc60f9

add n_iter to Mel and update test_mel notebook

Browse files
Files changed (2) hide show
  1. audiodiffusion/mel.py +20 -17
  2. notebooks/test_mel.ipynb +7 -1
audiodiffusion/mel.py CHANGED
@@ -9,15 +9,14 @@ from PIL import Image
9
 
10
  class Mel:
11
 
12
- def __init__(
13
- self,
14
- x_res: int = 256,
15
- y_res: int = 256,
16
- sample_rate: int = 22050,
17
- n_fft: int = 2048,
18
- hop_length: int = 512,
19
- top_db: int = 80
20
- ):
21
  """Class to convert audio to mel spectrograms and vice versa.
22
 
23
  Args:
@@ -27,6 +26,7 @@ class Mel:
27
  n_fft (int): length of the FFT window in samples
28
  hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
29
  top_db (int): loudest in decibels
 
30
  """
31
  self.x_res = x_res
32
  self.y_res = y_res
@@ -36,6 +36,7 @@ class Mel:
36
  self.n_mels = self.y_res
37
  self.slice_size = self.x_res * self.hop_length - 1
38
  self.top_db = top_db
 
39
  self.audio = None
40
 
41
  def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
@@ -94,13 +95,11 @@ class Mel:
94
  Returns:
95
  PIL Image: grayscale image of x_res x y_res
96
  """
97
- S = librosa.feature.melspectrogram(
98
- y=self.get_audio_slice(slice),
99
- sr=self.sr,
100
- n_fft=self.n_fft,
101
- hop_length=self.hop_length,
102
- n_mels=self.n_mels
103
- )
104
  log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
105
  bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
106
  0.5).astype(np.uint8)
@@ -121,5 +120,9 @@ class Mel:
121
  log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
122
  S = librosa.db_to_power(log_S)
123
  audio = librosa.feature.inverse.mel_to_audio(
124
- S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length)
 
 
 
 
125
  return audio
 
9
 
10
  class Mel:
11
 
12
+ def __init__(self,
13
+ x_res: int = 256,
14
+ y_res: int = 256,
15
+ sample_rate: int = 22050,
16
+ n_fft: int = 2048,
17
+ hop_length: int = 512,
18
+ top_db: int = 80,
19
+ n_iter: int = 32):
 
20
  """Class to convert audio to mel spectrograms and vice versa.
21
 
22
  Args:
 
26
  n_fft (int): length of the FFT window in samples
27
  hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
28
  top_db (int): loudest in decibels
29
+ n_iter (int): number of iterations for Griffin-Lim mel inversion
30
  """
31
  self.x_res = x_res
32
  self.y_res = y_res
 
36
  self.n_mels = self.y_res
37
  self.slice_size = self.x_res * self.hop_length - 1
38
  self.top_db = top_db
39
+ self.n_iter = n_iter
40
  self.audio = None
41
 
42
  def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
 
95
  Returns:
96
  PIL Image: grayscale image of x_res x y_res
97
  """
98
+ S = librosa.feature.melspectrogram(y=self.get_audio_slice(slice),
99
+ sr=self.sr,
100
+ n_fft=self.n_fft,
101
+ hop_length=self.hop_length,
102
+ n_mels=self.n_mels)
 
 
103
  log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
104
  bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) +
105
  0.5).astype(np.uint8)
 
120
  log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
121
  S = librosa.db_to_power(log_S)
122
  audio = librosa.feature.inverse.mel_to_audio(
123
+ S,
124
+ sr=self.sr,
125
+ n_fft=self.n_fft,
126
+ hop_length=self.hop_length,
127
+ n_iter=self.n_iter)
128
  return audio
notebooks/test_mel.ipynb CHANGED
@@ -41,7 +41,13 @@
41
  "metadata": {},
42
  "outputs": [],
43
  "source": [
44
- "mel = Mel()"
 
 
 
 
 
 
45
  ]
46
  },
47
  {
 
41
  "metadata": {},
42
  "outputs": [],
43
  "source": [
44
+ "# These are the default parameters. If you change any of them, you may have to adjust the others.\n",
45
+ "mel = Mel(x_res=256,\n",
46
+ " y_res=256,\n",
47
+ " hop_length=512,\n",
48
+ " sample_rate=22050,\n",
49
+ " n_fft=2048,\n",
50
+ " n_iter=32)"
51
  ]
52
  },
53
  {