Jaesung Huh committed
Commit 931ef66 • 1 Parent(s): ac6a529

change to from_pretrained

Files changed (4)
  1. __pycache__/model.cpython-38.pyc +0 -0
  2. app.py +3 -3
  3. model.py +37 -43
  4. requirements.txt +2 -1
__pycache__/model.cpython-38.pyc CHANGED
Binary files a/__pycache__/model.cpython-38.pyc and b/__pycache__/model.cpython-38.pyc differ
 
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
 import torch
 from model import ECAPA_gender
-
-model = ECAPA_gender({"C": 1024})
-model.load_state_dict(torch.load("gender_classifier.model", map_location="cpu"))
+# Load the model
+model = ECAPA_gender.from_pretrained("JaesungHuh/ecapa-gender")
+# model.load_state_dict(torch.load("gender_classifier.model", map_location="cpu"))
 
 model.eval()
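
The change above is the point of the commit: instead of constructing the model with a config dict and loading a local state dict, the app now pulls the weights straight from the Hub. A minimal way to exercise the new loading path outside Gradio (sample.wav is a placeholder path, not a file in this repo):

    from model import ECAPA_gender

    model = ECAPA_gender.from_pretrained("JaesungHuh/ecapa-gender")
    model.eval()
    # predict() loads the file, resamples to 16 kHz if needed,
    # and returns 'male' or 'female'.
    print(model.predict("sample.wav"))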
 
model.py CHANGED
@@ -1,14 +1,18 @@
+import math
+from typing import Optional
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 import torchaudio
 from torchaudio.functional import resample
-import math
+
+from huggingface_hub import PyTorchModelHubMixin
 
 
 class SEModule(nn.Module):
-    def __init__(self, channels, bottleneck=128):
+    def __init__(self, channels : int, bottleneck : int = 128) -> None:
         super(SEModule, self).__init__()
         self.se = nn.Sequential(
             nn.AdaptiveAvgPool1d(1),
@@ -19,13 +23,13 @@ class SEModule(nn.Module):
             nn.Sigmoid(),
         )
 
-    def forward(self, input):
+    def forward(self, input : torch.Tensor) -> torch.Tensor:
         x = self.se(input)
         return input * x
 
-class Bottle2neck(nn.Module):
 
-    def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8):
+class Bottle2neck(nn.Module):
+    def __init__(self, inplanes : int, planes : int, kernel_size : Optional[int] = None, dilation : Optional[int] = None, scale : int = 8) -> None:
         super(Bottle2neck, self).__init__()
         width = int(math.floor(planes / scale))
         self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1)
@@ -45,7 +49,7 @@ class Bottle2neck(nn.Module):
         self.width = width
         self.se = SEModule(planes)
 
-    def forward(self, x):
+    def forward(self, x : torch.Tensor) -> torch.Tensor:
         residual = x
         out = self.conv1(x)
         out = self.relu(out)
@@ -73,34 +77,12 @@ class Bottle2neck(nn.Module):
         out = self.se(out)
         out += residual
         return out
+
 
-class PreEmphasis(torch.nn.Module):
-
-    def __init__(self, coef: float = 0.97):
-        super().__init__()
-        self.coef = coef
-        self.register_buffer(
-            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
-        )
-
-    def forward(self, input: torch.tensor) -> torch.tensor:
-        input = input.unsqueeze(1)
-        input = F.pad(input, (1, 0), 'reflect')
-        return F.conv1d(input, self.flipped_filter).squeeze(1)
-
-
-class ECAPA_gender(nn.Module):
-    def __init__(self, config):
+class ECAPA_gender(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, C : int = 1024):
         super(ECAPA_gender, self).__init__()
-        self.config = config
-        C = config["C"]
-
-        self.torchfbank = torch.nn.Sequential(
-            PreEmphasis(),
-            torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
-                f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
-        )
-
+        self.C = C
         self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
         self.relu = nn.ReLU()
         self.bn1 = nn.BatchNorm1d(C)
@@ -121,13 +103,26 @@ class ECAPA_gender(nn.Module):
         self.fc6 = nn.Linear(3072, 192)
         self.bn6 = nn.BatchNorm1d(192)
         self.fc7 = nn.Linear(192, 2)
-        self.pred2gender = {0 : 'Male', 1 : 'Female'}
-
-    def forward(self, x):
-        with torch.no_grad():
-            x = self.torchfbank(x)+1e-6
-            x = x.log()
-            x = x - torch.mean(x, dim=-1, keepdim=True)
+        self.pred2gender = {0 : 'male', 1 : 'female'}
+
+    def logtorchfbank(self, x : torch.Tensor) -> torch.Tensor:
+        # Preemphasis
+        flipped_filter = torch.FloatTensor([-0.97, 1.]).unsqueeze(0).unsqueeze(0)
+        x = x.unsqueeze(1)
+        x = F.pad(x, (1, 0), 'reflect')
+        x = F.conv1d(x, flipped_filter).squeeze(1)
+
+        # Melspectrogram
+        x = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
+                f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80)(x) + 1e-6
+
+        # Log and normalize
+        x = x.log()
+        x = x - torch.mean(x, dim=-1, keepdim=True)
+        return x
+
+    def forward(self, x : torch.Tensor) -> torch.Tensor:
+        x = self.logtorchfbank(x)
 
         x = self.conv1(x)
         x = self.relu(x)
@@ -158,17 +153,16 @@ class ECAPA_gender(nn.Module):
 
         return x
 
-    def load_audio(self, path):
+    def load_audio(self, path : str) -> torch.Tensor:
         audio, sr = torchaudio.load(path)
         if sr != 16000:
             audio = resample(audio, sr, 16000)
         return audio
 
-    def predict(self, audio):
+    def predict(self, audio : torch.Tensor) -> torch.Tensor:
         audio = self.load_audio(audio)
         self.eval()
         with torch.no_grad():
             output = self.forward(audio)
         _, pred = output.max(1)
-        return self.pred2gender[pred.item()]
-
+        return self.pred2gender[pred.item()]
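
Inheriting from PyTorchModelHubMixin is what makes ECAPA_gender.from_pretrained(...) in app.py work: the mixin adds from_pretrained, save_pretrained, and push_to_hub, and it re-instantiates the class from the __init__ keyword arguments stored alongside the weights, which is why the config dict was replaced by a plain C : int = 1024 keyword. A sketch of how the old local checkpoint could have been published, assuming the gender_classifier.model file that app.py used to load:

    import torch
    from model import ECAPA_gender

    model = ECAPA_gender(C=1024)
    model.load_state_dict(torch.load("gender_classifier.model", map_location="cpu"))
    # save_pretrained writes the weights plus a config holding {"C": 1024};
    # push_to_hub uploads the same files to the given repo.
    model.save_pretrained("ecapa-gender")
    model.push_to_hub("JaesungHuh/ecapa-gender")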
 
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 torch
-torchaudio
+torchaudio
+pysoundfile
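
pysoundfile is presumably added so that torchaudio.load() in model.load_audio has a working decoding backend in the Space's environment. A quick sanity check, if one wants to verify the install:

    import torchaudio

    # With pysoundfile installed, the soundfile backend should be listed.
    print(torchaudio.list_audio_backends())  # e.g. ['soundfile']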