Heinrich Dinkel committed on
Commit
cdc66ce
·
1 Parent(s): 73556ff

update notebook

Browse files
Files changed (1) hide show
  1. notebook.ipynb +136 -115
notebook.ipynb CHANGED
@@ -25,7 +25,8 @@
25
  "from sklearn.model_selection import train_test_split\n",
26
  "from sklearn.metrics import accuracy_score\n",
27
  "import numpy as np\n",
28
- "from tqdm import tqdm"
 
29
  ]
30
  },
31
  {
@@ -34,29 +35,6 @@
34
  "metadata": {},
35
  "outputs": [],
36
  "source": [
37
- "class ESC50Dataset(Dataset):\n",
38
- " def __init__(self, audio_dir, metadata_path, sr=16000, max_length=160000):\n",
39
- " self.audio_dir = audio_dir\n",
40
- " self.sr = sr\n",
41
- " self.max_length = max_length\n",
42
- " self.metadata = pd.read_csv(metadata_path)\n",
43
- " \n",
44
- " def __len__(self):\n",
45
- " return len(self.metadata)\n",
46
- " \n",
47
- " def __getitem__(self, idx):\n",
48
- " row = self.metadata.iloc[idx]\n",
49
- " filename = row['filename']\n",
50
- " label = row['target']\n",
51
- " \n",
52
- " audio_path = os.path.join(self.audio_dir, filename)\n",
53
- " audio, sr = librosa.load(audio_path, sr=self.sr)\n",
54
- " \n",
55
- " audio_tensor = torch.tensor(audio).float()\n",
56
- " label_tensor = torch.tensor(label).long()\n",
57
- " \n",
58
- " return audio_tensor, label_tensor\n",
59
- "\n",
60
  "def download_esc50():\n",
61
  " import urllib.request\n",
62
  " import zipfile\n",
@@ -79,43 +57,131 @@
79
  "metadata": {},
80
  "outputs": [],
81
  "source": [
82
- "# Download dataset\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  "download_esc50()\n",
 
84
  "\n",
85
- "# Load model\n",
86
- "model = AutoModel.from_pretrained(\"mispeech/dashengtokenizer\", trust_remote_code=True)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  "\n",
88
- "# Get embedding dimension\n",
89
- "embedding_dim = 1280\n",
90
- "print(f\"Model embedding dimension: {embedding_dim}\")\n",
91
  "\n",
92
- "# Freeze model\n",
93
- "for param in model.parameters():\n",
94
- " param.requires_grad = False\n",
 
95
  "\n",
 
 
 
 
 
 
 
 
 
96
  "# Single linear layer\n",
97
  "classifier = nn.Linear(embedding_dim, 50) # 50 ESC-50 classes\n",
98
  "\n",
99
  "# Setup\n",
100
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
101
- "model.to(device)\n",
102
  "classifier.to(device)\n",
103
  "print(f\"Using device: {device}\")\n",
104
- "# Create datasets\n",
105
- "audio_dir = 'ESC-50/audio'\n",
106
- "metadata_path = 'ESC-50/meta/esc50.csv'\n",
107
- "\n",
108
- "dataset = ESC50Dataset(audio_dir, metadata_path)\n",
109
  "\n",
110
- "# Split into train/val\n",
111
- "train_idx, val_idx = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)\n",
112
- "train_dataset = torch.utils.data.Subset(dataset, train_idx)\n",
113
- "val_dataset = torch.utils.data.Subset(dataset, val_idx)\n",
114
- "\n",
115
- "train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=2)\n",
116
- "val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=2)\n",
117
- "\n",
118
- "print(f\"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}\")"
119
  ]
120
  },
121
  {
@@ -124,94 +190,49 @@
124
  "metadata": {},
125
  "outputs": [],
126
  "source": [
127
- "# Training setup\n",
128
- "optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)\n",
129
- "criterion = nn.CrossEntropyLoss()\n",
130
  "\n",
131
- "# Training loop\n",
132
  "for epoch in range(10):\n",
133
- " model.eval()\n",
134
  " classifier.train()\n",
135
  " \n",
136
  " # Training\n",
137
  " train_loss = 0\n",
138
  " train_preds = []\n",
139
  " train_labels = []\n",
140
- "\n",
141
- " pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/10 Training')\n",
142
- " for batch_audio, batch_labels in pbar:\n",
143
- " batch_audio = batch_audio.to(device)\n",
144
- " batch_labels = batch_labels.to(device)\n",
145
- "\n",
146
- " # Forward through frozen model\n",
147
- " with torch.no_grad(), torch.autocast(device_type='cuda'):\n",
148
- " features = model.encode(batch_audio)\n",
149
- " if isinstance(features, dict):\n",
150
- " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
151
- " if key in features:\n",
152
- " features = features[key]\n",
153
- " break\n",
154
- " else:\n",
155
- " features = list(features.values())[0]\n",
156
- "\n",
157
- " # Global average pooling if needed\n",
158
- " if features.dim() > 2:\n",
159
- " features = features.mean(dim=1)\n",
160
- "\n",
161
- " # Classifier\n",
162
- " logits = classifier(features)\n",
163
  " loss = criterion(logits, batch_labels)\n",
164
- "\n",
165
- " # Backward\n",
166
  " optimizer.zero_grad()\n",
167
  " loss.backward()\n",
168
  " optimizer.step()\n",
169
- "\n",
170
  " train_loss += loss.item()\n",
171
  " preds = torch.argmax(logits, dim=1)\n",
172
  " train_preds.extend(preds.cpu().numpy())\n",
173
  " train_labels.extend(batch_labels.cpu().numpy())\n",
174
- "\n",
175
- " # Update progress bar\n",
176
- " pbar.set_postfix({'loss': f'{loss.item():.4f}'})\n",
177
- "\n",
178
  " train_acc = accuracy_score(train_labels, train_preds)\n",
179
  " \n",
180
  " # Validation\n",
181
  " classifier.eval()\n",
182
- " val_preds = []\n",
183
- " val_labels = []\n",
184
- "\n",
185
- " with torch.no_grad(),torch.autocast(device_type='cuda'):\n",
186
- " pbar_val = tqdm(val_loader, desc=f'Epoch {epoch+1}/10 Validation')\n",
187
- " for batch_audio, batch_labels in pbar_val:\n",
188
- " batch_audio = batch_audio.to(device)\n",
189
- " batch_labels = batch_labels.to(device)\n",
190
- "\n",
191
- " features = model(batch_audio)\n",
192
- " if isinstance(features, dict):\n",
193
- " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
194
- " if key in features:\n",
195
- " features = features[key]\n",
196
- " break\n",
197
- " else:\n",
198
- " features = list(features.values())[0]\n",
199
- "\n",
200
- " if features.dim() > 2:\n",
201
- " features = features.mean(dim=1)\n",
202
- "\n",
203
- " logits = classifier(features)\n",
204
- " preds = torch.argmax(logits, dim=1)\n",
205
- " val_preds.extend(preds.cpu().numpy())\n",
206
- " val_labels.extend(batch_labels.cpu().numpy())\n",
207
- "\n",
208
- " # Update validation progress bar\n",
209
- " batch_acc = (preds == batch_labels).float().mean().item()\n",
210
- " pbar_val.set_postfix({'batch_acc': f'{batch_acc:.4f}'})\n",
211
- "\n",
212
- " val_acc = accuracy_score(val_labels, val_preds)\n",
213
  " \n",
214
- " print(f\"Epoch {epoch+1}/10 - Train Loss: {train_loss/len(train_loader):.4f} - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}\")"
215
  ]
216
  }
217
  ],
 
25
  "from sklearn.model_selection import train_test_split\n",
26
  "from sklearn.metrics import accuracy_score\n",
27
  "import numpy as np\n",
28
+ "from tqdm import tqdm\n",
29
+ "import pickle"
30
  ]
31
  },
32
  {
 
35
  "metadata": {},
36
  "outputs": [],
37
  "source": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  "def download_esc50():\n",
39
  " import urllib.request\n",
40
  " import zipfile\n",
 
57
  "metadata": {},
58
  "outputs": [],
59
  "source": [
60
+ "def extract_features():\n",
61
+ " \"\"\"Extract and save features for all ESC-50 audio files\"\"\"\n",
62
+ " \n",
63
+ " if os.path.exists('esc50_features.pkl'):\n",
64
+ " print(\"Features already extracted, loading from file...\")\n",
65
+ " with open('esc50_features.pkl', 'rb') as f:\n",
66
+ " return pickle.load(f)\n",
67
+ " \n",
68
+ " # Load model\n",
69
+ " model = AutoModel.from_pretrained(\"mispeech/dashengtokenizer\", trust_remote_code=True)\n",
70
+ " model.eval()\n",
71
+ " device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
72
+ " model.to(device)\n",
73
+ " \n",
74
+ " # Load metadata\n",
75
+ " metadata_path = 'ESC-50/meta/esc50.csv'\n",
76
+ " df = pd.read_csv(metadata_path)\n",
77
+ " \n",
78
+ " features_list = []\n",
79
+ " labels_list = []\n",
80
+ " \n",
81
+ " print(\"Extracting features...\")\n",
82
+ " for idx, row in tqdm(df.iterrows(), total=len(df)):\n",
83
+ " filename = row['filename']\n",
84
+ " label = row['target']\n",
85
+ " \n",
86
+ " audio_path = os.path.join('ESC-50/audio', filename)\n",
87
+ " \n",
88
+ " try:\n",
89
+ " # Load and preprocess audio\n",
90
+ " audio, sr = librosa.load(audio_path, sr=16000)\n",
91
+ " audio_tensor = torch.tensor(audio).float().unsqueeze(0).to(device)\n",
92
+ " \n",
93
+ " # Extract features\n",
94
+ " with torch.no_grad(), torch.autocast(device_type='cuda'):\n",
95
+ " features = model.encode(audio_tensor)\n",
96
+ " if isinstance(features, dict):\n",
97
+ " for key in ['last_hidden_state', 'embeddings', 'audio']:\n",
98
+ " if key in features:\n",
99
+ " features = features[key]\n",
100
+ " break\n",
101
+ " else:\n",
102
+ " features = list(features.values())[0]\n",
103
+ " \n",
104
+ " # Global average pooling\n",
105
+ " if features.dim() > 2:\n",
106
+ " features = features.mean(dim=1)\n",
107
+ " \n",
108
+ " features = features.squeeze().cpu().numpy()\n",
109
+ " \n",
110
+ " features_list.append(features)\n",
111
+ " labels_list.append(label)\n",
112
+ " \n",
113
+ " except Exception as e:\n",
114
+ " print(f\"Error processing {filename}: {e}\")\n",
115
+ " \n",
116
+ " # Save features\n",
117
+ " features_data = {\n",
118
+ " 'features': np.array(features_list),\n",
119
+ " 'labels': np.array(labels_list),\n",
120
+ " 'embedding_dim': features_list[0].shape[0]\n",
121
+ " }\n",
122
+ " \n",
123
+ " with open('esc50_features.pkl', 'wb') as f:\n",
124
+ " pickle.dump(features_data, f)\n",
125
+ " \n",
126
+ " print(f\"Features extracted: {len(features_list)} samples, embedding dim: {features_data['embedding_dim']}\")\n",
127
+ " return features_data"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# Download dataset and extract features\n",
137
  "download_esc50()\n",
138
+ "features_data = extract_features()\n",
139
  "\n",
140
+ "X = features_data['features']\n",
141
+ "y = features_data['labels']\n",
142
+ "embedding_dim = features_data['embedding_dim']\n",
143
+ "\n",
144
+ "print(f\"Features shape: {X.shape}, Labels shape: {y.shape}\")"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": null,
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "# Convert to PyTorch tensors\n",
154
+ "X_tensor = torch.tensor(X, dtype=torch.float32)\n",
155
+ "y_tensor = torch.tensor(y, dtype=torch.long)\n",
156
  "\n",
157
+ "# Split into train/val\n",
158
+ "train_idx, val_idx = train_test_split(range(len(X_tensor)), test_size=0.2, random_state=42)\n",
 
159
  "\n",
160
+ "X_train = X_tensor[train_idx]\n",
161
+ "y_train = y_tensor[train_idx]\n",
162
+ "X_val = X_tensor[val_idx]\n",
163
+ "y_val = y_tensor[val_idx]\n",
164
  "\n",
165
+ "print(f\"Train: {X_train.shape}, Val: {X_val.shape}\")"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
  "# Single linear layer\n",
175
  "classifier = nn.Linear(embedding_dim, 50) # 50 ESC-50 classes\n",
176
  "\n",
177
  "# Setup\n",
178
  "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
 
179
  "classifier.to(device)\n",
180
  "print(f\"Using device: {device}\")\n",
 
 
 
 
 
181
  "\n",
182
+ "# Training setup\n",
183
+ "optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)\n",
184
+ "criterion = nn.CrossEntropyLoss()"
 
 
 
 
 
 
185
  ]
186
  },
187
  {
 
190
  "metadata": {},
191
  "outputs": [],
192
  "source": [
193
+ "# Training loop (much faster since features are pre-extracted)\n",
194
+ "batch_size = 32\n",
 
195
  "\n",
 
196
  "for epoch in range(10):\n",
 
197
  " classifier.train()\n",
198
  " \n",
199
  " # Training\n",
200
  " train_loss = 0\n",
201
  " train_preds = []\n",
202
  " train_labels = []\n",
203
+ " \n",
204
+ " # Mini-batch training\n",
205
+ " for i in range(0, len(X_train), batch_size):\n",
206
+ " batch_features = X_train[i:i+batch_size].to(device)\n",
207
+ " batch_labels = y_train[i:i+batch_size].to(device)\n",
208
+ " \n",
209
+ " # Forward pass\n",
210
+ " logits = classifier(batch_features)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  " loss = criterion(logits, batch_labels)\n",
212
+ " \n",
213
+ " # Backward pass\n",
214
  " optimizer.zero_grad()\n",
215
  " loss.backward()\n",
216
  " optimizer.step()\n",
217
+ " \n",
218
  " train_loss += loss.item()\n",
219
  " preds = torch.argmax(logits, dim=1)\n",
220
  " train_preds.extend(preds.cpu().numpy())\n",
221
  " train_labels.extend(batch_labels.cpu().numpy())\n",
222
+ " \n",
 
 
 
223
  " train_acc = accuracy_score(train_labels, train_preds)\n",
224
  " \n",
225
  " # Validation\n",
226
  " classifier.eval()\n",
227
+ " with torch.no_grad():\n",
228
+ " val_features = X_val.to(device)\n",
229
+ " val_labels = y_val.cpu().numpy()\n",
230
+ " \n",
231
+ " val_logits = classifier(val_features)\n",
232
+ " val_preds = torch.argmax(val_logits, dim=1).cpu().numpy()\n",
233
+ " val_acc = accuracy_score(val_labels, val_preds)\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  " \n",
235
+ " print(f\"Epoch {epoch+1}/10 - Train Loss: {train_loss/len(range(0, len(X_train), batch_size)):.4f} - Train Acc: {train_acc:.4f} - Val Acc: {val_acc:.4f}\")"
236
  ]
237
  }
238
  ],