bawolf committed on
Commit
72a4f99
·
1 Parent(s): c7e92d2

read blog dataset better

Browse files
.gitignore CHANGED
@@ -35,6 +35,8 @@ ENV/
35
 
36
  # Project specific
37
  runs/
 
 
38
  checkpoints/
39
  *.pth
40
  *.ckpt
 
35
 
36
  # Project specific
37
  runs/
38
+ outputs/
39
+ runs_hyperparam/
40
  checkpoints/
41
  *.pth
42
  *.ckpt
script/hyperparameter_tuning.py CHANGED
@@ -1,24 +1,39 @@
1
  import optuna
2
  import os
 
 
 
 
 
3
 
4
- import os
5
  import sys
6
  sys.path.append(os.path.dirname(os.path.dirname(__file__)))
7
  from script.train import train_and_evaluate
8
  from src.utils.utils import create_run_directory
9
 
10
- def objective(trial, hyperparam_run_dir):
 
 
 
 
 
 
 
 
 
 
 
11
  config = {
12
- "clip_model": trial.suggest_categorical("clip_model", ["openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14"]),
13
- "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-4),
14
- "weight_decay": trial.suggest_loguniform("weight_decay", 1e-8, 1e-1),
15
- "unfreeze_layers": trial.suggest_int("unfreeze_layers", 1, 6),
16
- "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
17
- "gradient_clip_max_norm": trial.suggest_uniform("gradient_clip_max_norm", 0.1, 1.0),
18
  "augmentation_strength": trial.suggest_float("augmentation_strength", 0.0, 1.0),
19
  "crop_scale_min": trial.suggest_float("crop_scale_min", 0.6, 0.9),
20
  "max_frames": trial.suggest_int("max_frames", 5, 15),
21
- "sigma": trial.suggest_uniform("sigma", 0.1, 0.5),
22
  }
23
 
24
  class_labels = ["windmill", "halo", "swipe", "baby_mill"][:3]
@@ -27,9 +42,9 @@ def objective(trial, hyperparam_run_dir):
27
  config.update({
28
  "class_labels": class_labels,
29
  "num_classes": len(class_labels),
30
- "data_path": '../finetune/3moves_test',
31
- "num_epochs": 50, # Reduced for faster trials
32
- "patience": 10, # Adjusted for faster trials
33
  "image_size": 224,
34
  "crop_scale_max": 1.0,
35
  "normalization_mean": [0.485, 0.456, 0.406],
@@ -37,7 +52,7 @@ def objective(trial, hyperparam_run_dir):
37
  "overfitting_threshold": 10,
38
  })
39
 
40
- # Derive augmentation parameters from augmentation_strength
41
  config.update({
42
  "flip_probability": 0.5 * config["augmentation_strength"],
43
  "rotation_degrees": int(15 * config["augmentation_strength"]),
@@ -47,33 +62,184 @@ def objective(trial, hyperparam_run_dir):
47
  "hue_jitter": 0.1 * config["augmentation_strength"],
48
  })
49
 
50
- # Create a unique run directory for this trial
51
- config["run_dir"] = create_run_directory(prefix=f"trial", parent_dir=hyperparam_run_dir)
52
-
53
- # Run training and evaluation
54
- val_accuracy = train_and_evaluate(config)
55
- return val_accuracy
56
-
57
- def main():
58
- # Set up the study and optimize
59
- hyperparam_run_dir = create_run_directory(suffix='_hyperparam')
60
- study = optuna.create_study(direction="maximize")
61
- study.optimize(lambda trial: objective(trial, hyperparam_run_dir), n_trials=100) # Adjust the number of trials as needed
62
 
63
- # Save the study results
64
- study.trials_dataframe().to_csv(os.path.join(hyperparam_run_dir, 'study_results.csv'))
65
 
66
- print("Best trial:")
67
- trial = study.best_trial
68
- print(" Value: ", trial.value)
69
- print(" Params: ")
70
- for key, value in trial.params.items():
71
- print(" {}: {}".format(key, value))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # Save the best trial parameters
74
- with open(os.path.join(hyperparam_run_dir, 'best_params.txt'), 'w') as f:
75
- for key, value in trial.params.items():
76
- f.write(f"{key}: {value}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  if __name__ == "__main__":
79
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import optuna
2
  import os
3
+ from datetime import datetime
4
+ import pandas as pd
5
+ from pathlib import Path
6
+ import json
7
+ import math
8
 
 
9
  import sys
10
  sys.path.append(os.path.dirname(os.path.dirname(__file__)))
11
  from script.train import train_and_evaluate
12
  from src.utils.utils import create_run_directory
13
 
14
+ def create_hyperparam_directory():
15
+ """Create a parent directory for all hyperparameter searches"""
16
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
17
+ base_dir = "runs_hyperparam"
18
+ hyperparam_dir = os.path.join(base_dir, f"hyperparam_{timestamp}")
19
+ os.makedirs(hyperparam_dir, exist_ok=True)
20
+ return hyperparam_dir
21
+
22
+ def objective(trial, hyperparam_run_dir, data_path):
23
+ """Objective function for a single dataset"""
24
+
25
+ # Suggest the hyperparameters to search over for this trial
26
  config = {
27
+ "clip_model": trial.suggest_categorical("clip_model", ["openai/clip-vit-base-patch32", "openai/clip-vit-large-patch14"]),
28
+ "batch_size": trial.suggest_categorical("batch_size", [8,16,32]),
29
+ "unfreeze_layers": trial.suggest_int("unfreeze_layers", 1, 4),
30
+ "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
31
+ "weight_decay": trial.suggest_float("weight_decay", 1e-8, 1e-1, log=True),
32
+ "gradient_clip_max_norm": trial.suggest_float("gradient_clip_max_norm", 0.1, 1.0),
33
  "augmentation_strength": trial.suggest_float("augmentation_strength", 0.0, 1.0),
34
  "crop_scale_min": trial.suggest_float("crop_scale_min", 0.6, 0.9),
35
  "max_frames": trial.suggest_int("max_frames", 5, 15),
36
+ "sigma": trial.suggest_float("sigma", 0.1, 0.5),
37
  }
38
 
39
  class_labels = ["windmill", "halo", "swipe", "baby_mill"][:3]
 
42
  config.update({
43
  "class_labels": class_labels,
44
  "num_classes": len(class_labels),
45
+ "data_path": data_path,
46
+ "num_epochs": 50,
47
+ "patience": 10,
48
  "image_size": 224,
49
  "crop_scale_max": 1.0,
50
  "normalization_mean": [0.485, 0.456, 0.406],
 
52
  "overfitting_threshold": 10,
53
  })
54
 
55
+ # Derive augmentation parameters
56
  config.update({
57
  "flip_probability": 0.5 * config["augmentation_strength"],
58
  "rotation_degrees": int(15 * config["augmentation_strength"]),
 
62
  "hue_jitter": 0.1 * config["augmentation_strength"],
63
  })
64
 
65
+ # Create dataset-specific run directory
66
+ dataset_label = '_'.join(Path(data_path).parts[-2:]) # Get last two parts of path
67
+ trial_dir = create_run_directory(
68
+ prefix=f"trial_{dataset_label}",
69
+ parent_dir=hyperparam_run_dir
70
+ )
71
+ config["run_dir"] = trial_dir
 
 
 
 
 
72
 
 
 
73
 
74
+ # Run training and evaluation; any failure is logged and the trial scores -inf
75
+ try:
76
+ val_accuracy, vis_dir = train_and_evaluate(config)
77
+
78
+ if val_accuracy is None or math.isnan(val_accuracy) or math.isinf(val_accuracy):
79
+ raise ValueError(f"Invalid accuracy value: {val_accuracy}")
80
+
81
+ # Save trial info
82
+ trial_info = {
83
+ 'dataset': data_path,
84
+ 'dataset_label': dataset_label,
85
+ 'trial_number': trial.number,
86
+ 'parameters': trial.params,
87
+ 'value': val_accuracy,
88
+ 'visualization_dir': vis_dir
89
+ }
90
+
91
+ with open(os.path.join(trial_dir, 'trial_info.json'), 'w') as f:
92
+ json.dump(trial_info, f, indent=4)
93
+
94
+ return val_accuracy
95
+
96
+ except Exception as e:
97
+ print(f"Error in trial for {data_path}: {str(e)}")
98
+ # Log detailed error information
99
+ error_log_path = os.path.join(hyperparam_run_dir, 'error_log.txt')
100
+ with open(error_log_path, 'a') as f:
101
+ f.write(f"\nError in trial at {datetime.now()}:\n")
102
+ f.write(f"Dataset: {data_path}\n")
103
+ f.write(f"Error: {str(e)}\n")
104
+ f.write(f"Trial params: {trial.params}\n")
105
+ f.write("Stack trace:\n")
106
+ import traceback
107
+ f.write(traceback.format_exc())
108
+ f.write("\n" + "="*50 + "\n")
109
+
110
+ return float('-inf')
111
 
112
+ def run_hyperparameter_search(data_paths, n_trials=100):
113
+ """Run hyperparameter search for multiple datasets"""
114
+
115
+ # Create parent directory for all searches
116
+ parent_hyperparam_dir = create_hyperparam_directory()
117
+
118
+ # Store results for all datasets
119
+ all_results = {}
120
+
121
+ for data_path in data_paths:
122
+ print(f"\nStarting hyperparameter search for dataset: {data_path}")
123
+
124
+ # Create dataset-specific directory
125
+ dataset_label = '_'.join(Path(data_path).parts[-2:])
126
+ dataset_dir = os.path.join(parent_hyperparam_dir, f"search_{dataset_label}")
127
+ os.makedirs(dataset_dir, exist_ok=True)
128
+
129
+ # Create and run study with explicit trial count tracking
130
+ study = optuna.create_study(direction="maximize")
131
+ completed_trials = 0
132
+ failed_trials = []
133
+ total_attempts = 0
134
+ max_attempts = n_trials * 2
135
+ while completed_trials < n_trials and total_attempts < max_attempts:
136
+ try:
137
+ total_attempts += 1
138
+ study.optimize(
139
+ lambda trial: objective(trial, dataset_dir, data_path),
140
+ n_trials=1
141
+ )
142
+ # Only increment if the trial actually succeeded
143
+ if study.trials[-1].value != float('-inf'):
144
+ completed_trials += 1
145
+ print(f"Completed trial {completed_trials}/{n_trials} for {dataset_label}")
146
+ else:
147
+ error_info = {
148
+ 'trial_number': completed_trials + len(failed_trials) + 1,
149
+ 'error': "Trial returned -inf",
150
+ 'timestamp': datetime.now().isoformat()
151
+ }
152
+ failed_trials.append(error_info)
153
+ print(f"Failed trial for {dataset_label}: returned -inf")
154
+
155
+ except Exception as e:
156
+ error_info = {
157
+ 'trial_number': completed_trials + len(failed_trials) + 1,
158
+ 'error': str(e),
159
+ 'timestamp': datetime.now().isoformat()
160
+ }
161
+ failed_trials.append(error_info)
162
+ print(f"Error in trial for {dataset_label}: {str(e)}")
163
+
164
+ # Log the error
165
+ with open(os.path.join(dataset_dir, 'failed_trials.json'), 'w') as f:
166
+ json.dump(failed_trials, f, indent=4)
167
+ if total_attempts >= max_attempts:
168
+ print(f"Warning: Reached maximum attempts ({max_attempts}) for {dataset_label}")
169
+
170
+ # Save study results
171
+ results_df = study.trials_dataframe()
172
+ results_df.to_csv(os.path.join(dataset_dir, 'study_results.csv'))
173
+
174
+ # Save trial statistics
175
+ trial_stats = {
176
+ 'completed_trials': completed_trials,
177
+ 'failed_trials': len(failed_trials),
178
+ 'total_attempts': completed_trials + len(failed_trials)
179
+ }
180
+ with open(os.path.join(dataset_dir, 'trial_statistics.json'), 'w') as f:
181
+ json.dump(trial_stats, f, indent=4)
182
+
183
+ # Save best trial info
184
+ best_trial = study.best_trial
185
+ best_params_path = os.path.join(dataset_dir, 'best_params.txt')
186
+ with open(best_params_path, 'w') as f:
187
+ f.write(f"Best trial value: {best_trial.value}\n\n")
188
+ f.write("Best parameters:\n")
189
+ for key, value in best_trial.params.items():
190
+ f.write(f"{key}: {value}\n")
191
+
192
+ # Store results
193
+ all_results[data_path] = {
194
+ 'best_value': best_trial.value,
195
+ 'best_params': best_trial.params,
196
+ 'study': study,
197
+ 'results_df': results_df,
198
+ 'failed_trials': failed_trials,
199
+ 'trial_stats': trial_stats
200
+ }
201
+
202
+ print(f"\nResults for {data_path}:")
203
+ print(f"Completed trials: {completed_trials}")
204
+ print(f"Failed trials: {len(failed_trials)}")
205
+ print(f"Best trial value: {best_trial.value}")
206
+ print("Best parameters:")
207
+ for key, value in best_trial.params.items():
208
+ print(f" {key}: {value}")
209
+
210
+ # Create overall summary with additional statistics
211
+ summary_data = []
212
+ for data_path, result in all_results.items():
213
+ summary_data.append({
214
+ 'dataset': data_path,
215
+ 'best_accuracy': result['best_value'],
216
+ 'completed_trials': result['trial_stats']['completed_trials'],
217
+ 'failed_trials': result['trial_stats']['failed_trials'],
218
+ **result['best_params']
219
+ })
220
+
221
+ summary_df = pd.DataFrame(summary_data)
222
+ summary_df.to_csv(os.path.join(parent_hyperparam_dir, 'overall_summary.csv'), index=False)
223
+
224
+ return parent_hyperparam_dir, all_results
225
 
226
  if __name__ == "__main__":
227
+ # List of dataset paths to optimize
228
+ data_paths = [
229
+ '../finetune/blog/bryant/random',
230
+ '../finetune/blog/bryant/adjusted',
231
+ '../finetune/blog/youtube/random',
232
+ '../finetune/blog/youtube/adjusted',
233
+ '../finetune/blog/combined/random',
234
+ '../finetune/blog/combined/adjusted',
235
+ '../finetune/blog/bryant_train_youtube_val/default'
236
+ ]
237
+
238
+ # Run hyperparameter search
239
+ hyperparam_dir, results = run_hyperparameter_search(
240
+ data_paths,
241
+ n_trials=8 # Adjust as needed
242
+ )
243
+
244
+ print(f"\nHyperparameter search complete!")
245
+ print(f"Results are saved in: {hyperparam_dir}")
script/train.py CHANGED
@@ -7,6 +7,7 @@ import logging
7
  import csv
8
  import json
9
  from torch.optim.lr_scheduler import CosineAnnealingLR
 
10
 
11
  import sys
12
  sys.path.append(os.path.dirname(os.path.dirname(__file__)))
@@ -15,209 +16,253 @@ from src.utils.utils import create_run_directory
15
  from src.dataset.dataset import VideoDataset
16
  from src.models.model import create_model
17
  from src.dataset.video_utils import create_transform
 
 
18
 
19
  def train_and_evaluate(config):
20
- # Create a run directory if it doesn't exist
21
- if "run_dir" not in config:
22
- config["run_dir"] = create_run_directory()
23
-
24
- # Update paths based on run_dir
25
- config.update({
26
- "best_model_path": os.path.join(config["run_dir"], 'best_model.pth'),
27
- "final_model_path": os.path.join(config["run_dir"], 'final_model.pth'),
28
- "csv_path": os.path.join(config["run_dir"], 'training_log.csv'),
29
- "misclassifications_dir": os.path.join(config["run_dir"], 'misclassifications'),
30
- })
31
-
32
- config_path = os.path.join(config["run_dir"], 'config.json')
33
- with open(config_path, 'w') as f:
34
- json.dump(config, f, indent=2)
35
-
36
- # Set up logging
37
- logging.basicConfig(level=logging.INFO,
38
- format='%(asctime)s - %(levelname)s - %(message)s',
39
- handlers=[logging.FileHandler(os.path.join(config["run_dir"], 'training.log')),
40
- logging.StreamHandler()])
41
- logger = logging.getLogger(__name__)
42
-
43
- # Set device
44
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
45
- logger.info(f"Using device: {device}")
46
-
47
- # Initialize variables
48
- best_val_loss = float('inf')
49
- epochs_without_improvement = 0
50
-
51
- model = create_model(config["num_classes"], config["clip_model"])
52
-
53
- # Unfreeze the last 2 layers of the vision encoder
54
- model.unfreeze_vision_encoder(num_layers=config["unfreeze_layers"])
55
-
56
- # Move model to device
57
- model = model.to(device)
58
- logger.info(f"Model architecture:\n{model}")
59
-
60
- # Load datasets
61
- train_dataset = VideoDataset(
62
- os.path.join(config['data_path'], 'train.csv'),
63
- config=config
64
- )
65
-
66
- # For validation, create a new config with training=False for transforms
67
- val_config = config.copy()
68
- val_dataset = VideoDataset(
69
- os.path.join(config['data_path'], 'val.csv'),
70
- config=val_config,
71
- transform=create_transform(config, training=False)
72
- )
73
-
74
- # Create data loaders
75
- train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
76
- val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)
77
-
78
- # Define optimizer and learning rate scheduler
79
- optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
80
- scheduler = CosineAnnealingLR(optimizer, T_max=config["num_epochs"])
81
-
82
- # Open a CSV file to log training progress
83
- with open(config["csv_path"], 'w', newline='') as file:
84
- writer = csv.writer(file)
85
- writer.writerow(["epoch", "train_loss", "train_accuracy", "val_loss", "val_accuracy"])
86
-
87
- # Function to calculate accuracy
88
- def calculate_accuracy(outputs, labels):
89
- _, predicted = torch.max(outputs, 1)
90
- correct = (predicted == labels).sum().item()
91
- total = labels.size(0)
92
- return correct / total
93
-
94
- def log_misclassifications(outputs, labels, video_paths, dataset, misclassified_videos):
95
- _, predicted = torch.max(outputs, 1)
96
- for pred, label, video_path in zip(predicted, labels, video_paths):
97
- if pred != label:
98
- true_label = dataset.label_map[label.item()]
99
- predicted_label = dataset.label_map[pred.item()]
100
- misclassified_videos.append({
101
- 'video_path': video_path,
102
- 'true_label': true_label,
103
- 'predicted_label': predicted_label
104
- })
105
-
106
- # Create a subfolder for misclassification logs
107
- os.makedirs(config["misclassifications_dir"], exist_ok=True)
108
-
109
- # Training loop
110
- for epoch in range(config["num_epochs"]):
111
- model.train()
112
- total_loss = 0
113
- total_accuracy = 0
114
- for frames, labels, video_paths in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{config['num_epochs']}"):
115
- frames = frames.to(device)
116
- labels = labels.to(device)
117
-
118
- logits = model(frames)
119
-
120
- loss = torch.nn.functional.cross_entropy(logits, labels)
121
- accuracy = calculate_accuracy(logits, labels)
122
-
123
- optimizer.zero_grad()
124
- loss.backward()
125
- clip_grad_norm_(model.parameters(), max_norm=config["gradient_clip_max_norm"])
126
- optimizer.step()
127
-
128
- total_loss += loss.item()
129
- total_accuracy += accuracy
130
 
131
- avg_train_loss = total_loss / len(train_loader)
132
- avg_train_accuracy = total_accuracy / len(train_loader)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Validation
135
- model.eval()
136
- val_loss = 0
137
- val_accuracy = 0
138
- misclassified_videos = []
139
- with torch.no_grad():
140
- for frames, labels, video_paths in val_loader:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  frames = frames.to(device)
142
  labels = labels.to(device)
143
 
144
  logits = model(frames)
145
 
146
- loss = torch.nn.functional.cross_entropy(logits, labels)
147
  accuracy = calculate_accuracy(logits, labels)
148
 
149
- val_loss += loss.item()
150
- val_accuracy += accuracy
 
 
151
 
152
- # Log misclassifications
153
- log_misclassifications(logits, labels, video_paths, val_dataset, misclassified_videos)
154
-
155
- avg_val_loss = val_loss / len(val_loader)
156
- avg_val_accuracy = val_accuracy / len(val_loader)
157
-
158
- # Log misclassified videos
159
- if misclassified_videos:
160
- misclassified_log_path = os.path.join(config["misclassifications_dir"], f'epoch_{epoch+1}.json')
161
- with open(misclassified_log_path, 'w') as f:
162
- json.dump(misclassified_videos, f, indent=2)
163
- logger.info(f"Logged {len(misclassified_videos)} misclassified videos to {misclassified_log_path}")
164
-
165
- # Log the metrics
166
- logger.info(f"Epoch [{epoch+1}/{config['num_epochs']}], "
167
- f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy*100:.2f}%, "
168
- f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_accuracy*100:.2f}%")
169
-
170
- # Write to CSV
171
- with open(config["csv_path"], 'a', newline='') as file:
172
- writer = csv.writer(file)
173
- writer.writerow([epoch+1, avg_train_loss, avg_train_accuracy*100, avg_val_loss, avg_val_accuracy*100])
174
-
175
- # Learning rate scheduling
176
- scheduler.step()
177
-
178
- # Save the best model and check for early stopping
179
- if avg_val_loss < best_val_loss:
180
- best_val_loss = avg_val_loss
181
- torch.save(model.state_dict(), config["best_model_path"])
182
- logger.info(f"Saved best model to {config['best_model_path']}")
183
- epochs_without_improvement = 0
184
- else:
185
- epochs_without_improvement += 1
186
-
187
- # Early stopping check
188
- if epochs_without_improvement >= config["patience"]:
189
- logger.info(f"Early stopping triggered after {config['patience']} epochs without improvement")
190
- break
191
-
192
- # Overfitting detection
193
- if avg_train_accuracy - avg_val_accuracy > config["overfitting_threshold"]:
194
- logger.warning("Possible overfitting detected")
195
-
196
- logger.info("Training finished!")
197
-
198
- # Save the final model
199
- torch.save(model.state_dict(), config["final_model_path"])
200
- logger.info(f"Saved final model to {config['final_model_path']}")
201
-
202
- # Save run information
203
- with open(os.path.join(config["run_dir"], 'run_info.txt'), 'w') as f:
204
- for key, value in config.items():
205
- f.write(f"{key}: {value}\n")
206
- f.write(f"Device: {device}\n")
207
- f.write(f"Model: {model.__class__.__name__}\n")
208
- f.write(f"Optimizer: {optimizer.__class__.__name__}\n")
209
- f.write(f"Scheduler: {scheduler.__class__.__name__}\n")
210
- f.write(f"Loss function: CrossEntropyLoss\n")
211
- f.write(f"Data augmentation: RandomHorizontalFlip, RandomRotation(5), ColorJitter\n")
212
- f.write(f"Mixed precision training: {'Enabled' if 'scaler' in locals() else 'Disabled'}\n")
213
- f.write(f"Train dataset size: {len(train_dataset)}\n")
214
- f.write(f"Validation dataset size: {len(val_dataset)}\n")
215
- f.write(f"Vision encoder frozen: {'Partially' if hasattr(model, 'unfreeze_vision_encoder') else 'Unknown'}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
 
 
 
 
 
 
 
217
 
218
- print("Script finished.")
 
 
 
 
219
 
220
- return avg_val_accuracy
 
 
 
 
221
 
222
  def main():
223
  # Create run directory
@@ -228,35 +273,57 @@ def main():
228
  config = {
229
  "class_labels": class_labels,
230
  "num_classes": len(class_labels),
231
- "data_path": '../finetune/3moves_otherpeopleval',
232
  "batch_size": 32,
233
- "learning_rate": 2e-6,
234
- "weight_decay": 0.007,
 
 
 
 
 
 
 
235
  "num_epochs": 50,
236
- "patience": 10, # for early stopping
237
- "max_frames": 10,
238
- "sigma": 0.3,
239
  "image_size": 224,
240
- "flip_probability": 0.5,
241
- "rotation_degrees": 15,
242
- "brightness_jitter": 0.2,
243
- "contrast_jitter": 0.2,
244
- "saturation_jitter": 0.2,
245
- "hue_jitter": 0.1,
246
- "crop_scale_min": 0.8,
247
  "crop_scale_max": 1.0,
248
- "normalization_mean": [0.485, 0.456, 0.406],
249
- "normalization_std": [0.229, 0.224, 0.225],
250
- "unfreeze_layers": 3,
251
- "clip_model": "openai/clip-vit-large-patch14",
252
- # "clip_model": "openai/clip-vit-base-patch32",
253
- "gradient_clip_max_norm": 1.0,
 
 
 
 
254
  "overfitting_threshold": 10,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  "run_dir": run_dir,
256
- "best_model_path": os.path.join(run_dir, 'best_model.pth'),
257
- "final_model_path": os.path.join(run_dir, 'final_model.pth'),
258
- "csv_path": os.path.join(run_dir, 'training_log.csv'),
259
- "misclassifications_dir": os.path.join(run_dir, 'misclassifications'),
260
  }
261
  train_and_evaluate(config)
262
 
 
7
  import csv
8
  import json
9
  from torch.optim.lr_scheduler import CosineAnnealingLR
10
+ import math
11
 
12
  import sys
13
  sys.path.append(os.path.dirname(os.path.dirname(__file__)))
 
16
  from src.dataset.dataset import VideoDataset
17
  from src.models.model import create_model
18
  from src.dataset.video_utils import create_transform
19
+ from visualization.visualize import run_visualization
20
+ from visualization.miscalculations_report import analyze_misclassifications
21
 
22
  def train_and_evaluate(config):
23
+ try:
24
+ # Create a run directory if it doesn't exist
25
+ if "run_dir" not in config:
26
+ config["run_dir"] = create_run_directory()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ # Update paths based on run_dir
29
+ config.update({
30
+ "best_model_path": os.path.join(config["run_dir"], 'best_model.pth'),
31
+ "final_model_path": os.path.join(config["run_dir"], 'final_model.pth'),
32
+ "csv_path": os.path.join(config["run_dir"], 'training_log.csv'),
33
+ "misclassifications_dir": os.path.join(config["run_dir"], 'misclassifications'),
34
+ })
35
+
36
+ config_path = os.path.join(config["run_dir"], 'config.json')
37
+ with open(config_path, 'w') as f:
38
+ json.dump(config, f, indent=2)
39
+
40
+ # Set up logging
41
+ logging.basicConfig(level=logging.INFO,
42
+ format='%(asctime)s - %(levelname)s - %(message)s',
43
+ handlers=[logging.FileHandler(os.path.join(config["run_dir"], 'training.log')),
44
+ logging.StreamHandler()])
45
+ logger = logging.getLogger(__name__)
46
+
47
+ # Select CUDA when available, otherwise fall back to CPU
48
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49
+ logger.info(f"Using device: {device}")
50
+
51
+ if torch.cuda.is_available():
52
+ torch.cuda.empty_cache()
53
+
54
+ # Initialize variables
55
+ best_val_loss = float('inf')
56
+ epochs_without_improvement = 0
57
+
58
+ if torch.cuda.is_available():
59
+ torch.cuda.empty_cache()
60
+ print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f}GB")
61
+ print(f"Currently allocated: {torch.cuda.memory_allocated()/1e9:.2f}GB")
62
+
63
+ model = create_model(config["num_classes"], config["clip_model"])
64
+ # Unfreeze config["unfreeze_layers"] layers of the vision encoder
65
+ model.unfreeze_vision_encoder(num_layers=config["unfreeze_layers"])
66
+ model = model.to(device)
67
+
68
+ # Ensure criterion is on the same device
69
+ criterion = torch.nn.CrossEntropyLoss().to(device)
70
+
71
+ # logger.info(f"Model architecture:\n{model}")
72
+
73
+ # Load datasets
74
+ train_dataset = VideoDataset(
75
+ os.path.join(config['data_path'], 'train.csv'),
76
+ config=config
77
+ )
78
 
79
+ # For validation, copy the config and build the transform with training=False
80
+ val_config = config.copy()
81
+ val_dataset = VideoDataset(
82
+ os.path.join(config['data_path'], 'val.csv'),
83
+ config=val_config,
84
+ transform=create_transform(config, training=False)
85
+ )
86
+
87
+ # Create data loaders
88
+ train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
89
+ val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)
90
+
91
+ # Define optimizer and learning rate scheduler
92
+ optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
93
+ scheduler = CosineAnnealingLR(optimizer, T_max=config["num_epochs"])
94
+
95
+ # Open a CSV file to log training progress
96
+ with open(config["csv_path"], 'w', newline='') as file:
97
+ writer = csv.writer(file)
98
+ writer.writerow(["epoch", "train_loss", "train_accuracy", "val_loss", "val_accuracy"])
99
+
100
+ # Function to calculate accuracy
101
+ def calculate_accuracy(outputs, labels):
102
+ _, predicted = torch.max(outputs, 1)
103
+ correct = (predicted == labels).sum().item()
104
+ total = labels.size(0)
105
+ return correct / total
106
+
107
+ def log_misclassifications(outputs, labels, video_paths, dataset, misclassified_videos):
108
+ _, predicted = torch.max(outputs, 1)
109
+ for pred, label, video_path in zip(predicted, labels, video_paths):
110
+ if pred != label:
111
+ true_label = dataset.label_map[label.item()]
112
+ predicted_label = dataset.label_map[pred.item()]
113
+ misclassified_videos.append({
114
+ 'video_path': video_path,
115
+ 'true_label': true_label,
116
+ 'predicted_label': predicted_label
117
+ })
118
+
119
+ # Create a subfolder for misclassification logs
120
+ os.makedirs(config["misclassifications_dir"], exist_ok=True)
121
+
122
+ # Training loop
123
+ for epoch in range(config["num_epochs"]):
124
+ model.train()
125
+ total_loss = 0
126
+ total_accuracy = 0
127
+ for frames, labels, video_paths in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{config['num_epochs']}"):
128
  frames = frames.to(device)
129
  labels = labels.to(device)
130
 
131
  logits = model(frames)
132
 
133
+ loss = criterion(logits, labels)
134
  accuracy = calculate_accuracy(logits, labels)
135
 
136
+ optimizer.zero_grad()
137
+ loss.backward()
138
+ clip_grad_norm_(model.parameters(), max_norm=config["gradient_clip_max_norm"])
139
+ optimizer.step()
140
 
141
+ total_loss += loss.item()
142
+ total_accuracy += accuracy
143
+
144
+ avg_train_loss = total_loss / len(train_loader)
145
+ avg_train_accuracy = total_accuracy / len(train_loader)
146
+
147
+ # Validation
148
+ model.eval()
149
+ val_loss = 0
150
+ val_accuracy = 0
151
+ misclassified_videos = []
152
+ with torch.no_grad():
153
+ for frames, labels, video_paths in val_loader:
154
+ frames = frames.to(device)
155
+ labels = labels.to(device)
156
+
157
+ logits = model(frames)
158
+
159
+ loss = criterion(logits, labels)
160
+ accuracy = calculate_accuracy(logits, labels)
161
+
162
+ val_loss += loss.item()
163
+ val_accuracy += accuracy
164
+
165
+ # Log misclassifications
166
+ log_misclassifications(logits, labels, video_paths, val_dataset, misclassified_videos)
167
+
168
+ avg_val_loss = val_loss / len(val_loader)
169
+ avg_val_accuracy = val_accuracy / len(val_loader)
170
+
171
+ # Log misclassified videos
172
+ if misclassified_videos:
173
+ misclassified_log_path = os.path.join(config["misclassifications_dir"], f'epoch_{epoch+1}.json')
174
+ with open(misclassified_log_path, 'w') as f:
175
+ json.dump(misclassified_videos, f, indent=2)
176
+ logger.info(f"Logged {len(misclassified_videos)} misclassified videos to {misclassified_log_path}")
177
+
178
+ # Log the metrics
179
+ logger.info(f"Epoch [{epoch+1}/{config['num_epochs']}], "
180
+ f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {avg_train_accuracy*100:.2f}%, "
181
+ f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {avg_val_accuracy*100:.2f}%")
182
+
183
+ # Write to CSV
184
+ with open(config["csv_path"], 'a', newline='') as file:
185
+ writer = csv.writer(file)
186
+ writer.writerow([epoch+1, avg_train_loss, avg_train_accuracy*100, avg_val_loss, avg_val_accuracy*100])
187
+
188
+ # Learning rate scheduling
189
+ scheduler.step()
190
+
191
+ # Save the best model and check for early stopping
192
+ if avg_val_loss < best_val_loss:
193
+ best_val_loss = avg_val_loss
194
+ torch.save(model.state_dict(), config["best_model_path"])
195
+ logger.info(f"Saved best model to {config['best_model_path']}")
196
+ epochs_without_improvement = 0
197
+ else:
198
+ epochs_without_improvement += 1
199
+
200
+ # Early stopping check
201
+ if epochs_without_improvement >= config["patience"]:
202
+ logger.info(f"Early stopping triggered after {config['patience']} epochs without improvement")
203
+ break
204
+
205
+ # Overfitting detection
206
+ if avg_train_accuracy - avg_val_accuracy > config["overfitting_threshold"]:
207
+ logger.warning("Possible overfitting detected")
208
+
209
+ logger.info("Training finished!")
210
+
211
+ # Save the final model
212
+ torch.save(model.state_dict(), config["final_model_path"])
213
+ logger.info(f"Saved final model to {config['final_model_path']}")
214
+
215
+ # Save run information
216
+ with open(os.path.join(config["run_dir"], 'run_info.txt'), 'w') as f:
217
+ for key, value in config.items():
218
+ f.write(f"{key}: {value}\n")
219
+ f.write(f"Device: {device}\n")
220
+ f.write(f"Model: {model.__class__.__name__}\n")
221
+ f.write(f"Optimizer: {optimizer.__class__.__name__}\n")
222
+ f.write(f"Scheduler: {scheduler.__class__.__name__}\n")
223
+ f.write(f"Loss function: CrossEntropyLoss\n")
224
+ f.write(f"Data augmentation: RandomHorizontalFlip, RandomRotation(5), ColorJitter\n")
225
+ f.write(f"Mixed precision training: {'Enabled' if 'scaler' in locals() else 'Disabled'}\n")
226
+ f.write(f"Train dataset size: {len(train_dataset)}\n")
227
+ f.write(f"Validation dataset size: {len(val_dataset)}\n")
228
+ f.write(f"Vision encoder frozen: {'Partially' if hasattr(model, 'unfreeze_vision_encoder') else 'Unknown'}\n")
229
+
230
+ # Run visualization
231
+ try:
232
+ logger.info("Running visualization...")
233
+ vis_dir, confusion_matrix = run_visualization(config["run_dir"])
234
+ logger.info(f"Visualization complete! Check the output directory: {vis_dir}")
235
+
236
+ # Log confusion matrix results
237
+ class_accuracies = confusion_matrix.diagonal() / confusion_matrix.sum(axis=1)
238
+ overall_accuracy = confusion_matrix.diagonal().sum() / confusion_matrix.sum()
239
+
240
+ logger.info("\nConfusion Matrix Results:")
241
+ for i, (label, accuracy) in enumerate(zip(config['class_labels'], class_accuracies)):
242
+ logger.info(f"{label}: {accuracy:.2%}")
243
+ logger.info(f"Overall Accuracy: {overall_accuracy:.2%}")
244
+
245
+ except Exception as e:
246
+ logger.error(f"Error running visualization: {str(e)}")
247
 
248
+ # Run misclassification analysis
249
+ try:
250
+ analyze_misclassifications(config["run_dir"])
251
+ logger.info(f"Misclassification analysis complete! Check the output directory: {config['run_dir']}")
252
+ except Exception as e:
253
+ logger.error(f"Error running misclassification analysis: {str(e)}")
254
 
255
+
256
+ if math.isnan(avg_val_accuracy) or math.isinf(avg_val_accuracy):
257
+ raise ValueError(f"Invalid validation accuracy: {avg_val_accuracy}")
258
+
259
+ print("Script finished.")
260
 
261
+ return avg_val_accuracy, vis_dir
262
+
263
+ except Exception as e:
264
+ logger.error(f"Training error: {str(e)}")
265
+ raise # Re-raise the exception to be caught by the hyperparameter tuning
266
 
267
  def main():
268
  # Create run directory
 
273
  config = {
274
  "class_labels": class_labels,
275
  "num_classes": len(class_labels),
276
+ "clip_model": "openai/clip-vit-large-patch14",
277
  "batch_size": 32,
278
+ "unfreeze_layers": 4,
279
+ "learning_rate": 5.305885796107412e-06,
280
+ "weight_decay": 4.543630233732527e-07,
281
+ "gradient_clip_max_norm": 0.6446650879658523,
282
+ "augmentation_strength": 0.5827616006715585,
283
+ "crop_scale_min": 0.7872781274088598,
284
+ "max_frames": 15,
285
+ "sigma": 0.286510943464138,
286
+ "data_path": "../finetune/blog/bryant/random",
287
  "num_epochs": 50,
288
+ "patience": 10,
 
 
289
  "image_size": 224,
 
 
 
 
 
 
 
290
  "crop_scale_max": 1.0,
291
+ "normalization_mean": [
292
+ 0.485,
293
+ 0.456,
294
+ 0.406
295
+ ],
296
+ "normalization_std": [
297
+ 0.229,
298
+ 0.224,
299
+ 0.225
300
+ ],
301
  "overfitting_threshold": 10,
302
+ # "data_path": '../finetune/blog/bryant/random',
303
+ # "batch_size": 8,
304
+ # "learning_rate": 2e-6,
305
+ # "weight_decay": 0.007,
306
+ # "num_epochs": 2,
307
+ # "patience": 10, # for early stopping
308
+ # "max_frames": 10,
309
+ # "sigma": 0.3,
310
+ # "image_size": 224,
311
+ # "flip_probability": 0.5,
312
+ # "rotation_degrees": 15,
313
+ # "brightness_jitter": 0.2,
314
+ # "contrast_jitter": 0.2,
315
+ # "saturation_jitter": 0.2,
316
+ # "hue_jitter": 0.1,
317
+ # "crop_scale_min": 0.8,
318
+ # "crop_scale_max": 1.0,
319
+ # "normalization_mean": [0.485, 0.456, 0.406],
320
+ # "normalization_std": [0.229, 0.224, 0.225],
321
+ # "unfreeze_layers": 3,
322
+ # # "clip_model": "openai/clip-vit-large-patch14",
323
+ # "clip_model": "openai/clip-vit-base-patch32",
324
+ # "gradient_clip_max_norm": 1.0,
325
+ # "overfitting_threshold": 10,
326
  "run_dir": run_dir,
 
 
 
 
327
  }
328
  train_and_evaluate(config)
329
 
script/visualization/visualize.py CHANGED
@@ -110,28 +110,28 @@ def generate_evaluation_metrics(model, data_loader, device, output_dir, class_la
110
 
111
  return cm
112
 
113
- if __name__ == "__main__":
114
- # Find the most recent run directory
115
- #
116
- run_dir = get_latest_run_dir()
117
- # run_dir= "/home/bawolf/workspace/break/clip/runs/run_20241024-150232_otherpeopleval_large_model"
118
- # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241022-122939_3moves_balanced"
119
-
 
 
120
  # Load configuration
121
  config = get_config(run_dir)
122
 
123
  class_labels = config['class_labels']
124
  num_classes = config['num_classes']
125
- data_path = config['data_path']
126
- # data_path= '../finetune/3moves_otherpeopleval'
127
- # data_path = '../finetune/otherpeople3moves'
128
 
129
  # Paths
130
  log_file = os.path.join(run_dir, 'training_log.csv')
131
  model_path = get_latest_model_path(run_dir)
132
- test_csv = os.path.join(data_path, 'test.csv')
133
- # test_csv = os.path.join(data_path, 'val.csv')
134
- # test_csv = os.path.join(data_path, 'train.csv')
135
 
136
  # Get the last directory of data_path and the file name
137
  last_dir = os.path.basename(os.path.normpath(data_path))
@@ -160,3 +160,12 @@ if __name__ == "__main__":
160
  cm = generate_evaluation_metrics(model, test_loader, device, vis_dir, class_labels, data_info)
161
 
162
  print(f"Visualization complete! Check the output directory: {vis_dir}")
 
 
 
 
 
 
 
 
 
 
110
 
111
  return cm
112
 
113
+ def run_visualization(run_dir, data_path=None, test_csv=None):
114
+ """
115
+ Run visualization for a specific training run
116
+
117
+ Args:
118
+ run_dir (str): Path to the run directory
119
+ data_path (str, optional): Override the data path from config
120
+ test_csv (str, optional): Override the test CSV path
121
+ """
122
  # Load configuration
123
  config = get_config(run_dir)
124
 
125
  class_labels = config['class_labels']
126
  num_classes = config['num_classes']
127
+ data_path = data_path or config['data_path']
 
 
128
 
129
  # Paths
130
  log_file = os.path.join(run_dir, 'training_log.csv')
131
  model_path = get_latest_model_path(run_dir)
132
+
133
+ if test_csv is None:
134
+ test_csv = os.path.join(data_path, 'test.csv')
135
 
136
  # Get the last directory of data_path and the file name
137
  last_dir = os.path.basename(os.path.normpath(data_path))
 
160
  cm = generate_evaluation_metrics(model, test_loader, device, vis_dir, class_labels, data_info)
161
 
162
  print(f"Visualization complete! Check the output directory: {vis_dir}")
163
+ return vis_dir, cm
164
+
165
+ if __name__ == "__main__":
166
+ # Find the most recent run directory
167
+ run_dir = get_latest_run_dir()
168
+ # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241024-150232_otherpeopleval_large_model"
169
+ # run_dir = "/home/bawolf/workspace/break/clip/runs/run_20241022-122939_3moves_balanced"
170
+
171
+ run_visualization(run_dir)
src/dataset/dataset.py CHANGED
@@ -2,6 +2,7 @@ import torch
2
  from torch.utils.data import Dataset
3
  import csv
4
  from .video_utils import create_transform, extract_frames
 
5
 
6
  class VideoDataset(Dataset):
7
  def __init__(self, file_path, config, transform=None):
@@ -29,7 +30,8 @@ class VideoDataset(Dataset):
29
  if len(row) != 2:
30
  print(f"Skipping invalid row: {row}")
31
  continue
32
- video_path, label = row
 
33
  try:
34
  label = int(label)
35
  except ValueError:
 
2
  from torch.utils.data import Dataset
3
  import csv
4
  from .video_utils import create_transform, extract_frames
5
+ import os
6
 
7
  class VideoDataset(Dataset):
8
  def __init__(self, file_path, config, transform=None):
 
30
  if len(row) != 2:
31
  print(f"Skipping invalid row: {row}")
32
  continue
33
+ relative_video_path, label = row
34
+ video_path = os.path.join(config['data_path'], relative_video_path)
35
  try:
36
  label = int(label)
37
  except ValueError: