# Provenance (file-viewer header captured together with this config):
#   path:      AudioLlama/mmaudio/ext/synchformer/divided_224_16x4.yaml
#   committer: Rex Cheng
#   commit:    dbac20f ("initial commit")
#   size:      1.78 kB
# Training settings. The keys below must be nested under TRAIN; left at column
# 0 they become top-level keys and clash with the identically named
# ENABLE / DATASET / BATCH_SIZE keys of other sections.
TRAIN:
  ENABLE: True
  DATASET: Ssv2
  BATCH_SIZE: 32
  # Presumably both periods are measured in epochs — confirm with the trainer.
  EVAL_PERIOD: 5
  CHECKPOINT_PERIOD: 5
  AUTO_RESUME: True
  CHECKPOINT_EPOCH_RESET: True
  # NOTE(review): absolute path that looks specific to the original author's
  # cluster; override it before running anywhere else.
  CHECKPOINT_FILE_PATH: /checkpoint/fmetze/neurips_sota/40944587/checkpoints/checkpoint_epoch_00035.pyth
# Input data settings: clip sampling, cropping, normalisation, augmentation.
DATA:
  # 16 frames sampled with rate 4 at a 224 crop — consistent with the
  # "224_16x4" part of this config's file name.
  NUM_FRAMES: 16
  SAMPLING_RATE: 4
  TRAIN_JITTER_SCALES: [256, 320]
  TRAIN_CROP_SIZE: 224
  TEST_CROP_SIZE: 224
  INPUT_CHANNEL_NUM: [3]
  # One normalisation value per channel.
  MEAN: [0.5, 0.5, 0.5]
  STD: [0.5, 0.5, 0.5]
  # NOTE(review): both paths are absolute and look specific to the original
  # author's environment; override them before running anywhere else.
  PATH_TO_DATA_DIR: /private/home/mandelapatrick/slowfast/data/ssv2
  PATH_PREFIX: /datasets01/SomethingV2/092720/20bn-something-something-v2-frames
  INV_UNIFORM_SAMPLE: True
  RANDOM_FLIP: False
  REVERSE_INPUT_CHANNEL: True
  # Augmentation switches: only USE_RAND_AUGMENT is enabled here.
  USE_RAND_AUGMENT: True
  RE_PROB: 0.0
  USE_REPEATED_AUG: False
  USE_RANDOM_RESIZE_CROPS: False
  COLORJITTER: False
  GRAYSCALE: False
  GAUSSIAN: False
# Optimiser and learning-rate schedule.
SOLVER:
  # Written with an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML)
  # load a bare `1e-4` as the string "1e-4" rather than a float.
  BASE_LR: 1.0e-4
  LR_POLICY: steps_with_relative_lrs
  # LRS and STEPS have three entries each; presumably LRS[i] is the multiplier
  # of BASE_LR that applies from epoch STEPS[i] onwards — confirm.
  LRS: [1, 0.1, 0.01]
  STEPS: [0, 20, 30]
  MAX_EPOCH: 35
  MOMENTUM: 0.9
  # Same float-vs-string concern as BASE_LR (was `5e-2`).
  WEIGHT_DECAY: 5.0e-2
  WARMUP_EPOCHS: 0.0
  OPTIMIZING_METHOD: adamw
  USE_MIXED_PRECISION: True
  SMOOTHING: 0.2
# SLOWFAST section; ALPHA must be nested here rather than left at top level.
# NOTE(review): MODEL.ARCH is `slow` in this file, so this value may be
# unused — confirm.
SLOWFAST:
  ALPHA: 8
# Vision Transformer backbone settings.
VIT:
  PATCH_SIZE: 16
  PATCH_SIZE_TEMP: 2
  CHANNELS: 3
  EMBED_DIM: 768
  DEPTH: 12
  NUM_HEADS: 12
  MLP_RATIO: 4
  QKV_BIAS: True
  VIDEO_INPUT: True
  # Presumably DATA.NUM_FRAMES / PATCH_SIZE_TEMP (16 / 2 = 8) — confirm.
  TEMPORAL_RESOLUTION: 8
  USE_MLP: True
  DROP: 0.0
  POS_DROPOUT: 0.0
  DROP_PATH: 0.2
  IM_PRETRAINED: True
  HEAD_DROPOUT: 0.0
  HEAD_ACT: tanh
  PRETRAINED_WEIGHTS: vit_1k
  # Selects the attention variant; "divided" matches this config's file name.
  ATTN_LAYER: divided
# Model selection and loss.
MODEL:
  NUM_CLASSES: 174
  ARCH: slow
  MODEL_NAME: VisionTransformer
  LOSS_FUNC: cross_entropy
# Evaluation settings. These keys must be nested under TEST; left at column 0,
# ENABLE / DATASET / BATCH_SIZE duplicate the TRAIN keys of the same name and
# most parsers silently keep only the last value.
TEST:
  ENABLE: True
  DATASET: Ssv2
  BATCH_SIZE: 64
  NUM_ENSEMBLE_VIEWS: 1
  NUM_SPATIAL_CROPS: 3
# Data-loader settings.
DATA_LOADER:
  NUM_WORKERS: 4
  PIN_MEMORY: True
# Run-level settings. These appear to be top-level keys rather than members of
# the preceding section — verify against the consumer's schema.
# NOTE(review): presumably GPUs per node and number of nodes — confirm.
NUM_GPUS: 8
NUM_SHARDS: 4
RNG_SEED: 0
# Plain scalar `.`; presumably resolved against the working directory — confirm.
OUTPUT_DIR: .
# TensorBoard settings. ENABLE must be nested here; at column 0 it would be
# another top-level ENABLE key, duplicating those of other sections.
TENSORBOARD:
  ENABLE: True