diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bccf5dc23d508df8a405a594f1c0a2bf826a2395
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+## LoCoNet: Long-Short Context Network for Active Speaker Detection
+
+
+
+### Dependencies
+
+Start by building the environment:
+```
+conda env create -f requirements.yml
+conda activate loconet
+```
+Then add `dlhammer` to your Python path:
+```
+export PYTHONPATH=**project_dir**/dlhammer:$PYTHONPATH
+```
+Replace `**project_dir**` with the location of your code base.
+
+
+
+### Data preparation
+
+We follow TalkNet's data preparation script to download and prepare the AVA dataset:
+
+```
+python train.py --dataPathAVA AVADataPath --download
+```
+
+`AVADataPath` is the folder where the AVA dataset and its preprocessing outputs will be saved; the details can be found [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34). Please read them carefully.
+
+After the AVA dataset is downloaded, please update the `DATA.dataPathAVA` entry in the config file.
+
+#### Training script
+```
+python -W ignore::UserWarning train.py --cfg configs/multi.yaml OUTPUT_DIR {output directory}
+```
+
+
+
+#### Pretrained model
+
+Please download the LoCoNet weights trained on the AVA dataset [here](https://drive.google.com/file/d/1EX-V464jCD6S-wg68yGuAa-UcsMrw8mK/view?usp=sharing), then run:
+
+```
+python -W ignore::UserWarning test_multicard.py --cfg configs/multi.yaml RESUME_PATH {model download path}
+```
+
+### Citation
+
+Please cite the following if our paper or code is helpful to your research.
+```
+@article{wang2023loconet,
+  title={LoCoNet: Long-Short Context Network for Active Speaker Detection},
+  author={Wang, Xizi and Cheng, Feng and Bertasius, Gedas and Crandall, David},
+  journal={arXiv preprint arXiv:2301.08237},
+  year={2023}
+}
+```
+
+
+### Acknowledgement
+
+The code base of this project is adapted from [TalkNet](https://github.com/TaoRuijie/TalkNet-ASD), a very easy-to-use ASD pipeline.
+
+
diff --git a/__pycache__/dataLoader_multiperson.cpython-37.pyc b/__pycache__/dataLoader_multiperson.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7185ff267eaf7ed6976127efe3ef7b35ae06946a
Binary files /dev/null and b/__pycache__/dataLoader_multiperson.cpython-37.pyc differ
diff --git a/__pycache__/loconet.cpython-37.pyc b/__pycache__/loconet.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..033f1b172f74594300b087bc45d9682e429cf061
Binary files /dev/null and b/__pycache__/loconet.cpython-37.pyc differ
diff --git a/__pycache__/loss_multi.cpython-37.pyc b/__pycache__/loss_multi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7115764cd6c3c548362dc5583b586a6679a4798
Binary files /dev/null and b/__pycache__/loss_multi.cpython-37.pyc differ
diff --git a/__pycache__/talkNet_config_multi.cpython-37.pyc b/__pycache__/talkNet_config_multi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58a6a8ef9aa50f0fb0dc7a67eaa5d51dd92f5bc0
Binary files /dev/null and b/__pycache__/talkNet_config_multi.cpython-37.pyc differ
diff --git a/builder.py b/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6afeb8375a5b1b3fc83948d230af2cb6039f745d
--- /dev/null
+++ b/builder.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+#================================================================
+# Don't go gently into that good night.
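+# Registry-based model builders (BACKBONES / NECKS / HEADS / RECOGNIZERS /
+# LOSSES / LOCALIZERS, plus DETECTORS when mmdet is installed) following the
+# mmcv Registry pattern; build_model() below dispatches on the config's `type`.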
+# +# author: klaus +# description: +# +#================================================================ + +import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry + +from mmaction.utils import import_module_error_func + +MODELS = Registry('models', parent=MMCV_MODELS) +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +RECOGNIZERS = MODELS +LOSSES = MODELS +LOCALIZERS = MODELS + +try: + from mmdet.models.builder import DETECTORS, build_detector +except (ImportError, ModuleNotFoundError): + # Define an empty registry and building func, so that can import + DETECTORS = MODELS + + @import_module_error_func('mmdet') + def build_detector(cfg, train_cfg, test_cfg): + pass + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_recognizer(cfg, train_cfg=None, test_cfg=None): + """Build recognizer.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model. Details see this ' + 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning) + assert cfg.get( + 'train_cfg' + ) is None or train_cfg is None, 'train_cfg specified in both outer field and model field' # noqa: E501 + assert cfg.get( + 'test_cfg' + ) is None or test_cfg is None, 'test_cfg specified in both outer field and model field ' # noqa: E501 + return RECOGNIZERS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_localizer(cfg): + """Build localizer.""" + return LOCALIZERS.build(cfg) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """Build model.""" + args = cfg.copy() + obj_type = args.pop('type') + if obj_type in LOCALIZERS: + return build_localizer(cfg) + if obj_type in RECOGNIZERS: + return build_recognizer(cfg, train_cfg, test_cfg) + if obj_type in DETECTORS: + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model. 
Details see this ' + 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning) + return build_detector(cfg, train_cfg, test_cfg) + model_in_mmdet = ['FastRCNN'] + if obj_type in model_in_mmdet: + raise ImportError('Please install mmdet for spatial temporal detection tasks.') + raise ValueError(f'{obj_type} is not registered in ' 'LOCALIZERS, RECOGNIZERS or DETECTORS') + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) diff --git a/configs/multi.yaml b/configs/multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75234312e9c71c496307bf7d1756b782a5c67324 --- /dev/null +++ b/configs/multi.yaml @@ -0,0 +1,51 @@ +SEED: "20210617" +NUM_GPUS: 4 +NUM_WORKERS: 6 +LOG_NAME: 'config.txt' +OUTPUT_DIR: '/nfs/joltik/data/ssd/xiziwang/TalkNet_models/' # savePath +evalDataType: "val" +downloadAVA: False +evaluation: False +RESUME: False +RESUME_PATH: "" +RESUME_EPOCH: 0 + +DATA: + dataPathAVA: '/nfs/jolteon/data/ssd/xiziwang/AVA_dataset/' + +DATALOADER: + nDataLoaderThread: 4 + + +SOLVER: + OPTIMIZER: "adam" + BASE_LR: 5e-5 + SCHEDULER: + NAME: "multistep" + GAMMA: 0.95 + +MODEL: + NUM_SPEAKERS: 3 + CLIP_LENGTH: 200 + AV: "speaker_temporal" + AV_layers: 3 + ADJUST_ATTENTION: 0 + +TRAIN: + BATCH_SIZE: 1 + MAX_EPOCH: 25 + AUDIO_AUG: 1 + TEST_INTERVAL: 1 + TRAINER_GPU: 4 + + +VAL: + BATCH_SIZE: 1 + +TEST: + BATCH_SIZE: 1 + DATASET: 'seen' + MODEL: 'unseen' + + + diff --git a/dataLoaderTalkSet.py b/dataLoaderTalkSet.py new file mode 100644 index 0000000000000000000000000000000000000000..0ef5bc173f3aefee330297e0425b04bc4b6c4bf0 --- /dev/null +++ b/dataLoaderTalkSet.py @@ -0,0 +1,182 @@ +import os, torch, numpy, cv2, imageio, random, python_speech_features +import matplotlib.pyplot as plt +from scipy.io import wavfile +from glob import glob +from torchvision.transforms import RandomCrop +from scipy import signal + +def get_noise_list(musanPath, rirPath): + augment_files = glob(os.path.join(musanPath, '*/*/*/*.wav')) + noiselist = {} + rir = numpy.load(rirPath) + for file in augment_files: + if not file.split('/')[-4] in noiselist: + noiselist[file.split('/')[-4]] = [] + noiselist[file.split('/')[-4]].append(file) + return rir, noiselist + +def augment_wav(audio, aug_type, rir, noiselist): + if aug_type == 'rir': + rir_gains = numpy.random.uniform(-7,3,1) + rir_filts = random.choice(rir) + rir = numpy.multiply(rir_filts, pow(10, 0.1 * rir_gains)) + audio = signal.convolve(audio, rir, mode='full')[:len(audio)] + else: + noisecat = aug_type + noisefile = random.choice(noiselist[noisecat].copy()) + snr = [random.uniform({'noise':[0,15],'music':[5,15]}[noisecat][0], {'noise':[0,15],'music':[5,15]}[noisecat][1])] + _, noiseaudio = wavfile.read(noisefile) + if len(noiseaudio) < len(audio): + shortage = len(audio) - len(noiseaudio) + noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap') + else: + noiseaudio = noiseaudio[:len(audio)] + + noise_db = 10 * numpy.log10(numpy.mean(abs(noiseaudio ** 2)) + 1e-4) + clean_db = 10 * numpy.log10(numpy.mean(abs(audio ** 2)) + 1e-4) + noise = numpy.sqrt(10 ** ((clean_db - noise_db - snr) / 10)) * noiseaudio + audio = audio + noise + return audio.astype(numpy.int16) + +def load_audio(data, data_path, length, start, end, audio_aug, rirlist = None, noiselist = None): + # Find the path of the audio data + data_type = data[0] + id_name = data[1][:8] + file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \ + '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + 
data[2].split('/')[2] + '.wav' + audio_file_path = os.path.join(data_path, data_type, id_name, file_name) + # Load audio, compute MFCC, cut it to the required length + _, audio = wavfile.read(audio_file_path) + + if audio_aug == True: + augtype = random.randint(0,3) + if augtype == 1: # rir + audio = augment_wav(audio, 'rir', rirlist, noiselist) + elif augtype == 2: + audio = augment_wav(audio, 'noise', rirlist, noiselist) + elif augtype == 3: + audio = augment_wav(audio, 'music', rirlist, noiselist) + else: + audio = audio + + feature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010) + length_audio = int(round(length * 100)) + if feature.shape[0] < length_audio: + shortage = length_audio - feature.shape[0] + feature = numpy.pad(feature, ((0, shortage), (0,0)), 'wrap') + feature = feature[int(round(start * 100)):int(round(end * 100)),:] + return feature + +def load_video(data, data_path, length, start, end, visual_aug): + # Find the path of the visual data + data_type = data[0] + id_name = data[1][:8] + file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \ + '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + data[2].split('/')[2] + '.mp4' + video_file_path = os.path.join(data_path, data_type, id_name, file_name) + # Load visual frame-by-frame, cut it to the required length + length_video = int(round((end - start) * 25)) + video = cv2.VideoCapture(video_file_path) + faces = [] + augtype = 'orig' + + if visual_aug == True: + new = int(112*random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, 112 - new), numpy.random.randint(0, 112 - new) + M = cv2.getRotationMatrix2D((112/2,112/2), random.uniform(-15, 15), 1) + augtype = random.choice(['orig', 'flip', 'crop', 'rotate']) + + num_frame = 0 + while video.isOpened(): + ret, frames = video.read() + if ret == True: + num_frame += 1 + if num_frame >= int(round(start * 25)) and num_frame < int(round(end * 25)): + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (224,224)) + face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] + if augtype == 'orig': + faces.append(face) + elif augtype == 'flip': + faces.append(cv2.flip(face, 1)) + elif augtype == 'crop': + faces.append(cv2.resize(face[y:y+new, x:x+new] , (112,112))) + elif augtype == 'rotate': + faces.append(cv2.warpAffine(face, M, (112,112))) + else: + break + video.release() + faces = numpy.array(faces) + if faces.shape[0] < length_video: + shortage = length_video - faces.shape[0] + faces = numpy.pad(faces, ((0,shortage), (0,0),(0,0)), 'wrap') + # faces = numpy.array(faces)[int(round(start * 25)):int(round(end * 25)),:,:] + return faces + +def load_label(data, length, start, end): + labels_all = [] + labels = [] + data_type = data[0] + start_T, end_T, start_F, end_F = float(data[4]), float(data[5]), float(data[6]), float(data[7]) + for i in range(int(round(length * 100))): + if data_type == 'TAudio': + labels_all.append(1) + elif data_type == 'FAudio' or data_type == 'FSilence': + labels_all.append(0) + else: + if i >= int(round(start_T * 100)) and i <= int(round(end_T * 100)): + labels_all.append(1) + else: + labels_all.append(0) + for i in range(int(round(length * 25))): + labels.append(int(round(sum(labels_all[i*4: (i+1)*4]) / 4))) + return labels[round(start*25): round(end*25)] + +class loader_TalkSet(object): + def __init__(self, trial_file_name, data_path, audio_aug, visual_aug, musanPath, rirPath,**kwargs): + self.data_path = data_path + 
self.audio_aug = audio_aug + self.visual_aug = visual_aug + self.minibatch = [] + self.rir, self.noiselist = get_noise_list(musanPath, rirPath) + mix_lst = open(trial_file_name).read().splitlines() + mix_lst = list(filter(lambda x: float(x.split()[3]) >= 1, mix_lst)) # filter the video less than 1s + # mix_lst = list(filter(lambda x: x.split()[0] == 'TSilence', mix_lst)) + sorted_mix_lst = sorted(mix_lst, key=lambda data: (float(data.split()[3]), int(data.split()[-1])), reverse=True) + start = 0 + while True: + length_total = float(sorted_mix_lst[start].split()[3]) + batch_size = int(250 / length_total) + end = min(len(sorted_mix_lst), start + batch_size) + self.minibatch.append(sorted_mix_lst[start:end]) + if end == len(sorted_mix_lst): + break + start = end + # self.minibatch = self.minibatch[0:5] + + def __getitem__(self, index): + batch_lst = self.minibatch[index] + length_total = float(batch_lst[-1].split()[3]) + length_total = (int(round(length_total * 100)) - int(round(length_total * 100)) % 4) / 100 + audio_feature, video_feature, labels = [], [], [] + duration = random.choice([1,2,4,6]) + #duration = 6 + length = min(length_total, duration) + if length == duration: + start = int(round(random.randint(0, round(length_total * 25) - round(length * 25)) * 0.04 * 100)) / 100 + end = int(round((start + length) * 100)) / 100 + else: + start, end = 0, length + + for line in batch_lst: + data = line.split() + audio_feature.append(load_audio(data, self.data_path, length_total, start, end, audio_aug = self.audio_aug, rirlist = self.rir, noiselist = self.noiselist)) + video_feature.append(load_video(data, self.data_path, length_total, start, end, visual_aug = self.visual_aug)) + labels.append(load_label(data, length_total, start, end)) + + return torch.FloatTensor(numpy.array(audio_feature)), \ + torch.FloatTensor(numpy.array(video_feature)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.minibatch) \ No newline at end of file diff --git a/dataLoader_multiperson.py b/dataLoader_multiperson.py new file mode 100755 index 0000000000000000000000000000000000000000..1e643ea3722f16734e0880dd404730b314b44a98 --- /dev/null +++ b/dataLoader_multiperson.py @@ -0,0 +1,402 @@ +import os, torch, numpy, cv2, random, glob, python_speech_features, json, math +from scipy.io import wavfile +from torchvision.transforms import RandomCrop +from operator import itemgetter +from torchvggish import vggish_input, vggish_params, mel_features + + +def overlap(audio, noiseAudio): + snr = [random.uniform(-5, 5)] + if len(noiseAudio) < len(audio): + shortage = len(audio) - len(noiseAudio) + noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap') + else: + noiseAudio = noiseAudio[:len(audio)] + noiseDB = 10 * numpy.log10(numpy.mean(abs(noiseAudio**2)) + 1e-4) + cleanDB = 10 * numpy.log10(numpy.mean(abs(audio**2)) + 1e-4) + noiseAudio = numpy.sqrt(10**((cleanDB - noiseDB - snr) / 10)) * noiseAudio + audio = audio + noiseAudio + return audio.astype(numpy.int16) + + +def load_audio(data, dataPath, numFrames, audioAug, audioSet=None): + dataName = data[0] + fps = float(data[2]) + audio = audioSet[dataName] + if audioAug == True: + augType = random.randint(0, 1) + if augType == 1: + audio = overlap(dataName, audio, audioSet) + else: + audio = audio + # fps is not always 25, in order to align the visual, we modify the window and step in MFCC extraction process based on fps + audio = python_speech_features.mfcc(audio, + 16000, + numcep=13, + winlen=0.025 * 25 / fps, + winstep=0.010 * 25 / fps) 
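+    # With winstep = 0.010 * 25 / fps seconds, the MFCC extractor produces exactly
+    # 4 feature frames per video frame, so numFrames video frames correspond to
+    # numFrames * 4 audio frames below.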
+ maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap') + audio = audio[:int(round(numFrames * 4)), :] + return audio + + +def load_single_audio(audio, fps, numFrames, audioAug=False): + audio = python_speech_features.mfcc(audio, + 16000, + numcep=13, + winlen=0.025 * 25 / fps, + winstep=0.010 * 25 / fps) + maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap') + audio = audio[:int(round(numFrames * 4)), :] + return audio + + +def load_visual(data, dataPath, numFrames, visualAug): + dataName = data[0] + videoName = data[0][:11] + faceFolderPath = os.path.join(dataPath, videoName, dataName) + faceFiles = glob.glob("%s/*.jpg" % faceFolderPath) + sortedFaceFiles = sorted(faceFiles, + key=lambda data: (float(data.split('/')[-1][:-4])), + reverse=False) + faces = [] + H = 112 + if visualAug == True: + new = int(H * random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + for faceFile in sortedFaceFiles[:numFrames]: + face = cv2.imread(faceFile) + + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H, H))) + faces = numpy.array(faces) + return faces + + +def load_label(data, numFrames): + res = [] + labels = data[3].replace('[', '').replace(']', '') + labels = labels.split(',') + for label in labels: + res.append(int(label)) + res = numpy.array(res[:numFrames]) + return res + + +class train_loader(object): + + def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers): + self.cfg = cfg + self.audioPath = audioPath + self.visualPath = visualPath + self.candidate_speakers = num_speakers + self.path = os.path.join(cfg.DATA.dataPathAVA, "csv") + self.entity_data = json.load(open(os.path.join(self.path, 'train_entity.json'))) + self.ts_to_entity = json.load(open(os.path.join(self.path, 'train_ts.json'))) + self.mixLst = open(trialFileName).read().splitlines() + self.list_length = len(self.mixLst) + random.shuffle(self.mixLst) + + def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None): + if audioAug: + augType = random.randint(0, 1) + if augType == 1: + audio = overlap(audio, aug_audio) + else: + audio = audio + + res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False) + return res + + def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts, visualAug=True): + + faceFolderPath = os.path.join(self.visualPath, videoName, entityName) + + faces = [] + H = 112 + if visualAug == True: + new = int(H * random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + labels_dict = self.entity_data[videoName][entityName] + labels = numpy.zeros(len(target_ts)) + mask = numpy.zeros(len(target_ts)) + + for i, time in 
enumerate(target_ts): + if time not in context_ts: + faces.append(numpy.zeros((H, H))) + else: + labels[i] = labels_dict[time] + mask[i] = 1 + time = "%.2f" % float(time) + faceFile = os.path.join(faceFolderPath, str(time) + '.jpg') + + face = cv2.imread(faceFile) + + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H, H))) + faces = numpy.array(faces) + return faces, labels, mask + + def get_speaker_context(self, videoName, target_entity, all_ts, center_ts): + + context_speakers = list(self.ts_to_entity[videoName][center_ts]) + context = {} + chosen_speakers = [] + context[target_entity] = all_ts + context_speakers.remove(target_entity) + num_frames = len(all_ts) + for candidate in context_speakers: + candidate_ts = self.entity_data[videoName][candidate] + shared_ts = set(all_ts).intersection(set(candidate_ts)) + if (len(shared_ts) > (num_frames / 2)): + context[candidate] = shared_ts + chosen_speakers.append(candidate) + context_speakers = chosen_speakers + random.shuffle(context_speakers) + if not context_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers)) + elif len(context_speakers) < self.candidate_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers[1:])) + else: + context_speakers.insert(0, target_entity) # make sure is at 0 + context_speakers = context_speakers[:self.candidate_speakers] + + assert set(context_speakers).issubset(set(list(context.keys()))), target_entity + assert target_entity in context_speakers, target_entity + + return context_speakers, context + + def __getitem__(self, index): + + target_video = self.mixLst[index] + data = target_video.split('\t') + fps = float(data[2]) + videoName = data[0][:11] + target_entity = data[0] + all_ts = list(self.entity_data[videoName][target_entity].keys()) + numFrames = int(data[1]) + assert numFrames == len(all_ts) + + center_ts = all_ts[math.floor(numFrames / 2)] + + # get context speakers which have more than half time overlapped with target speaker + context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts, + center_ts) + + if self.cfg.TRAIN.AUDIO_AUG: + other_indices = list(range(0, index)) + list(range(index + 1, self.list_length)) + augment_entity = self.mixLst[random.choice(other_indices)] + augment_data = augment_entity.split('\t') + augment_entity = augment_data[0] + augment_videoname = augment_data[0][:11] + aug_sr, aug_audio = wavfile.read( + os.path.join(self.audioPath, augment_videoname, augment_entity + '.wav')) + else: + aug_audio = None + + audio_path = os.path.join(self.audioPath, videoName, target_entity + '.wav') + sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav')) + audio = self.load_single_audio(audio, + fps, + numFrames, + audioAug=self.cfg.TRAIN.AUDIO_AUG, + aug_audio=aug_audio) + + visualFeatures, labels, masks = [], [], [] + + # target_label = list(self.entity_data[videoName][target_entity].values()) + visual, target_labels, target_masks = self.load_visual_label_mask( + videoName, 
target_entity, all_ts, all_ts) + + for idx, context_entity in enumerate(context_speakers): + if context_entity == target_entity: + label = target_labels + visualfeat = visual + mask = target_masks + else: + visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity, + all_ts, + context[context_entity]) + visualFeatures.append(visualfeat) + labels.append(label) + masks.append(mask) + + audio = torch.FloatTensor(audio)[None, :, :] + visualFeatures = torch.FloatTensor(numpy.array(visualFeatures)) + audio_t = audio.shape[1] + video_t = visualFeatures.shape[1] + if audio_t != video_t * 4: + print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames) + labels = torch.LongTensor(numpy.array(labels)) + masks = torch.LongTensor(numpy.array(masks)) + print(audio.shape) + return audio, visualFeatures, labels, masks + + def __len__(self): + return len(self.mixLst) + + +class val_loader(object): + + def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers): + self.cfg = cfg + self.audioPath = audioPath + self.visualPath = visualPath + self.candidate_speakers = num_speakers + self.path = os.path.join(cfg.DATA.dataPathAVA, "csv") + self.entity_data = json.load(open(os.path.join(self.path, 'val_entity.json'))) + self.ts_to_entity = json.load(open(os.path.join(self.path, 'val_ts.json'))) + self.mixLst = open(trialFileName).read().splitlines() + + def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None): + + res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False) + return res + + def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts): + + faceFolderPath = os.path.join(self.visualPath, videoName, entityName) + + faces = [] + H = 112 + labels_dict = self.entity_data[videoName][entityName] + labels = numpy.zeros(len(target_ts)) + mask = numpy.zeros(len(target_ts)) + + for i, time in enumerate(target_ts): + if time not in context_ts: + faces.append(numpy.zeros((H, H))) + else: + labels[i] = labels_dict[time] + mask[i] = 1 + time = "%.2f" % float(time) + faceFile = os.path.join(faceFolderPath, str(time) + '.jpg') + + face = cv2.imread(faceFile) + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + faces.append(face) + faces = numpy.array(faces) + return faces, labels, mask + + def get_speaker_context(self, videoName, target_entity, all_ts, center_ts): + + context_speakers = list(self.ts_to_entity[videoName][center_ts]) + context = {} + chosen_speakers = [] + context[target_entity] = all_ts + context_speakers.remove(target_entity) + num_frames = len(all_ts) + for candidate in context_speakers: + candidate_ts = self.entity_data[videoName][candidate] + shared_ts = set(all_ts).intersection(set(candidate_ts)) + context[candidate] = shared_ts + chosen_speakers.append(candidate) + # if (len(shared_ts) > (num_frames / 2)): + # context[candidate] = shared_ts + # chosen_speakers.append(candidate) + context_speakers = chosen_speakers + random.shuffle(context_speakers) + if not context_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers)) + elif len(context_speakers) < self.candidate_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers[1:])) + else: + context_speakers.insert(0, 
target_entity) # make sure is at 0 + context_speakers = context_speakers[:self.candidate_speakers] + + assert set(context_speakers).issubset(set(list(context.keys()))), target_entity + + return context_speakers, context + + def __getitem__(self, index): + + target_video = self.mixLst[index] + data = target_video.split('\t') + fps = float(data[2]) + videoName = data[0][:11] + target_entity = data[0] + all_ts = list(self.entity_data[videoName][target_entity].keys()) + numFrames = int(data[1]) + # print(numFrames, len(all_ts)) + assert numFrames == len(all_ts) + + center_ts = all_ts[math.floor(numFrames / 2)] + + # get context speakers which have more than half time overlapped with target speaker + context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts, + center_ts) + + sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav')) + audio = self.load_single_audio(audio, fps, numFrames, audioAug=False) + + visualFeatures, labels, masks = [], [], [] + + # target_label = list(self.entity_data[videoName][target_entity].values()) + target_visual, target_labels, target_masks = self.load_visual_label_mask( + videoName, target_entity, all_ts, all_ts) + + for idx, context_entity in enumerate(context_speakers): + if context_entity == target_entity: + label = target_labels + visualfeat = target_visual + mask = target_masks + else: + visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity, + all_ts, + context[context_entity]) + visualFeatures.append(visualfeat) + labels.append(label) + masks.append(mask) + + audio = torch.FloatTensor(audio)[None, :, :] + visualFeatures = torch.FloatTensor(numpy.array(visualFeatures)) + audio_t = audio.shape[1] + video_t = visualFeatures.shape[1] + if audio_t != video_t * 4: + print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames) + labels = torch.LongTensor(numpy.array(labels)) + masks = torch.LongTensor(numpy.array(masks)) + + return audio, visualFeatures, labels, masks + + def __len__(self): + return len(self.mixLst) diff --git a/dlhammer/.gitignore b/dlhammer/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6819b69e3e5ed7811a3d6ecb0290ba1175601955 --- /dev/null +++ b/dlhammer/.gitignore @@ -0,0 +1,3 @@ +*.log +.vim-arsync +__pycache__/ diff --git a/dlhammer/LICENSE b/dlhammer/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/dlhammer/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dlhammer/README.md b/dlhammer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ab4a733d103adef0e242dad6e7435270b4f1dfb4 --- /dev/null +++ b/dlhammer/README.md @@ -0,0 +1,2 @@ +# dl-hammer +tools for deep learning coding. 
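+
+A minimal usage sketch (assuming the package directory is on `PYTHONPATH`):
+
+```python
+from dlhammer import bootstrap, logger, CONFIG
+
+# Parses --cfg <yaml> plus trailing KEY VALUE overrides from the command line,
+# creates the workspace directory, and sets up file logging; returns an EasyDict.
+config = bootstrap()
+logger.info(config.OUTPUT_DIR)  # CONFIG is a module-level EasyDict holding the same values
+```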
diff --git a/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py b/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1cc3fee70e5695cce6305c56bc59af32bcdb113b --- /dev/null +++ b/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import argparse +import datetime +from functools import partial +import yaml +from easydict import EasyDict + +# from .utils import get_vacant_gpu +from .logger import bootstrap_logger, logger +from .utils.system import get_available_gpuids +from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf + +CONFIG = EasyDict() + +BASE_CONFIG = { + 'OUTPUT_DIR': './workspace', + 'SESSION': 'base', + 'NUM_GPUS': 1, + 'LOG_NAME': 'log.txt' +} + + +def bootstrap_args(default_params=None): + """get the params from yaml file and args. The args will override arguemnts in the yaml file. + Returns: EasyDict instance. + + """ + parser = define_default_arg_parser() + cfg = update_config(parser, default_params) + create_workspace(cfg) #create workspace + + CONFIG.update(cfg) + bootstrap_logger(get_logfile(CONFIG)) # setup logger + setup_gpu(CONFIG.NUM_GPUS) #setup gpu + + return cfg + + +def setup_gpu(ngpu): + gpuids = get_available_gpuids() + # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]]) + + +def get_logfile(config): + return os.path.join(config.WORKSPACE, config.LOG_NAME) + + +def define_default_arg_parser(): + """Define a default arg_parser. + + Returns: + A argparse.ArgumentParser. More arguments can be added. + + """ + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str) + parser.add_argument('opts', + default=None, + nargs='*', + help='modify config options using the command-line') + + return parser + + +def update_config(arg_parser, default_config=None): + """ update argparser to args. + + Args: + arg_parser: argparse.ArgumentParser. 
+ """ + + parsed, unknown = arg_parser.parse_known_args() + if default_config and parsed.cfg == "" and "cfg" in default_config: + parsed.cfg = default_config["cfg"] + + config = EasyDict(BASE_CONFIG.copy()) + config['cfg'] = parsed.cfg + # update default config + if default_config is not None: + config.update(default_config) + + # merge config from yaml + if os.path.isfile(config.cfg): + with open(config.cfg, 'r') as f: + yml_config = yaml.full_load(f) + config = merge_dict(config, yml_config) + + # merge opts + config = merge_opts(config, parsed.opts) + + # eval values + config = eval_dict_leaf(config) + + return config + + +def create_workspace(cfg): + cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg)) + workspace = os.path.join(cfg.OUTPUT_DIR, cfg_name, cfg.SESSION) + os.makedirs(workspace, exist_ok=True) + cfg.WORKSPACE = workspace diff --git a/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py b/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2df2fcae6b6867afb06c186271805142c3ca245f --- /dev/null +++ b/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import sys +import logging + +from .logger import bootstrap_logger, logger +from .argparser import bootstrap_args, CONFIG +from .utils.misc import to_string + +__all__ = ['bootstrap', 'logger', 'CONFIG'] + + +def bootstrap(default_cfg=None, print_cfg=True): + """TODO: Docstring for bootstrap. + + Kwargs: + use_argparser (TODO): TODO + use_logger (TODO): TODO + + Returns: TODO + + """ + config = bootstrap_args(default_cfg) + if print_cfg: + logger.info(to_string(config)) + return config diff --git a/dlhammer/dlhammer/__init__.py b/dlhammer/dlhammer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1253e3e11333d1f9a40940a17eb3e37c1d76f763 --- /dev/null +++ b/dlhammer/dlhammer/__init__.py @@ -0,0 +1 @@ +from .bootstrap import * diff --git a/dlhammer/dlhammer/argparser.py b/dlhammer/dlhammer/argparser.py new file mode 100644 index 0000000000000000000000000000000000000000..72702608063d7a97b020e24ad55aa0ce55a7ed5e --- /dev/null +++ b/dlhammer/dlhammer/argparser.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import argparse +import datetime +from functools import partial +import yaml +from easydict import EasyDict + +# from .utils import get_vacant_gpu +from .logger import bootstrap_logger, logger +from .utils.system import get_available_gpuids +from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf + +CONFIG = EasyDict() + +BASE_CONFIG = { + 'OUTPUT_DIR': './workspace', + 'NUM_GPUS': 1, + 'LOG_NAME': 'log.txt' +} + + +def bootstrap_args(default_params=None): + """get the params from yaml file and args. The args will override arguemnts in the yaml file. + Returns: EasyDict instance. 
+ + """ + parser = define_default_arg_parser() + cfg = update_config(parser, default_params) + create_workspace(cfg) #create workspace + + CONFIG.update(cfg) + bootstrap_logger(get_logfile(CONFIG)) # setup logger + setup_gpu(CONFIG.NUM_GPUS) #setup gpu + + return cfg + + +def setup_gpu(ngpu): + gpuids = get_available_gpuids() + # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]]) + + +def get_logfile(config): + return os.path.join(config.WORKSPACE, config.LOG_NAME) + + +def define_default_arg_parser(): + """Define a default arg_parser. + + Returns: + A argparse.ArgumentParser. More arguments can be added. + + """ + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str) + parser.add_argument('opts', + default=None, + nargs='*', + help='modify config options using the command-line') + + return parser + + +def update_config(arg_parser, default_config=None): + """ update argparser to args. + + Args: + arg_parser: argparse.ArgumentParser. + """ + + parsed, unknown = arg_parser.parse_known_args() + if default_config and parsed.cfg == "" and "cfg" in default_config: + parsed.cfg = default_config["cfg"] + + config = EasyDict(BASE_CONFIG.copy()) + config['cfg'] = parsed.cfg + # update default config + if default_config is not None: + config.update(default_config) + + # merge config from yaml + if os.path.isfile(config.cfg): + with open(config.cfg, 'r') as f: + yml_config = yaml.full_load(f) + config = merge_dict(config, yml_config) + + # merge opts + config = merge_opts(config, parsed.opts) + + # eval values + config = eval_dict_leaf(config) + + return config + + +def create_workspace(cfg): + cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg)) + workspace = os.path.join(cfg.OUTPUT_DIR) + os.makedirs(workspace, exist_ok=True) + cfg.WORKSPACE = workspace diff --git a/dlhammer/dlhammer/bootstrap.py b/dlhammer/dlhammer/bootstrap.py new file mode 100644 index 0000000000000000000000000000000000000000..2df2fcae6b6867afb06c186271805142c3ca245f --- /dev/null +++ b/dlhammer/dlhammer/bootstrap.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import sys +import logging + +from .logger import bootstrap_logger, logger +from .argparser import bootstrap_args, CONFIG +from .utils.misc import to_string + +__all__ = ['bootstrap', 'logger', 'CONFIG'] + + +def bootstrap(default_cfg=None, print_cfg=True): + """TODO: Docstring for bootstrap. + + Kwargs: + use_argparser (TODO): TODO + use_logger (TODO): TODO + + Returns: TODO + + """ + config = bootstrap_args(default_cfg) + if print_cfg: + logger.info(to_string(config)) + return config diff --git a/dlhammer/dlhammer/logger.py b/dlhammer/dlhammer/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9854b0254aec23d18e2eff17831859278ca36d --- /dev/null +++ b/dlhammer/dlhammer/logger.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys +import logging + +logger = logging.getLogger('DLHammer') + + +def bootstrap_logger(logfile=None, fmt=None): + """TODO: Docstring for bootstrap_logger. 
+ + Args: + logfile (str): file path logging to. + + Kwargs: + fmt (TODO): TODO + + Returns: TODO + + """ + if fmt is None: + # fmt = '%(asctime)s - %(levelname)-5s - [%(filename)s:%(lineno)d] %(message)s' + fmt = '%(message)s' + logging.basicConfig(level=logging.DEBUG, format=fmt) + + #log to file + if logfile is not None: + formatter = logging.Formatter(fmt) + fh = logging.FileHandler(logfile) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + # sys.stdout = LoggerWriter(sys.stdout, logger.info) + # sys.stderr = LoggerWriter(sys.stderr, logger.error) + return + + +class LoggerWriter(object): + + def __init__(self, stream, logfct): + self.terminal = stream + self.logfct = logfct + self.buf = [] + + def write(self, msg): + if msg.endswith('\n'): + self.buf.append(msg.rstrip('\n')) + + message = ''.join(self.buf) + self.logfct(message) + + self.buf = [] + else: + self.buf.append(msg) + + def flush(self): + pass diff --git a/dlhammer/dlhammer/test/config.yml b/dlhammer/dlhammer/test/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..53fe053b489f3e373e7a26dd9b2f20733c0f61e0 --- /dev/null +++ b/dlhammer/dlhammer/test/config.yml @@ -0,0 +1,32 @@ +a_int: 12 +a_float: 1e-2 +a_list: [0,1,2] +eval_list: eval(list(range(10))) +DATA: + PATH_TO_DATA_DIR: /home/ubuntu/data/kinetics/Mini-Kinetics-200 + PATH_PREFIX: /home/ubuntu/data/kinetics/k400_ver3 + NUM_FRAMES: 16 + SAMPLING_RATE: 8 + TARGET_FPS: 25 + TRAIN_JITTER_SCALES: [256, 320] + TRAIN_CROP_SIZE: 224 + TEST_CROP_SIZE: 224 + INPUT_CHANNEL_NUM: [3] +SOLVER: + BACKBONE: + OPTIMIZER: sgd + MOMENTUM: 0.9 + BASE_LR: 1e-3 + SCHEDULER: + NAME: warmup_multistep + MILESTONES: [13, 24] + WARMUP_EPOCHS: 0.5 + GAMMA: 0.1 + TEMPORAL_MODEL: + OPTIMIZER: sgd + MOMENTUM: 0.9 + BASE_LR: 1e-3 + SCHEDULER: + NAME: multistep + MILESTONES: [13, 24] + GAMMA: 0.1 diff --git a/dlhammer/dlhammer/test/test_args.py b/dlhammer/dlhammer/test/test_args.py new file mode 100644 index 0000000000000000000000000000000000000000..18c1faed5c369d7f6c6fa05ea28c39b21ecc6f62 --- /dev/null +++ b/dlhammer/dlhammer/test/test_args.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys + +CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..')) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.')) + +from dlhammer import bootstrap, CONFIG +from dlhammer import logger + +config = bootstrap(print_cfg=True) diff --git a/dlhammer/dlhammer/test/test_logger.py b/dlhammer/dlhammer/test/test_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..7911831f9a03e7a2fa0115277be95fdb124583a8 --- /dev/null +++ b/dlhammer/dlhammer/test/test_logger.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. 
+# +# author: klaus +# description: +# +#================================================================ + +import os +import sys + +CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..')) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.')) + +from dlhammer import bootstrap, logger +bootstrap() + +logger.info('dummy output') + +raise Exception('dummy error') diff --git a/dlhammer/dlhammer/utils/__init__.py b/dlhammer/dlhammer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dlhammer/dlhammer/utils/misc.py b/dlhammer/dlhammer/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..1ceacd75cd6013cf9940525514cb4f7c5d965876 --- /dev/null +++ b/dlhammer/dlhammer/utils/misc.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import ast + + +def merge_dict(a, b, path=None): + """merge b into a. The values in b will override values in a. + + Args: + a (dict): dict to merge to. + b (dict): dict to merge from. + + Returns: dict1 with values merged from b. + + """ + if path is None: path = [] + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dict(a[key], b[key], path + [str(key)]) + else: + a[key] = b[key] + else: + a[key] = b[key] + return a + + +def merge_opts(d, opts): + """merge opts + Args: + d (dict): The dict. + opts (list): The opts to merge. format: [key1, name1, key2, name2,...] + Returns: d. the input dict `d` with merged opts. + + """ + assert len(opts) % 2 == 0, f'length of opts must be even. Got: {opts}' + for i in range(0, len(opts), 2): + full_k, v = opts[i], opts[i + 1] + keys = full_k.split('.') + sub_d = d + for i, k in enumerate(keys): + if not hasattr(sub_d, k): + raise ValueError(f'The key {k} not exist in the dict. Full key:{full_k}') + if i != len(keys) - 1: + sub_d = sub_d[k] + else: + sub_d[k] = v + return d + + +def to_string(params, indent=2): + """format params to a string + + Args: + params (EasyDict): the params. + + Returns: The string to display. + + """ + msg = '{\n' + for i, (k, v) in enumerate(params.items()): + if isinstance(v, dict): + v = to_string(v, indent + 4) + spaces = ' ' * indent + msg += spaces + '{}: {}'.format(k, v) + if i == len(params) - 1: + msg += ' }' + else: + msg += '\n' + return msg + + +def eval_dict_leaf(d): + """eval values of dict leaf. + + Args: + d (dict): The dict to eval. + + Returns: dict. + + """ + for k, v in d.items(): + if not isinstance(v, dict): + d[k] = eval_string(v) + else: + eval_dict_leaf(v) + return d + + +def eval_string(string): + """automatically evaluate string to corresponding types. + + For example: + not a string -> return the original input + '0' -> 0 + '0.2' -> 0.2 + '[0, 1, 2]' -> [0,1,2] + 'eval(1+2)' -> 3 + 'eval(range(5))' -> [0,1,2,3,4] + + + Args: + value : string. 
+ + Returns: the corresponding type + + """ + if not isinstance(string, str): + return string + if len(string) > 1 and string[0] == '[' and string[-1] == ']': + return eval(string) + if string[0:5] == 'eval(': + return eval(string[5:-1]) + try: + v = ast.literal_eval(string) + except: + v = string + return v diff --git a/dlhammer/dlhammer/utils/system.py b/dlhammer/dlhammer/utils/system.py new file mode 100644 index 0000000000000000000000000000000000000000..d59df5266db2a5d675400c917c81b0dbbfd1d6c1 --- /dev/null +++ b/dlhammer/dlhammer/utils/system.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys +import subprocess +import numpy as np + + +def get_available_gpuids(): + """ + Returns: the gpu ids sorted in descending order w.r.t occupied memory. + """ + com = "nvidia-smi|sed -n '/%/p'|sed 's/|/\\n/g'|sed -n '/MiB/p'|sed 's/ //g'|sed 's/MiB/\\n/'|sed '/\\//d'" + gpum = subprocess.check_output(com, shell=True) + gpum = gpum.decode('utf-8').split('\n') + gpum = gpum[:-1] + sorted_gpuid = np.argsort(gpum) + return sorted_gpuid diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..a9c704118304cd651e3e1d2c3d24450c48d50a8a --- /dev/null +++ b/environment.yml @@ -0,0 +1,298 @@ +name: loconet +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - alsa-lib=1.2.3=h516909a_0 + - anyio=3.5.0=py37h89c1867_0 + - argon2-cffi=21.3.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=21.2.0=py37h5e8e339_1 + - aria2=1.36.0=h319415d_2 + - attrs=21.4.0=pyhd8ed1ab_0 + - babel=2.9.1=pyh44b312d_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 + - bleach=4.1.0=pyhd8ed1ab_0 + - bottleneck=1.3.4=py37h6c7ee08_0 + - brotli=1.0.9=h7f98852_6 + - brotli-bin=1.0.9=h7f98852_6 + - brotlipy=0.7.0=py37h5e8e339_1003 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.5.18.1=ha878542_0 + - cffi=1.14.6=py37hc58025e_0 + - configparser=5.2.0=pyhd8ed1ab_0 + - cryptography=36.0.1=py37hf1a17b8_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - cython=0.29.27=py37hcd2ae1e_0 + - dbus=1.13.6=h48d8840_2 + - debugpy=1.5.1=py37hcd2ae1e_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - easydict=1.9=py_0 + - entrypoints=0.4=pyhd8ed1ab_0 + - expat=2.4.6=h27087fc_0 + - flit-core=3.7.0=pyhd8ed1ab_0 + - fontconfig=2.13.96=ha180cfb_0 + - fonttools=4.29.1=py37h5e8e339_0 + - freetype=2.10.4=h0708190_1 + - gettext=0.19.8.1=h0b5b191_1005 + - giflib=5.2.1=h36c2ea0_2 + - glib=2.68.4=h9c3ff4c_0 + - glib-tools=2.68.4=h9c3ff4c_0 + - gst-plugins-base=1.18.5=hf529b03_0 + - gstreamer=1.18.5=h76c114f_0 + - icu=68.2=h9c3ff4c_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib_resources=5.4.0=pyhd8ed1ab_0 + - ipykernel=6.9.1=py37h6531663_0 + - ipython=7.31.1=py37h89c1867_0 + - ipython_genutils=0.2.0=py_1 + - jbig=2.1=h7f98852_2003 + - jedi=0.18.1=py37h89c1867_0 + - jinja2=3.0.3=pyhd8ed1ab_0 + - jpeg=9e=h7f98852_0 + - json5=0.9.5=pyh9f0ad1d_0 + - jsonschema=4.4.0=pyhd8ed1ab_0 + - jupyter_client=7.1.2=pyhd8ed1ab_0 + - jupyter_core=4.9.2=py37h89c1867_0 + - jupyter_server=1.13.5=pyhd8ed1ab_1 + - jupyterlab=3.2.9=pyhd8ed1ab_0 + - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 + - jupyterlab_server=2.10.3=pyhd8ed1ab_0 + - kiwisolver=1.3.2=py37h2527ec5_1 + - krb5=1.19.2=hcc1bbae_3 + - lcms2=2.12=hddcbb42_0 
+ - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - lerc=3.0=h9c3ff4c_0 + - libblas=3.9.0=13_linux64_openblas + - libbrotlicommon=1.0.9=h7f98852_6 + - libbrotlidec=1.0.9=h7f98852_6 + - libbrotlienc=1.0.9=h7f98852_6 + - libcblas=3.9.0=13_linux64_openblas + - libclang=11.1.0=default_ha53f305_1 + - libdeflate=1.10=h7f98852_0 + - libedit=3.1.20191231=he28a2e2_2 + - libevent=2.1.10=h9b69904_4 + - libffi=3.3=h58526e2_2 + - libgcc-ng=11.2.0=h1d223b6_12 + - libgfortran-ng=11.2.0=h69a702a_12 + - libgfortran5=11.2.0=h5c6108e_12 + - libglib=2.68.4=h3e27bee_0 + - libgomp=11.2.0=h1d223b6_12 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=13_linux64_openblas + - libllvm11=11.1.0=hf817b99_3 + - libogg=1.3.4=h7f98852_1 + - libopenblas=0.3.18=pthreads_h8fe5266_0 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.37=h21135ba_2 + - libpq=13.5=hd57d9b9_1 + - libsodium=1.0.18=h36c2ea0_1 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=11.2.0=he4da1e4_12 + - libtiff=4.3.0=h542a066_3 + - libuuid=2.32.1=h7f98852_1000 + - libvorbis=1.3.7=h9c3ff4c_0 + - libwebp=1.2.2=h3452ae3_0 + - libwebp-base=1.2.2=h7f98852_1 + - libxcb=1.13=h7f98852_1004 + - libxkbcommon=1.0.3=he3ba5ed_0 + - libxml2=2.9.12=h72842e0_0 + - libzlib=1.2.11=h36c2ea0_1013 + - llvmlite=0.38.0=py37h0761922_1 + - lz4-c=1.9.3=h9c3ff4c_1 + - markupsafe=2.1.0=py37h540881e_0 + - matplotlib=3.5.1=py37h89c1867_0 + - matplotlib-base=3.5.1=py37h1058ff1_0 + - matplotlib-inline=0.1.3=pyhd8ed1ab_0 + - mistune=0.8.4=py37h5e8e339_1005 + - munkres=1.1.4=pyh9f0ad1d_0 + - mysql-common=8.0.28=ha770c72_0 + - mysql-libs=8.0.28=hfa10184_0 + - nbclassic=0.3.5=pyhd8ed1ab_0 + - nbclient=0.5.11=pyhd8ed1ab_0 + - nbconvert=6.4.2=py37h89c1867_0 + - nbformat=5.1.3=pyhd8ed1ab_0 + - ncurses=6.2=h58526e2_4 + - nest-asyncio=1.5.4=pyhd8ed1ab_0 + - nomkl=1.0=h5ca1d4c_0 + - notebook=6.4.8=pyha770c72_0 + - nspr=4.32=h9c3ff4c_1 + - nss=3.74=hb5efdd6_0 + - numba=0.55.1=py37h2d894fd_0 + - numexpr=2.8.0=py37hfe5f03c_101 + - numpy=1.21.5=py37hf2998dd_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1o=h166bdaf_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandas=1.3.5=py37h8c16a72_0 + - pandoc=2.17.1.1=ha770c72_0 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.3=pyhd8ed1ab_0 + - patsy=0.5.2=pyhd8ed1ab_0 + - pcre=8.45=h9c3ff4c_0 + - pexpect=4.8.0=pyh9f0ad1d_2 + - pickleshare=0.7.5=py_1003 + - pip=22.0.3=pyhd8ed1ab_0 + - prometheus_client=0.13.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.27=pyha770c72_0 + - pthread-stubs=0.4=h36c2ea0_1001 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.11.2=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.7=pyhd8ed1ab_0 + - pyqt=5.12.3=py37h89c1867_8 + - pyqt-impl=5.12.3=py37hac37412_8 + - pyqt5-sip=4.19.18=py37hcd2ae1e_8 + - pyqtchart=5.12=py37he336c9b_8 + - pyqtwebengine=5.12.1=py37he336c9b_8 + - pyrsistent=0.18.1=py37h5e8e339_0 + - pysocks=1.7.1=py37h89c1867_4 + - python=3.7.9=hffdb5ce_100_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.7=2_cp37m + - pytz=2021.3=pyhd8ed1ab_0 + - pyzmq=22.3.0=py37h336d617_1 + - qt=5.12.9=hda022c4_4 + - readline=8.1=h46c0cb4_0 + - resampy=0.2.2=py_0 + - scipy=1.7.3=py37hf2a6cf1_0 + - seaborn=0.11.2=hd8ed1ab_0 + - seaborn-base=0.11.2=pyhd8ed1ab_0 + - send2trash=1.8.0=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sniffio=1.2.0=py37h89c1867_2 + - sqlite=3.37.0=h9cd32fc_0 + - statsmodels=0.13.2=py37hb1e94ed_0 + - terminado=0.13.1=py37h89c1867_0 + - testpath=0.5.0=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - tornado=6.1=py37h5e8e339_2 + - traitlets=5.1.1=pyhd8ed1ab_0 + - 
typing_extensions=4.1.1=pyha770c72_0 + - unicodedata2=14.0.0=py37h5e8e339_0 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - webencodings=0.5.1=py_1 + - websocket-client=1.2.3=pyhd8ed1ab_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xz=5.2.5=h516909a_1 + - zeromq=4.3.4=h9c3ff4c_1 + - zlib=1.2.11=h36c2ea0_1013 + - zstd=1.5.2=ha95c52a_0 + - pip: + - absl-py==1.0.0 + - addict==2.4.0 + - aiohttp==3.8.1 + - aiosignal==1.2.0 + - analytics-python==1.4.0 + - appdirs==1.4.4 + - asgiref==3.5.2 + - async-timeout==4.0.2 + - asynctest==0.13.0 + - audioread==2.1.9 + - backoff==1.10.0 + - bcrypt==3.2.2 + - beautifulsoup4==4.10.0 + - cachetools==4.2.4 + - certifi==2021.10.8 + - charset-normalizer==2.0.9 + - click==8.0.3 + - decorator==4.4.2 + - decord==0.6.0 + - einops==0.4.0 + - fastapi==0.78.0 + - ffmpeg==1.4 + - ffmpy==0.3.0 + - filelock==3.4.0 + - frozenlist==1.3.0 + - fsspec==2022.1.0 + - future==0.18.2 + - fvcore==0.1.5.post20221221 + - gdown==4.2.0 + - google-auth==2.3.3 + - google-auth-oauthlib==0.4.6 + - gradio==3.0.2 + - grpcio==1.43.0 + - h11==0.13.0 + - imageio==2.23.0 + - imageio-ffmpeg==0.4.7 + - importlib-metadata==4.10.0 + - iopath==0.1.10 + - ipywidgets==8.0.4 + - joblib==1.1.0 + - jupyterlab-widgets==3.0.5 + - librosa==0.9.1 + - linkify-it-py==1.0.3 + - lmdb==1.4.1 + - markdown==3.3.6 + - markdown-it-py==2.1.0 + - mdit-py-plugins==0.3.0 + - mdurl==0.1.1 + - mmaction2==0.24.1 + - mmcv==1.7.0 + - mmcv-full==1.4.6 + - monotonic==1.6 + - moviepy==1.0.3 + - multidict==5.2.0 + - oauthlib==3.1.1 + - opencv-contrib-python==4.7.0.68 + - opencv-python==4.5.5.62 + - orjson==3.6.8 + - paramiko==2.11.0 + - pillow==8.3.2 + - pooch==1.6.0 + - portalocker==2.7.0 + - proglog==0.1.10 + - protobuf==3.19.3 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pycryptodome==3.14.1 + - pydantic==1.9.0 + - pydeprecate==0.3.1 + - pydub==0.25.1 + - pynacl==1.5.0 + - python-box==6.0.2 + - python-multipart==0.0.5 + - python-speech-features==0.6 + - pytorch-lightning==1.5.8 + - pyyaml==6.0 + - requests==2.26.0 + - requests-oauthlib==1.3.0 + - rsa==4.8 + - scenedetect==0.5.6.1 + - scikit-learn==1.0.1 + - setuptools==60.9.3 + - soundfile==0.10.3.post1 + - soupsieve==2.3.1 + - starlette==0.19.1 + - tabulate==0.9.0 + - tensorboard==2.7.0 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.2.0 + - threadpoolctl==3.0.0 + - timm==0.4.5 + - torch==1.10.1 + - torchaudio==0.10.1 + - torchlibrosa==0.0.9 + - torchmetrics==0.7.0 + - torchvision==0.11.2 + - tqdm==4.62.3 + - typing-extensions==4.0.1 + - uc-micro-py==1.0.1 + - urllib3==1.26.7 + - uvicorn==0.17.6 + - warmup-scheduler-pytorch==0.1.2 + - werkzeug==2.0.2 + - wget==3.2 + - widgetsnbextension==4.0.5 + - yacs==0.1.8 + - yapf==0.32.0 + - yarl==1.7.2 + - youtube-dl==2021.12.17 + - zipp==3.6.0 diff --git a/legacy/talkNet_multi_multicard.py b/legacy/talkNet_multi_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..bb60a1796690dad773b7d46e005fb17bb35c13b4 --- /dev/null +++ b/legacy/talkNet_multi_multicard.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss_multi import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + +import pytorch_lightning as pl +from torch import distributed as dist + + +class talkNet(pl.LightningModule): + + def __init__(self, cfg): + super(talkNet, self).__init__() + self.model = talkNetModel().cuda() + self.cfg = cfg + 
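In the multi-speaker `training_step` that follows, each batch carries `S` face tracks per clip but only one audio stream, so the audio is broadcast to every track and the batch and speaker axes are folded together before the encoders run. A shape-only sketch with dummy tensors (the MFCC and crop sizes here are placeholders, not the exact dataloader output):

```
import torch

B, S, T = 2, 3, 100                     # clips, speaker tracks per clip, video frames
audio = torch.randn(B, 1, 4 * T, 13)    # one shared audio stream per clip
video = torch.randn(B, S, T, 112, 112)  # one face-crop track per speaker
labels = torch.randint(0, 2, (B, S, T))

audio = audio.repeat(1, S, 1, 1).view(B * S, 4 * T, 13)  # copy audio to every track
video = video.view(B * S, T, 112, 112)                   # fold (B, S) into one axis
labels = labels.view(B * S, T).reshape(-1)               # one label per track frame

print(audio.shape, video.shape, labels.shape)
# torch.Size([6, 400, 13]) torch.Size([6, 100, 112, 112]) torch.Size([600])
```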
self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + return {"optimizer": optimizer, "lr_scheduler": scheduler} + + def training_step(self, batch, batch_idx): + audioFeature, visualFeature, labels, masks = batch + b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2] + audioFeature = audioFeature.repeat(1, s, 1, 1) + audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:]) + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + + audioEmbed = self.model.forward_audio_frontend(audioFeature) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels.reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks) + nlossA = self.lossA.forward(outsA, labels, masks) + nlossV = self.lossV.forward(outsV, labels, masks) + loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + return loss + + def training_epoch_end(self, training_step_outputs): + self.saveParameters( + os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch))) + + def evaluate_network(self, loader): + self.eval() + predScores = [] + self.model = self.model.cuda() + self.lossAV = self.lossAV.cuda() + self.lossA = self.lossA.cuda() + self.lossV = self.lossV.cuda() + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader): + with torch.no_grad(): + b, s = visualFeature.shape[0], visualFeature.shape[1] + t = visualFeature.shape[2] + audioFeature = audioFeature.repeat(1, s, 1, 1) + audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:]) + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + audioEmbed = self.model.forward_audio_frontend(audioFeature.cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature.cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels.reshape((-1)).cuda() + outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1) + labels = labels.view(b, s, t)[:, 0, :].view(b * t) + masks = masks.view(b, s, t)[:, 0, :].view(b * t) + _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks) + predScore = predScore.detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = 
scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) diff --git a/legacy/talkNet_multicard.py b/legacy/talkNet_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..9ac7303feed93cc2595b32c9f6cd0a306229e2be --- /dev/null +++ b/legacy/talkNet_multicard.py @@ -0,0 +1,146 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + +import pytorch_lightning as pl +from torch import distributed as dist + + +class talkNet(pl.LightningModule): + + def __init__(self, cfg): + super(talkNet, self).__init__() + self.cfg = cfg + self.model = talkNetModel() + self.lossAV = lossAV() + self.lossA = lossA() + self.lossV = lossV() + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + return {"optimizer": optimizer, "lr_scheduler": scheduler} + + def training_step(self, batch, batch_idx): + audioFeature, visualFeature, labels = batch + audioEmbed = self.model.forward_audio_frontend(audioFeature[0]) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature[0]) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels[0].reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels) + nlossA = self.lossA.forward(outsA, labels) + nlossV = self.lossV.forward(outsV, labels) + loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + + return loss + + def training_epoch_end(self, training_step_outputs): + self.saveParameters( + os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch))) + + def validation_step(self, batch, batch_idx): + audioFeature, visualFeature, labels, indices = batch + audioEmbed = self.model.forward_audio_frontend(audioFeature[0]) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0]) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, 
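The mAP above is recovered by slicing the stringified `stdout` at fixed offsets (`split(' ')[2][:5]`), which silently yields garbage if the evaluation script's output format shifts. A slightly more defensive sketch, assuming only that `get_ava_active_speaker_performance.py` prints the mAP as a decimal number somewhere in its output:

```
import re
import subprocess


def run_ava_eval(eval_orig, eval_csv):
    """Run the AVA evaluation script and pull the first decimal number out of its output."""
    cmd = ["python", "-O", "utils/get_ava_active_speaker_performance.py",
           "-g", eval_orig, "-p", eval_csv]
    out = subprocess.run(cmd, capture_output=True, text=True).stdout
    match = re.search(r"\d+\.\d+", out)
    if match is None:
        raise RuntimeError("could not find an mAP value in: %r" % out)
    return float(match.group(0))
```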
visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)) + loss, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, -1:].detach().cpu().numpy() + # self.log("val_loss", loss) + + return predScore + + def validation_epoch_end(self, validation_step_outputs): + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + predScores = [] + + for out in validation_step_outputs: # batch size =1 + predScores.extend(out) + + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + print(len(evalRes), len(predScores), len(evalLines)) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + print("validation mAP: {}".format(mAP)) + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path, map_location='cpu') + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) + + def evaluate_network(self, loader): + self.eval() + self.model = self.model.cuda() + self.lossAV = self.lossAV.cuda() + self.lossA = self.lossA.cuda() + self.lossV = self.lossV.cuda() + predScores = [] + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP diff --git a/legacy/talkNet_orig.py b/legacy/talkNet_orig.py new file mode 100755 index 
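`loadParameters` in these modules copies tensors key by key and falls back to stripping a `module.` prefix, the usual mismatch when a checkpoint was saved from a `DataParallel`/`DistributedDataParallel` wrapper. A compact sketch of the same remapping idea (the prefix and `strict=False` are illustrative choices, not a fixed rule of this codebase):

```
import torch


def remap_state_dict(path, model, prefix="module."):
    """Load a checkpoint and drop a wrapper prefix from its keys before loading."""
    state = torch.load(path, map_location="cpu")
    state = {k[len(prefix):] if k.startswith(prefix) else k: v for k, v in state.items()}
    missing, unexpected = model.load_state_dict(state, strict=False)
    print("missing:", missing, "unexpected:", unexpected)
    return model
```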
0000000000000000000000000000000000000000..43d4d8d1190e5852429ebd58b848e41d91af528b --- /dev/null +++ b/legacy/talkNet_orig.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + + +class talkNet(nn.Module): + + def __init__(self, lr=0.0001, lrDecay=0.95, **kwargs): + super(talkNet, self).__init__() + self.model = talkNetModel().cuda() + self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + self.optim = torch.optim.Adam(self.parameters(), lr=lr) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size=1, gamma=lrDecay) + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def train_network(self, loader, epoch, **kwargs): + self.train() + self.scheduler.step(epoch - 1) + index, top1, loss = 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): + self.zero_grad() + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels[0].reshape((-1)).cuda() # Loss + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels) + nlossA = self.lossA.forward(outsA, labels) + nlossV = self.lossV.forward(outsV, labels) + nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + loss += nloss.detach().cpu().numpy() + top1 += prec + nloss.backward() + self.optim.step() + index += len(labels) + sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ + " [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \ + " Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), 100 * (top1/index))) + sys.stderr.flush() + sys.stdout.write("\n") + return loss / num, lr + + def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): + self.eval() + predScores = [] + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, 
capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) diff --git a/legacy/trainTalkNet_multicard.py b/legacy/trainTalkNet_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..5f698d5a76f0f7ae0c8f2f31d776678f3ef8be8d --- /dev/null +++ b/legacy/trainTalkNet_multicard.py @@ -0,0 +1,171 @@ +import time, os, torch, argparse, warnings, glob + +from utils.tools import * +from dlhammer import bootstrap +import pytorch_lightning as pl +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + + +class MyCollator(object): + + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = self.cfg.MODEL.CLIP_LENGTH + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat( + [i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
+ return padded_audio, padded_video, padded_labels, padded_masks + + +class DataPrep(pl.LightningDataModule): + + def __init__(self, cfg): + self.cfg = cfg + + def train_dataloader(self): + cfg = self.cfg + + if self.cfg.MODEL.NAME == "baseline": + from dataLoader import train_loader, val_loader + loader = train_loader(trialFileName = cfg.trainTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(cfg.visualPathAVA, 'train'), \ + batchSize=2500 + ) + elif self.cfg.MODEL.NAME == "multi": + from dataLoader_multiperson import train_loader, val_loader + loader = train_loader(trialFileName = cfg.trainTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(cfg.visualPathAVA, 'train'), \ + num_speakers=cfg.MODEL.NUM_SPEAKERS, + ) + if cfg.MODEL.NAME == "baseline": + trainLoader = torch.utils.data.DataLoader( + loader, + batch_size=1, + shuffle=True, + num_workers=4, + ) + elif cfg.MODEL.NAME == "multi": + collator = MyCollator(cfg) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=1, + shuffle=True, + num_workers=4, + collate_fn=collator) + + return trainLoader + + def val_dataloader(self): + cfg = self.cfg + loader = val_loader(trialFileName = cfg.evalTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , cfg.evalDataType), \ + visualPath = os.path.join(cfg.visualPathAVA, cfg.evalDataType), \ + ) + valLoader = torch.utils.data.DataLoader(loader, + batch_size=cfg.VAL.BATCH_SIZE, + shuffle=False, + num_workers=16) + return valLoader + + +def main(): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + cfg = bootstrap(print_cfg=False) + print(cfg) + + warnings.filterwarnings("ignore") + seed_everything(42, workers=True) + + cfg = init_args(cfg) + + # checkpoint_callback = ModelCheckpoint(dirpath=os.path.join(cfg.WORKSPACE, "model"), + # save_top_k=-1, + # filename='{epoch}') + + data = DataPrep(cfg) + + trainer = Trainer( + gpus=int(cfg.TRAIN.TRAINER_GPU), + precision=32, + # callbacks=[checkpoint_callback], + max_epochs=25, + replace_sampler_ddp=True) + # val_trainer = Trainer(deterministic=True, num_sanity_val_steps=-1, gpus=1) + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + # if cfg.RESUME: + # modelfiles = glob.glob('%s/model_0*.model' % cfg.modelSavePath) + # modelfiles.sort() + # if len(modelfiles) >= 1: + # print("Model %s loaded from previous state!" 
% modelfiles[-1]) + # epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + # s = talkNet(cfg) + # s.loadParameters(modelfiles[-1]) + # else: + # epoch = 1 + # s = talkNet(cfg) + epoch = 1 + if cfg.MODEL.NAME == "baseline": + from talkNet_multicard import talkNet + elif cfg.MODEL.NAME == "multi": + from talkNet_multi import talkNet + + s = talkNet(cfg) + + # scoreFile = open(cfg.scoreSavePath, "a+") + + trainer.fit(s, train_dataloaders=data.train_dataloader()) + + modelfiles = glob.glob('%s/*.pth' % os.path.join(cfg.WORKSPACE, "model")) + + modelfiles.sort() + for path in modelfiles: + s.loadParameters(path) + prec = trainer.validate(s, data.val_dataloader()) + + # if epoch % cfg.testInterval == 0: + # s.saveParameters(cfg.modelSavePath + "/model_%04d.model" % epoch) + # trainer.validate(dataloaders=valLoader) + # print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, mAP %2.2f%%" % (epoch, mAPs[-1])) + # scoreFile.write("%d epoch, LOSS %f, mAP %2.2f%%\n" % (epoch, loss, mAPs[-1])) + # scoreFile.flush() + + +if __name__ == '__main__': + main() diff --git a/legacy/train_multi.py b/legacy/train_multi.py new file mode 100755 index 0000000000000000000000000000000000000000..951f163f6bcd37b93748b222a84a9f9c1d34648e --- /dev/null +++ b/legacy/train_multi.py @@ -0,0 +1,156 @@ +import time, os, torch, argparse, warnings, glob + +from dataLoader_multiperson import train_loader, val_loader +from utils.tools import * +from talkNet_multi import talkNet + + +def collate_fn_padding(data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = 200 + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
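Both collators in this diff pad every clip in a batch to the longest clip, keep a 0/1 mask marking real frames, and cap the result at `cut_limit` video frames (audio is capped at `cut_limit * 4` feature frames). A self-contained sketch of the padding-plus-mask idea on dummy per-speaker label tensors; the mask is built inside the function here for illustration, whereas the loaders above get it from the dataset:

```
import torch


def pad_and_mask(label_clips, cut_limit=200):
    """Pad (num_speakers, T_i) tensors to a common length and return a validity mask."""
    max_len = min(max(t.shape[1] for t in label_clips), cut_limit)
    padded, masks = [], []
    for lab in label_clips:
        lab = lab[:, :max_len]                                   # truncate long clips
        pad = lab.new_zeros((lab.shape[0], max_len - lab.shape[1]))
        padded.append(torch.cat([lab, pad], dim=1))              # zero-pad short clips
        mask = torch.zeros_like(padded[-1])
        mask[:, :lab.shape[1]] = 1                               # 1 = real frame, 0 = padding
        masks.append(mask)
    return torch.stack(padded), torch.stack(masks)


labels = [torch.ones(3, 120), torch.ones(3, 80)]                 # two clips, 3 speakers each
padded_labels, padded_masks = pad_and_mask(labels)
print(padded_labels.shape, padded_masks.sum(dim=-1))             # (2, 3, 120), valid frames per track
```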
+ # print(padded_audio.shape, padded_video.shape, padded_labels.shape, padded_masks.shape) + return padded_audio, padded_video, padded_labels, padded_masks + + +def main(): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + warnings.filterwarnings("ignore") + + parser = argparse.ArgumentParser(description="TalkNet Training") + # Training details + parser.add_argument('--lr', type=float, default=0.0001, help='Learning rate') + parser.add_argument('--lrDecay', type=float, default=0.95, help='Learning rate decay rate') + parser.add_argument('--maxEpoch', type=int, default=25, help='Maximum number of epochs') + parser.add_argument('--testInterval', + type=int, + default=1, + help='Test and save every [testInterval] epochs') + parser.add_argument( + '--batchSize', + type=int, + default=2500, + help= + 'Dynamic batch size, default is 2500 frames, other batchsize (such as 1500) will not affect the performance' + ) + parser.add_argument('--batch_size', type=int, default=1, help='batch_size') + parser.add_argument('--num_speakers', type=int, default=5, help='num_speakers') + parser.add_argument('--nDataLoaderThread', type=int, default=4, help='Number of loader threads') + # Data path + parser.add_argument('--dataPathAVA', + type=str, + default="/data08/AVA", + help='Save path of AVA dataset') + parser.add_argument('--savePath', type=str, default="exps/exp1") + # Data selection + parser.add_argument('--evalDataType', + type=str, + default="val", + help='Only for AVA, to choose the dataset for evaluation, val or test') + # For download dataset only, for evaluation only + parser.add_argument('--downloadAVA', + dest='downloadAVA', + action='store_true', + help='Only download AVA dataset and do related preprocess') + parser.add_argument('--evaluation', + dest='evaluation', + action='store_true', + help='Only do evaluation by using pretrained model [pretrain_AVA.model]') + args = parser.parse_args() + # Data loader + args = init_args(args) + + if args.downloadAVA == True: + preprocess_AVA(args) + quit() + + loader = train_loader(trialFileName = args.trainTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , 'train'), \ + visualPath = os.path.join(args.visualPathAVA, 'train'), \ + # num_speakers = args.num_speakers, \ + **vars(args)) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.nDataLoaderThread, + collate_fn=collate_fn_padding) + + loader = val_loader(trialFileName = args.evalTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , args.evalDataType), \ + visualPath = os.path.join(args.visualPathAVA, args.evalDataType), \ + # num_speakers = args.num_speakers, \ + **vars(args)) + valLoader = torch.utils.data.DataLoader(loader, batch_size=1, shuffle=False, num_workers=16) + + if args.evaluation == True: + download_pretrain_model_AVA() + s = talkNet(**vars(args)) + s.loadParameters('pretrain_AVA.model') + print("Model %s loaded from previous state!" % ('pretrain_AVA.model')) + mAP = s.evaluate_network(loader=valLoader, **vars(args)) + print("mAP %2.2f%%" % (mAP)) + quit() + + modelfiles = glob.glob('%s/model_0*.model' % args.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!" 
% modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = talkNet(epoch=epoch, **vars(args)) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = talkNet(epoch=epoch, **vars(args)) + + mAPs = [] + scoreFile = open(args.scoreSavePath, "a+") + + while (1): + loss, lr = s.train_network(epoch=epoch, loader=trainLoader, **vars(args)) + + if epoch % args.testInterval == 0: + s.saveParameters(args.modelSavePath + "/model_%04d.model" % epoch) + mAPs.append(s.evaluate_network(epoch=epoch, loader=valLoader, **vars(args))) + print(time.strftime("%Y-%m-%d %H:%M:%S"), + "%d epoch, mAP %2.2f%%, bestmAP %2.2f%%" % (epoch, mAPs[-1], max(mAPs))) + scoreFile.write("%d epoch, LR %f, LOSS %f, mAP %2.2f%%, bestmAP %2.2f%%\n" % + (epoch, lr, loss, mAPs[-1], max(mAPs))) + scoreFile.flush() + + if epoch >= args.maxEpoch: + quit() + + epoch += 1 + + +if __name__ == '__main__': + main() diff --git a/loconet.py b/loconet.py new file mode 100755 index 0000000000000000000000000000000000000000..835826dd4b777d4a4d79f3d66e76191f03434da0 --- /dev/null +++ b/loconet.py @@ -0,0 +1,182 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss_multi import lossAV, lossA, lossV +from model.loconet_encoder import locoencoder + +import torch.distributed as dist +from xxlib.utils.distributed import all_gather, all_reduce + + +class Loconet(nn.Module): + + def __init__(self, cfg): + super(Loconet, self).__init__() + self.cfg = cfg + self.model = locoencoder(cfg) + self.lossAV = lossAV() + self.lossA = lossA() + self.lossV = lossV() + + def forward(self, audioFeature, visualFeature, labels, masks): + b, s, t = visualFeature.shape[:3] + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + + audioEmbed = self.model.forward_audio_frontend(audioFeature) # B, C, T, 4 + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed = audioEmbed.repeat(s, 1, 1) + + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + + labels = labels.reshape((-1)) + masks = masks.reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks) + nlossA = self.lossA.forward(outsA, labels, masks) + nlossV = self.lossV.forward(outsV, labels, masks) + + nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + + num_frames = masks.sum() + return nloss, prec, num_frames + + +class loconet(nn.Module): + + def __init__(self, cfg, rank=None, device=None): + super(loconet, self).__init__() + self.cfg = cfg + self.rank = rank + if rank != None: + self.rank = rank + self.device = device + + self.model = Loconet(cfg).to(device) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + self.model = nn.parallel.DistributedDataParallel(self.model, + device_ids=[rank], + output_device=rank, + find_unused_parameters=False) + self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.SOLVER.BASE_LR) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + else: + self.model = locoencoder(cfg).cuda() + self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + + print( + 
time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def train_network(self, epoch, loader): + self.model.train() + self.scheduler.step(epoch - 1) + index, top1, loss = 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + loader.sampler.set_epoch(epoch) + device = self.device + + pbar = enumerate(loader, start=1) + if self.rank == 0: + pbar = tqdm.tqdm(pbar, total=loader.__len__()) + + for num, (audioFeature, visualFeature, labels, masks) in pbar: + + audioFeature = audioFeature.to(device) + visualFeature = visualFeature.to(device) + labels = labels.to(device) + masks = masks.to(device) + nloss, prec, num_frames = self.model( + audioFeature, + visualFeature, + labels, + masks, + ) + + self.optim.zero_grad() + nloss.backward() + self.optim.step() + + [nloss, prec, num_frames] = all_reduce([nloss, prec, num_frames], average=False) + top1 += prec.detach().cpu().numpy() + loss += nloss.detach().cpu().numpy() + index += int(num_frames.detach().cpu().item()) + if self.rank == 0: + pbar.set_postfix( + dict(epoch=epoch, + lr=lr, + loss=loss / (num * self.cfg.NUM_GPUS), + acc=(top1 / index))) + dist.barrier() + return loss / num, lr + + def evaluate_network(self, epoch, loader): + self.eval() + predScores = [] + evalCsvSave = os.path.join(self.cfg.WORKSPACE, "{}_res.csv".format(epoch)) + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader): + with torch.no_grad(): + audioFeature = audioFeature.cuda() + visualFeature = visualFeature.cuda() + labels = labels.cuda() + masks = masks.cuda() + b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2] + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + audioEmbed = self.model.forward_audio_frontend(audioFeature) + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed = audioEmbed.repeat(s, 1, 1) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s) + labels = labels.reshape((-1)) + masks = masks.reshape((-1)) + outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1) + labels = labels.view(b, s, t)[:, 0, :].view(b * t).cuda() + masks = masks.view(b, s, t)[:, 0, :].view(b * t) + _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path, map_location='cpu') + if self.rank != None: + info = self.load_state_dict(loadedState) + else: + new_state = {} + + for k, v in 
loadedState.items(): + new_state[k.replace("model.module.", "")] = v + info = self.load_state_dict(new_state, strict=False) + print(info) diff --git a/loss_multi.py b/loss_multi.py new file mode 100755 index 0000000000000000000000000000000000000000..47edcba1e3251d9cff47efd6e6cc2a54dde88795 --- /dev/null +++ b/loss_multi.py @@ -0,0 +1,72 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import utils.distributed as du + + +class lossAV(nn.Module): + + def __init__(self): + super(lossAV, self).__init__() + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(256, 2) + + def forward(self, x, labels=None, masks=None): + x = x.squeeze(1) + x = self.FC(x) + if labels == None: + predScore = x[:, 1] + predScore = predScore.t() + predScore = predScore.view(-1).detach().cpu().numpy() + return predScore + else: + nloss = self.criterion(x, labels) * masks + + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + + predScore = F.softmax(x, dim=-1) + predLabel = torch.round(F.softmax(x, dim=-1))[:, 1] + correctNum = ((predLabel == labels) * masks).sum().float() + return nloss, predScore, predLabel, correctNum + + +class lossA(nn.Module): + + def __init__(self): + super(lossA, self).__init__() + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels, masks=None): + x = x.squeeze(1) + x = self.FC(x) + nloss = self.criterion(x, labels) * masks + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + #nloss = torch.sum(nloss) / torch.sum(masks) + return nloss + + +class lossV(nn.Module): + + def __init__(self): + super(lossV, self).__init__() + + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels, masks=None): + x = x.squeeze(1) + x = self.FC(x) + nloss = self.criterion(x, labels) * masks + # nloss = torch.sum(nloss) / torch.sum(masks) + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + return nloss diff --git a/metrics/AverageMeter.py b/metrics/AverageMeter.py new file mode 100755 index 0000000000000000000000000000000000000000..d5b1bc57d204e76690d92878a46584de98a4f1bd --- /dev/null +++ b/metrics/AverageMeter.py @@ -0,0 +1,18 @@ +#taken from pytorch imagenet example +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + diff --git a/metrics/__pycache__/.nfs000000035f4a8257000000eb b/metrics/__pycache__/.nfs000000035f4a8257000000eb new file mode 100644 index 0000000000000000000000000000000000000000..5c481da38ae427d717e3fae5b8a12b0efc12ae3b Binary files /dev/null and b/metrics/__pycache__/.nfs000000035f4a8257000000eb differ diff --git a/metrics/__pycache__/AverageMeter.cpython-36.pyc b/metrics/__pycache__/AverageMeter.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..d928a415daff76824c9e873f38db2928dd1dc58d Binary files /dev/null and b/metrics/__pycache__/AverageMeter.cpython-36.pyc differ diff --git a/metrics/__pycache__/AverageMeter.cpython-38.pyc 
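The three losses above share one masked pattern: per-frame cross-entropy with `reduction='none'`, zeroed on padded frames, then normalised by the number of valid frames (which is all-reduced across GPUs during training so every rank divides by the same count). A single-process sketch of that normalisation:

```
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(reduction="none")

logits = torch.randn(10, 2)                    # 10 frames, 2 classes (not speaking / speaking)
labels = torch.randint(0, 2, (10,))
masks = torch.tensor([1.] * 7 + [0.] * 3)      # last 3 frames are padding

per_frame = criterion(logits, labels) * masks  # padded frames contribute zero loss
loss = per_frame.sum() / masks.sum()           # average over valid frames only
```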
b/metrics/__pycache__/AverageMeter.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90592f04e63826430507fd72bdb195cf923e2c7f Binary files /dev/null and b/metrics/__pycache__/AverageMeter.cpython-38.pyc differ diff --git a/metrics/__pycache__/accuracy.cpython-36.pyc b/metrics/__pycache__/accuracy.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..4f5c7021ffc19382a03dd31b665e2e1ed66ea090 Binary files /dev/null and b/metrics/__pycache__/accuracy.cpython-36.pyc differ diff --git a/metrics/__pycache__/accuracy.cpython-38.pyc b/metrics/__pycache__/accuracy.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28af79a9c47d1d10af6a5a364c21f8331be772ee Binary files /dev/null and b/metrics/__pycache__/accuracy.cpython-38.pyc differ diff --git a/metrics/accuracy.py b/metrics/accuracy.py new file mode 100755 index 0000000000000000000000000000000000000000..03cc9ef95d02130276d02b2a68526a8e30baa1ab --- /dev/null +++ b/metrics/accuracy.py @@ -0,0 +1,20 @@ +import torch + +accuracy = lambda output,target : acc_topk(output, target)[0] + +#taken from pytorch imagenet example +def acc_topk(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(1.0 / batch_size)) + return res \ No newline at end of file diff --git a/model/.DS_Store b/model/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e734ea84fafb31cbe2b4f456dc438a531432a91e Binary files /dev/null and b/model/.DS_Store differ diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59239dd9719a28db4a512938c678f4d804630c2d --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,5 @@ +from model.transformer.position_encoding import PositionalEncoding +from model.transformer.transformer import Transformer +from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer +from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer +from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask diff --git a/model/__pycache__/__init__.cpython-36.pyc b/model/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eccec07dc8794bd26846b65adc7193f2fc477490 Binary files /dev/null and b/model/__pycache__/__init__.cpython-36.pyc differ diff --git a/model/__pycache__/__init__.cpython-37.pyc b/model/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27a223bf75d5e07c4bf849e58178a865f155dc0a Binary files /dev/null and b/model/__pycache__/__init__.cpython-37.pyc differ diff --git a/model/__pycache__/attentionLayer.cpython-37.pyc b/model/__pycache__/attentionLayer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d40e2eb05278a0f315581e737313adacb3540e72 Binary files /dev/null and b/model/__pycache__/attentionLayer.cpython-37.pyc differ diff --git a/model/__pycache__/convLayer.cpython-37.pyc b/model/__pycache__/convLayer.cpython-37.pyc new file mode 100644 index 
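`acc_topk` and `AverageMeter` above are the stock ImageNet-example utilities; a quick usage sketch (the import paths assume the repository root is on `PYTHONPATH`):

```
import torch
from metrics.accuracy import acc_topk          # as defined above
from metrics.AverageMeter import AverageMeter

meter = AverageMeter()
logits = torch.randn(32, 2)
target = torch.randint(0, 2, (32,))
top1, = acc_topk(logits, target, topk=(1,))    # fraction of correct top-1 predictions
meter.update(top1.item(), n=target.size(0))
print(meter.avg)
```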
0000000000000000000000000000000000000000..10f90442a4c8e2d5a29476edf4620b016738711b Binary files /dev/null and b/model/__pycache__/convLayer.cpython-37.pyc differ diff --git a/model/__pycache__/loconet_encoder.cpython-37.pyc b/model/__pycache__/loconet_encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a222c2bfc64a7f9f5ef1d84917c1ce66f5f80c96 Binary files /dev/null and b/model/__pycache__/loconet_encoder.cpython-37.pyc differ diff --git a/model/__pycache__/position_encoding.cpython-36.pyc b/model/__pycache__/position_encoding.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e58bf2dada185cabe183810dfc4162b67eddcd8 Binary files /dev/null and b/model/__pycache__/position_encoding.cpython-36.pyc differ diff --git a/model/__pycache__/talkNetModel.cpython-37.pyc b/model/__pycache__/talkNetModel.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..925e49a9132958b0f48da11987a3cedfeadf95a0 Binary files /dev/null and b/model/__pycache__/talkNetModel.cpython-37.pyc differ diff --git a/model/__pycache__/transformer.cpython-36.pyc b/model/__pycache__/transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..692c8b1f538d5202e7b297df3c611775c944d2e5 Binary files /dev/null and b/model/__pycache__/transformer.cpython-36.pyc differ diff --git a/model/__pycache__/utils.cpython-36.pyc b/model/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66e2d2ad6fab6f4851e6eb2875ca6353334caa7b Binary files /dev/null and b/model/__pycache__/utils.cpython-36.pyc differ diff --git a/model/__pycache__/visualEncoder.cpython-37.pyc b/model/__pycache__/visualEncoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e82303ff2b2f0155a2cd1888a4ccb8c5faa308e Binary files /dev/null and b/model/__pycache__/visualEncoder.cpython-37.pyc differ diff --git a/model/attentionLayer.py b/model/attentionLayer.py new file mode 100755 index 0000000000000000000000000000000000000000..f4f1efd8da6dcccfd133aaddeb415ab5b38ab5d3 --- /dev/null +++ b/model/attentionLayer.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.nn import MultiheadAttention + + +class attentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.1): + super(attentionLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + self.linear1 = nn.Linear(d_model, d_model * 4) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_model * 4, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = F.relu + + def forward(self, src, tar, adjust=False, attn_mask=None): + # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor + src = src.transpose(0, 1) # B, T, C -> T, B, C + tar = tar.transpose(0, 1) # B, T, C -> T, B, C + if adjust: + src2 = self.self_attn(src, tar, tar, attn_mask=None, key_padding_mask=None)[0] + else: + src2 = self.self_attn(tar, src, src, attn_mask=None, key_padding_mask=None)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + src = src.transpose(0, 1) # T, B, C -> B, T, C + return src diff --git a/model/audioEncoder.py b/model/audioEncoder.py new 
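`attentionLayer` above is a cross-attention block over `(B, T, C)` tensors (transposed to `(T, B, C)` internally for `nn.MultiheadAttention`); by default the second argument supplies the queries while the first supplies the keys, values and the residual path. A quick shape check with dummy audio/visual embeddings of equal length (the import path assumes the repository root is on `PYTHONPATH`):

```
import torch
from model.attentionLayer import attentionLayer   # file defined above

layer = attentionLayer(d_model=128, nhead=8)
audio = torch.randn(4, 100, 128)    # (B, T, C)
visual = torch.randn(4, 100, 128)

out = layer(audio, visual)          # queries from `visual`; keys/values and residual from `audio`
print(out.shape)                    # torch.Size([4, 100, 128])
```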
file mode 100755 index 0000000000000000000000000000000000000000..6aaaf66b29d9453662bd20a918ebff35229f2966 --- /dev/null +++ b/model/audioEncoder.py @@ -0,0 +1,108 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + +class audioEncoder(nn.Module): + def __init__(self, layers, num_filters, **kwargs): + super(audioEncoder, self).__init__() + block = SEBasicBlock + self.inplanes = num_filters[0] + + self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=7, stride=(2, 1), padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, num_filters[0], layers[0]) + self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) + out_dim = num_filters[3] * block.expansion + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = torch.mean(x, dim=2, keepdim=True) + x = x.view((x.size()[0], x.size()[1], -1)) + x = x.transpose(1, 2) + + return x \ No newline at end of file diff --git a/model/convLayer.py b/model/convLayer.py new file mode 100755 index 0000000000000000000000000000000000000000..827d83e61e208bc0acfb9bb587ec4f07e1d1104b --- /dev/null +++ 
b/model/convLayer.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class ConvLayer(nn.Module): + + def __init__(self, cfg): + super(ConvLayer, self).__init__() + self.cfg = cfg + self.s = cfg.MODEL.NUM_SPEAKERS + self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3)) + # below line is speaker parallel 93.88 code + # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3)) + self.ln = torch.nn.LayerNorm(256) + self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0)) + self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0)) + self.gelu = nn.GELU() + + def forward(self, x, b, s): + + identity = x # b*s, t, c + t = x.shape[1] + c = x.shape[2] + out = x.view(b, s, t, c) + out = out.permute(0, 3, 1, 2) # b, c, s, t + + out = self.conv2d(out) # b, s*c, 1, t + out = out.view(b, c, s, t) + out = out.permute(0, 2, 3, 1) # b, s, t, c + out = self.ln(out) + out = out.permute(0, 3, 1, 2) + out = self.conv2d_1x1(out) + out = self.gelu(out) + out = self.conv2d_1x1_2(out) # b, c, s, t + + out = out.permute(0, 2, 3, 1) # b, s, t, c + out = out.view(b * s, t, c) + + out += identity + + return out, b, s diff --git a/model/faceDetector/README.md b/model/faceDetector/README.md new file mode 100755 index 0000000000000000000000000000000000000000..f5a8d4feb007f86f8c60075d8538f9ee5e93b325 --- /dev/null +++ b/model/faceDetector/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. diff --git a/model/faceDetector/__init__.py b/model/faceDetector/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..059d49bf0b8e8a17f641984e7d889e5b008257b9 --- /dev/null +++ b/model/faceDetector/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/model/faceDetector/s3fd/__init__.py b/model/faceDetector/s3fd/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..943292ad2afae6ba9eebd03b6f9bb684a7de5ca5 --- /dev/null +++ b/model/faceDetector/s3fd/__init__.py @@ -0,0 +1,66 @@ +import time, os, sys, subprocess +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth' +if os.path.isfile(PATH_WEIGHT) == False: + Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt" + cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT) + subprocess.call(cmd, shell=True, stdout=None) +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + # print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + PATH = os.path.join(os.getcwd(), PATH_WEIGHT) + state_dict = torch.load(PATH, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + 
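Returning to `model/convLayer.py` above: `ConvLayer` unflattens `(B*S, T, C)` features back to a `(B, C, S, T)` grid, applies a `Conv2d` whose kernel spans all `S` speaker tracks at once (its `S*C` output channels are reshaped back onto the speaker axis), and finishes with LayerNorm, a 1x1 feed-forward (256 to 512 to 256) and a residual connection, so the output keeps the flattened `(B*S, T, C)` shape. A shape check with a minimal stand-in config (the real code passes the project's full config object):

```
import torch
from types import SimpleNamespace
from model.convLayer import ConvLayer   # file defined above

cfg = SimpleNamespace(MODEL=SimpleNamespace(NUM_SPEAKERS=3))   # minimal stand-in config
layer = ConvLayer(cfg)

B, S, T, C = 2, 3, 100, 256
x = torch.randn(B * S, T, C)            # flattened per-speaker features
out, _, _ = layer(x, B, S)
print(out.shape)                        # torch.Size([6, 100, 256]): same shape, speakers mixed
```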
scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/model/faceDetector/s3fd/box_utils.py b/model/faceDetector/s3fd/box_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..0779bcd58062510a2979f5673f14189c4c817e92 --- /dev/null +++ b/model/faceDetector/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. 
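+ Also returns count, the number of valid entries in keep.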
+ """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + 
featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/model/faceDetector/s3fd/nets.py b/model/faceDetector/s3fd/nets.py new file mode 100755 index 0000000000000000000000000000000000000000..85b5c82c142f02cef75c1e03557b2a1a748c32b0 --- /dev/null +++ b/model/faceDetector/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] 
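+ # collect multi-scale feature maps in sources, then predict box offsets (loc) and class confidences (conf) from each of them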
+ sources = list() + loc = list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/model/loconet_encoder.py b/model/loconet_encoder.py new file mode 100755 index 0000000000000000000000000000000000000000..5437007285b1e9ea495478e95a5697e9a54ca799 --- /dev/null +++ b/model/loconet_encoder.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn + +# from model.visualEncoder import visualFrontend, visualTCN, visualConv1D +from model.attentionLayer import attentionLayer +from model.convLayer import ConvLayer +from torchvggish import vggish +from model.visualEncoder import visualFrontend, visualConv1D, visualTCN + + +class locoencoder(nn.Module): + + def __init__(self, cfg): + super(locoencoder, self).__init__() + self.cfg = cfg + # Visual Temporal Encoder + self.visualFrontend = visualFrontend(cfg) # Visual Frontend + self.visualTCN = visualTCN() # Visual Temporal Network TCN + self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d + + urls = { + 'vggish': + "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" + } + self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False) + self.audio_pool = nn.AdaptiveAvgPool1d(1) + + # Audio-visual Cross Attention + self.crossA2V = attentionLayer(d_model=128, nhead=8) + self.crossV2A = attentionLayer(d_model=128, nhead=8) + + # Audio-visual Self Attention + + num_layers = self.cfg.MODEL.AV_layers + layers = nn.ModuleList() + for i in range(num_layers): + layers.append(ConvLayer(cfg)) + layers.append(attentionLayer(d_model=256, nhead=8)) + self.convAV = layers + + def forward_visual_frontend(self, x): + + B, T, W, H = x.shape + x = x.view(B * T, 1, 1, W, H) + x = (x / 255 - 0.4161) / 0.1688 + x = self.visualFrontend(x) + x = x.view(B, T, 512) + x = x.transpose(1, 2) + x = self.visualTCN(x) + x = self.visualConv1D(x) + x = x.transpose(1, 2) + return x + + def forward_audio_frontend(self, x): + t = x.shape[-2] + numFrames = t // 4 + pad = 
8 - (t % 8) + x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant") + # x = x.unsqueeze(1).transpose(2, 3) + x = self.audioEncoder(x) + + b, c, t2, freq = x.shape + x = x.view(b * c, t2, freq) + x = self.audio_pool(x) + x = x.view(b, c, t2)[:, :, :numFrames] + x = x.permute(0, 2, 1) + return x + + def forward_cross_attention(self, x1, x2): + x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.MODEL.ADJUST_ATTENTION) + x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.MODEL.ADJUST_ATTENTION) + return x1_c, x2_c + + def forward_audio_visual_backend(self, x1, x2, b=1, s=1): + x = torch.cat((x1, x2), 2) # B*S, T, 2C + for i, layer in enumerate(self.convAV): + if i % 2 == 0: + x, b, s = layer(x, b, s) + else: + x = layer(src=x, tar=x) + + x = torch.reshape(x, (-1, 256)) + return x + + def forward_audio_backend(self, x): + x = torch.reshape(x, (-1, 128)) + return x + + def forward_visual_backend(self, x): + x = torch.reshape(x, (-1, 128)) + return x diff --git a/model/transformer/__pycache__/position_encoding.cpython-37.pyc b/model/transformer/__pycache__/position_encoding.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d3bb438aaecc9285935804e22e24a85ef38e9d4 Binary files /dev/null and b/model/transformer/__pycache__/position_encoding.cpython-37.pyc differ diff --git a/model/transformer/__pycache__/transformer.cpython-37.pyc b/model/transformer/__pycache__/transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb3498e405b0429759bc61a8dc23b2ecd7d7b13c Binary files /dev/null and b/model/transformer/__pycache__/transformer.cpython-37.pyc differ diff --git a/model/transformer/__pycache__/utils.cpython-37.pyc b/model/transformer/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aff02afd5a033a716de5a956fac03810ec6fb80d Binary files /dev/null and b/model/transformer/__pycache__/utils.cpython-37.pyc differ diff --git a/model/transformer/position_encoding.py b/model/transformer/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..07b8ad68c878ab5a8c0e4a4de95e0bdcdc68fcfe --- /dev/null +++ b/model/transformer/position_encoding.py @@ -0,0 +1,28 @@ +########################################################################## +# We adopt the positional encoding method from PyTorch Turorial. 
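+# PE(pos, 2i) = sin(pos / 10000^(2i / d_model)), PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))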
+# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html +########################################################################## +import math + +import torch +import torch.nn as nn + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x, padding=0): + x = x + self.pe[padding:padding + x.shape[0], :] + return self.dropout(x) diff --git a/model/transformer/transformer.py b/model/transformer/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3faa13db57d78fa13a964b89582b4b02728d05 --- /dev/null +++ b/model/transformer/transformer.py @@ -0,0 +1,334 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DotProductAttention(nn.Module): + + def __init__(self, dropout=0.0): + super(DotProductAttention, self).__init__() + + self.dropout = dropout + + def forward(self, q, k, v, attn_mask=None): + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + + if attn_mask is not None: + attn_output_weights += attn_mask + + attn_output_weights = F.softmax(attn_output_weights, dim=-1) + attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training) + attn_output = torch.bmm(attn_output_weights, v) + return attn_output + + +class MultiheadAttention(nn.Module): + + def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, kdim=None, vdim=None): + super(MultiheadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + if self._qkv_same_embed_dim: + self.in_proj_weight = nn.Parameter(torch.empty(3 * embed_dim, embed_dim)) + else: + raise RuntimeError('Do not support q, k, v have different dimensions') + + if bias: + self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + + if self._qkv_same_embed_dim: + nn.init.xavier_uniform_(self.in_proj_weight) + + if self.in_proj_bias is not None: + nn.init.constant_(self.in_proj_bias, 0.) + nn.init.constant_(self.out_proj.bias, 0.) 
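+ # shared scaled dot-product attention core; the query is pre-scaled by head_dim ** -0.5 in forward()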
+ + self.dotproductattention = DotProductAttention(dropout) + + def forward(self, q, k, v, attn_mask=None, key_padding_mask=None): + tsz, bsz, embed_dim = q.shape[0], q.shape[1], q.shape[2] + + head_dim = embed_dim // self.num_heads + assert head_dim * self.num_heads == embed_dim, \ + 'embed_dim must be divisible by num_heads' + scaling = float(head_dim)**-0.5 + + _b = self.in_proj_bias + _start = None + _end = embed_dim + _w = self.in_proj_weight[:_end, :] + if _b is not None: + _b = _b[:_end] + q = F.linear(q, _w, _b) + + _b = self.in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = self.in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = F.linear(k, _w, _b) + + _b = self.in_proj_bias + _start = embed_dim * 2 + _end = None + _w = self.in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = F.linear(v, _w, _b) + + q = q * scaling + + q = q.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0).repeat(bsz, 1, 1) + attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1) + attn_mask = attn_mask.reshape(-1, *attn_mask.shape[2:]) + + if key_padding_mask is not None: + key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, tsz, 1) + key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1) + key_padding_mask = key_padding_mask.reshape(-1, *key_padding_mask.shape[2:]) + + if attn_mask is not None and key_padding_mask is not None: + mask = attn_mask + key_padding_mask + elif attn_mask is not None: + mask = attn_mask + elif key_padding_mask is not None: + mask = key_padding_mask + else: + mask = None + + attn_output = self.dotproductattention(q, k, v, mask) + attn_output = attn_output.transpose(0, 1).contiguous().view(tsz, bsz, self.embed_dim) + return self.out_proj(attn_output), None + + +class Transformer(nn.Module): + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + custom_encoder=None, + custom_decoder=None): + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation) + encoder_norm = nn.LayerNorm(d_model) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self.d_model = d_model + self.nhead = nhead + + def forward(self, + src, + tgt, + src_mask=None, + tgt_mask=None, + memory_mask=None, + src_key_padding_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + if src.size(1) != tgt.size(1): + raise RuntimeError('the batch number of src and tgt must be equal') + + if src.size(2) != self.d_model or tgt.size(2) != self.d_model: + raise RuntimeError('the feature number of src and tgt must be equal to d_model') + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask) + output = self.decoder(tgt, + memory, + tgt_mask=tgt_mask, + 
memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + return output + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + output = src + + for mod in self.layers: + output = mod(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + output = tgt + + for mod in self.layers: + output = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'): + super(TransformerEncoderLayer, self).__init__() + + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerEncoderLayer, self).__setstate__(state) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + src2 = self.self_attn(src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'): + super(TransformerDecoderLayer, self).__init__() + + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, + tgt, + memory, + 
tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + tgt2 = self.self_attn(tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == 'relu': + return F.relu + elif activation == 'gelu': + return F.gelu + + raise RuntimeError('activation should be relu/gelu, not {}'.format(activation)) diff --git a/model/transformer/utils.py b/model/transformer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..17a5463e62ba77f80f5a89e8a338f2658187f3f8 --- /dev/null +++ b/model/transformer/utils.py @@ -0,0 +1,22 @@ +import torch +assert torch.__version__ >= '1.6.0' +import torch.nn as nn +import numpy as np + + +def layer_norm(d_model, condition=True): + return nn.LayerNorm(d_model) if condition else None + + +def generate_square_subsequent_mask(sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + +def generate_proposal_mask(T, B): + mask = torch.zeros(T, (T + 1) * T // 2) + for sz, idx in zip(range(1, T + 1), np.cumsum(range(T))): + mask[:sz, idx: idx + sz] = torch.fliplr(torch.tril(torch.ones(sz, sz))) + mask = mask.unsqueeze(1).repeat(1, B, 1) + return mask diff --git a/model/visualEncoder.py b/model/visualEncoder.py new file mode 100755 index 0000000000000000000000000000000000000000..4b66c06b6b8b3b192fb9d86435dd178fc7150e18 --- /dev/null +++ b/model/visualEncoder.py @@ -0,0 +1,199 @@ +## +# ResNet18 Pretrained network to extract lip embedding +# This code is modified based on https://github.com/lordmartian/deep_avsr +## + +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.attentionLayer import attentionLayer + + +class ResNetLayer(nn.Module): + """ + A ResNet layer used to build the ResNet network. 
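+ Each layer stacks two residual blocks; the first block downsamples the input with a strided 1x1 convolution when stride != 1.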
+ Architecture: + --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu --> + | | | | + -----> downsample ------> -------------------------------------> + """ + + def __init__(self, inplanes, outplanes, stride): + super(ResNetLayer, self).__init__() + self.conv1a = nn.Conv2d(inplanes, + outplanes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2a = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.stride = stride + if self.stride != 1: + self.downsample = nn.Conv2d(inplanes, + outplanes, + kernel_size=(1, 1), + stride=stride, + bias=False) + self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + + self.conv1b = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2b = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + return + + def forward(self, inputBatch): + batch = F.relu(self.bn1a(self.conv1a(inputBatch))) + batch = self.conv2a(batch) + if self.stride == 1: + residualBatch = inputBatch + else: + residualBatch = self.downsample(inputBatch) + batch = batch + residualBatch + intermediateBatch = batch + batch = F.relu(self.outbna(batch)) + + batch = F.relu(self.bn1b(self.conv1b(batch))) + batch = self.conv2b(batch) + residualBatch = intermediateBatch + batch = batch + residualBatch + outputBatch = F.relu(self.outbnb(batch)) + return outputBatch + + +class ResNet(nn.Module): + """ + An 18-layer ResNet architecture. + """ + + def __init__(self): + super(ResNet, self).__init__() + self.layer1 = ResNetLayer(64, 64, stride=1) + self.layer2 = ResNetLayer(64, 128, stride=2) + self.layer3 = ResNetLayer(128, 256, stride=2) + self.layer4 = ResNetLayer(256, 512, stride=2) + self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1)) + + return + + def forward(self, inputBatch): + batch = self.layer1(inputBatch) + batch = self.layer2(batch) + batch = self.layer3(batch) + batch = self.layer4(batch) + outputBatch = self.avgpool(batch) + return outputBatch + + +class GlobalLayerNorm(nn.Module): + + def __init__(self, channel_size): + super(GlobalLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) #[M, 1, 1] + var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) + gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta + return gLN_y + + +class visualFrontend(nn.Module): + """ + A visual feature extraction module. Generates a 512-dim feature vector per video frame. + Architecture: A 3D convolution block followed by an 18-layer ResNet. 
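+ The 3D convolution aggregates short-term lip motion over a 5-frame window; the ResNet then encodes each frame into a 512-dim vector.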
+ """ + + def __init__(self, cfg): + self.cfg = cfg + super(visualFrontend, self).__init__() + self.frontend3D = nn.Sequential( + nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), + bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(), + nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) + self.resnet = ResNet() + return + + def forward(self, inputBatch): + inputBatch = inputBatch.transpose(0, 1).transpose(1, 2) + batchsize = inputBatch.shape[0] + batch = self.frontend3D(inputBatch) + + batch = batch.transpose(1, 2) + batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3], + batch.shape[4]) + outputBatch = self.resnet(batch) + outputBatch = outputBatch.reshape(batchsize, -1, 512) + outputBatch = outputBatch.transpose(1, 2) + outputBatch = outputBatch.transpose(1, 2).transpose(0, 1) + return outputBatch + + +class DSConv1d(nn.Module): + + def __init__(self): + super(DSConv1d, self).__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.BatchNorm1d(512), + nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False), + nn.PReLU(), + GlobalLayerNorm(512), + nn.Conv1d(512, 512, 1, bias=False), + ) + + def forward(self, x): + out = self.net(x) + return out + x + + +class visualTCN(nn.Module): + + def __init__(self): + super(visualTCN, self).__init__() + stacks = [] + for x in range(5): + stacks += [DSConv1d()] + self.net = nn.Sequential(*stacks) # Visual Temporal Network V-TCN + + def forward(self, x): + out = self.net(x) + return out + + +class visualConv1D(nn.Module): + + def __init__(self): + super(visualConv1D, self).__init__() + self.net = nn.Sequential( + nn.Conv1d(512, 256, 5, stride=1, padding=2), + nn.BatchNorm1d(256), + nn.ReLU(), + nn.Conv1d(256, 128, 1), + ) + + def forward(self, x): + out = self.net(x) + return out diff --git a/scripts/.ipynb_checkpoints/test-checkpoint.sh b/scripts/.ipynb_checkpoints/test-checkpoint.sh new file mode 100755 index 0000000000000000000000000000000000000000..764356c2b36ca9e8cac15fca010498a0aab16382 --- /dev/null +++ b/scripts/.ipynb_checkpoints/test-checkpoint.sh @@ -0,0 +1,40 @@ +# #expid: 1.a +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0,1,2]" \ +# TEST.RESUME 1.a-v1e3_n1e2_012 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 85 + +# #expid: 1.b +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# TEST.RESUME 1.b-v1e3_n1e2_0 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 63 + + +# #expid: 1.c +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ +# TEST.RESUME 1.c-v1e3_n1e2_1 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 65 + +#expid: 1.d +python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ + MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ + MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ + MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[2]" \ + TEST.RESUME 1.d-v1e3_n1e2_2 \ + TEST.DATASET "unseen" \ + TEST.MODEL "seen" \ + TEST.EPOCH 41 \ No 
newline at end of file diff --git a/scripts/.ipynb_checkpoints/train-checkpoint.sh b/scripts/.ipynb_checkpoints/train-checkpoint.sh new file mode 100755 index 0000000000000000000000000000000000000000..40e10d5e5272974b47f792b813c617b46e96fea1 --- /dev/null +++ b/scripts/.ipynb_checkpoints/train-checkpoint.sh @@ -0,0 +1,117 @@ +# #expid: 0.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.a + +# #expid: 0.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.b + +# #expid: 0.c # change from utils deterministic to original method in coattention +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.c + +# #expid: 0.d # use deterministic 123 as coattention +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.d + + +# 1 compare adding noise at different layers +# #expid: 1.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0,1,2]" \ +# SESSION 1.a-v1e3_n1e2_012 + +# #expid: 1.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 1.b-v1e3_n1e2_0 + +# #expid: 1.c +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ +# SESSION 1.c-v1e3_n1e2_1 + +# #expid: 1.d +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[2]" \ +# SESSION 1.d-v1e3_n1e2_2 + + +# #expid: 2.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# TRAIN.SAMPLE 1 \ +# SESSION 2.a-v1e3_n1e2_0_0.5 + +# #expid: 3.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.a-v1e2_n1e2_0 + +# #expid: 3.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-4 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.b-v1e4_n1e2_0 + +# #expid: 3.c +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-5 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.c-v1e5_n1e2_0 + + +# # try different noise type +# #expid: 4.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# MODEL.SELFATTENTION.NOISE_TYPE "blurry" \ +# SESSION 4.a_v1e3_n1e2_0_blurry + + +# #expid: 4.b # to be run +# python 
-W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# MODEL.SELFATTENTION.NOISE_TYPE "adaptive" \ +# SESSION 4.b_v1e3_n1e2_0_adaptive + + +# #expid: 5.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 5.a-v1e3_n1e3_0 + +# #expid: 5.b # zekrom +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-4 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 5.b-v1e3_n1e4_0 + +#expid: 6.a +python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ + MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-4 \ + MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ + MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ + SESSION 6.a-v1e4_n1e2_1 + \ No newline at end of file diff --git a/scripts/.ipynb_checkpoints/train_i3d_epic.sh b/scripts/.ipynb_checkpoints/train_i3d_epic.sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/.ipynb_checkpoints/try-checkpoint.ipynb b/scripts/.ipynb_checkpoints/try-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2450729121f75e804d0ae2efdde5ffa7f05b88be --- /dev/null +++ b/scripts/.ipynb_checkpoints/try-checkpoint.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import numbers\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "\n", + "class GaussianSmoothing(nn.Module):\n", + " \"\"\"\n", + " Apply gaussian smoothing on a\n", + " 1d, 2d or 3d tensor. Filtering is performed seperately for each channel\n", + " in the input using a depthwise convolution.\n", + " Arguments:\n", + " channels (int, sequence): Number of channels of the input tensors. 
Output will\n", + " have this number of channels as well.\n", + " kernel_size (int, sequence): Size of the gaussian kernel.\n", + " sigma (float, sequence): Standard deviation of the gaussian kernel.\n", + " dim (int, optional): The number of dimensions of the data.\n", + " Default value is 2 (spatial).\n", + " \"\"\"\n", + " def __init__(self, channels, kernel_size, sigma, dim=2):\n", + " super(GaussianSmoothing, self).__init__()\n", + " if isinstance(kernel_size, numbers.Number):\n", + " kernel_size = [kernel_size] * dim\n", + " if isinstance(sigma, numbers.Number):\n", + " sigma = [sigma] * dim\n", + "\n", + " # The gaussian kernel is the product of the\n", + " # gaussian function of each dimension.\n", + " kernel = 1\n", + " meshgrids = torch.meshgrid(\n", + " [\n", + " torch.arange(size, dtype=torch.float32)\n", + " for size in kernel_size\n", + " ]\n", + " )\n", + " for size, std, mgrid in zip(kernel_size, sigma, meshgrids):\n", + " mean = (size - 1) / 2\n", + " kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \\\n", + " torch.exp(-((mgrid - mean) / std) ** 2 / 2)\n", + "\n", + " # Make sure sum of values in gaussian kernel equals 1.\n", + " kernel = kernel / torch.sum(kernel)\n", + "\n", + " # Reshape to depthwise convolutional weight\n", + " kernel = kernel.view(1, 1, *kernel.size())\n", + " kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))\n", + "\n", + " self.register_buffer('weight', kernel)\n", + " self.groups = channels\n", + "\n", + " if dim == 1:\n", + " self.conv = F.conv1d\n", + " elif dim == 2:\n", + " self.conv = F.conv2d\n", + " elif dim == 3:\n", + " self.conv = F.conv3d\n", + " else:\n", + " raise RuntimeError(\n", + " 'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)\n", + " )\n", + "\n", + " def forward(self, input):\n", + " \"\"\"\n", + " Apply gaussian filter to input.\n", + " Arguments:\n", + " input (torch.Tensor): Input to apply gaussian filter on.\n", + " Returns:\n", + " filtered (torch.Tensor): Filtered output.\n", + " \"\"\"\n", + " return self.conv(input, weight=self.weight, groups=self.groups)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "smoothing = GaussianSmoothing(1024, 5, 1, dim=1)\n", + "input = torch.rand(4, 16, 100, 64)\n", + "b = input.shape[0]\n", + "numhead = input.shape[1]\n", + "t = input.shape[2]\n", + "c = input.shape[3]\n", + "input = input.permute(0,1,3,2)\n", + "input = input.reshape(b, numhead*c, t)\n", + "input = F.pad(input, (2, 2), mode='reflect')\n", + "output = smoothing(input)\n", + "output = output.reshape(b, numhead, c, t)\n", + "output = output.permute(0,1,3,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([4, 16, 100, 64])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input\n", + "output.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input = torch.rand(4, 16, 100, 64)\n", + "attention = torch.normal(0,1 size = input.shape)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + 
}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/get_incorrect_samples.py b/scripts/get_incorrect_samples.py new file mode 100755 index 0000000000000000000000000000000000000000..cfe943f3b8fb06724c5693a6515ff75e8b4ad506 --- /dev/null +++ b/scripts/get_incorrect_samples.py @@ -0,0 +1,88 @@ +r"""Compute active speaker detection performance for the AVA dataset. +Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. + """ + parser = argparse.ArgumentParser() + parser.add_argument("-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument("-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument("-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def run_evaluation(groundtruth, predictions): + prediction = pd.read_csv(predictions) + groundtruth = pd.read_csv(groundtruth) + wrong_list = [] + num = 0 + audible_num = 0 + total = 0 + for i, row in prediction.iterrows(): + entity_id = row['entity_id'] + ts = row['frame_timestamp'] + if row['score'] < 0.5: + label = "NOT_SPEAKING" + else: + label = "SPEAKING_AUDIBLE" + + true_label = groundtruth.loc[(groundtruth['entity_id'] == entity_id) & + (groundtruth['frame_timestamp'] == ts)].iloc[0]["label"] + if true_label != label: + wrong_list.append([entity_id, ts, true_label, label]) + + if label == "SPEAKING_AUDIBLE": + num += 1 + if true_label == "SPEAKING_AUDIBLE": + audible_num += 1 + total += 1 + print(num, audible_num, total) + + df = pd.DataFrame(wrong_list, columns=['entity_id', 'frame_timestamp', "gt", "prediction"]) + df = df.sort_values(by=["frame_timestamp"]) + df.to_csv("wrong_list.csv") + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + + +if __name__ == "__main__": + main() diff --git a/test_multicard.py b/test_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..915594ac586b6b0f468f3aa65c4803b74cf96d56 --- /dev/null +++ b/test_multicard.py @@ -0,0 +1,99 @@ +import time, os, torch, argparse, warnings, glob, pandas, json + +from utils.tools import * +from dlhammer import bootstrap + +from dataLoader_multiperson import val_loader +from loconet import loconet + + +class DataPrep(): + + def __init__(self, cfg): + self.cfg = cfg + + def val_dataloader(self): + cfg = self.cfg + loader = val_loader(cfg, trialFileName = cfg.evalTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , cfg.evalDataType), \ + visualPath = os.path.join(cfg.visualPathAVA, cfg.evalDataType), \ + num_speakers=cfg.MODEL.NUM_SPEAKERS, + ) + valLoader 
= torch.utils.data.DataLoader(loader, + batch_size=cfg.VAL.BATCH_SIZE, + shuffle=False, + num_workers=16) + return valLoader + + +def prepare_context_files(cfg): + path = os.path.join(cfg.DATA.dataPathAVA, "csv") + for phase in ["val", "test"]: + csv_f = f"{phase}_loader.csv" + csv_orig = f"{phase}_orig.csv" + entity_f = os.path.join(path, phase + "_entity.json") + ts_f = os.path.join(path, phase + "_ts.json") + if os.path.exists(entity_f) and os.path.exists(ts_f): + continue + orig_df = pandas.read_csv(os.path.join(path, csv_orig)) + entity_data = {} + ts_to_entity = {} + + for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + + with open(entity_f) as f: + json.dump(entity_data, f) + + with open(ts_f) as f: + json.dump(ts_to_entity, f) + + +def main(): + cfg = bootstrap(print_cfg=False) + print(cfg) + epoch = cfg.RESUME_EPOCH + + warnings.filterwarnings("ignore") + + cfg = init_args(cfg) + + data = DataPrep(cfg) + + prepare_context_files(cfg) + + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + s = loconet(cfg) + + s.loadParameters(cfg.RESUME_PATH) + mAP = s.evaluate_network(epoch=epoch, loader=data.val_dataloader()) + print(f"evaluate ckpt: {cfg.RESUME_PATH}") + print(mAP) + + +if __name__ == '__main__': + main() diff --git a/torchvggish/__pycache__/mel_features.cpython-37.pyc b/torchvggish/__pycache__/mel_features.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20371fe601d853220baa1f2457b4b722ffdb09ff Binary files /dev/null and b/torchvggish/__pycache__/mel_features.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish.cpython-37.pyc b/torchvggish/__pycache__/vggish.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b6c6ffdbf569e1ed2323b0bf5b8f3dc2c2670e9 Binary files /dev/null and b/torchvggish/__pycache__/vggish.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish_input.cpython-37.pyc b/torchvggish/__pycache__/vggish_input.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..534c983ab69de151597a6f245c0e6003ad6a4cbe Binary files /dev/null and b/torchvggish/__pycache__/vggish_input.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish_params.cpython-37.pyc b/torchvggish/__pycache__/vggish_params.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe9d8921017e615b1b823c2b15c751c4d8d2423f Binary files /dev/null and b/torchvggish/__pycache__/vggish_params.cpython-37.pyc differ diff --git a/torchvggish/mel_features.py b/torchvggish/mel_features.py new file mode 100644 index 0000000000000000000000000000000000000000..ac58fb5427f772fcced9cbd3cec3373ffbe5908c --- /dev/null +++ b/torchvggish/mel_features.py @@ -0,0 +1,223 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Defines routines to compute mel spectrogram features from audio waveform.""" + +import numpy as np + + +def frame(data, window_length, hop_length): + """Convert array into a sequence of successive possibly overlapping frames. + + An n-dimensional array of shape (num_samples, ...) is converted into an + (n+1)-D array of shape (num_frames, window_length, ...), where each frame + starts hop_length points after the preceding one. + + This is accomplished using stride_tricks, so the original data is not + copied. However, there is no zero-padding, so any incomplete frames at the + end are not included. + + Args: + data: np.array of dimension N >= 1. + window_length: Number of samples in each frame. + hop_length: Advance (in samples) between each window. + + Returns: + (N+1)-D np.array with as many rows as there are complete frames that can be + extracted. + """ + num_samples = data.shape[0] + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + shape = (num_frames, window_length) + data.shape[1:] + strides = (data.strides[0] * hop_length,) + data.strides + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """Calculate a "periodic" Hann window. + + The classic Hann window is defined as a raised cosine that starts and + ends on zero, and where every value appears twice, except the middle + point for an odd-length window. Matlab calls this a "symmetric" window + and np.hanning() returns it. However, for Fourier analysis, this + actually represents just over one cycle of a period N-1 cosine, and + thus is not compactly expressed on a length-N Fourier basis. Instead, + it's better to use a raised cosine that ends just before the final + zero value - i.e. a complete cycle of a period-N cosine. Matlab + calls this a "periodic" window. This routine calculates it. + + Args: + window_length: The number of points in the returned window. + + Returns: + A 1D np.array containing the periodic hann window. + """ + return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * + np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, + hop_length=None, + window_length=None): + """Calculate the short-time Fourier transform magnitude. + + Args: + signal: 1D np.array of the input time-domain signal. + fft_length: Size of the FFT to apply. + hop_length: Advance (in samples) between each frame passed to FFT. + window_length: Length of each block of samples to pass to FFT. + + Returns: + 2D np.array where each row contains the magnitudes of the fft_length/2+1 + unique values of the FFT for the corresponding frame of input samples. + """ + frames = frame(signal, window_length, hop_length) + # Apply frame window to each frame. We use a periodic Hann (cosine of period + # window_length) instead of the symmetric Hann of np.hanning (period + # window_length-1). 
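+ # periodic Hann window: w[n] = 0.5 - 0.5 * cos(2 * pi * n / N), n = 0 .. N-1 (period N rather than N-1)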
+ window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +# Mel spectrum constants and functions. +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """Convert frequencies to mel scale using HTK formula. + + Args: + frequencies_hertz: Scalar or np.array of frequencies in hertz. + + Returns: + Object of same size as frequencies_hertz containing corresponding values + on the mel scale. + """ + return _MEL_HIGH_FREQUENCY_Q * np.log( + 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """Return a matrix that can post-multiply spectrogram rows to make mel. + + Returns a np.array matrix A that can be used to post-multiply a matrix S of + spectrogram values (STFT magnitudes) arranged as frames x bins to generate a + "mel spectrogram" M of frames x num_mel_bins. M = S A. + + The classic HTK algorithm exploits the complementarity of adjacent mel bands + to multiply each FFT bin by only one mel weight, then add it, with positive + and negative signs, to the two adjacent mel bands to which that bin + contributes. Here, by expressing this operation as a matrix multiply, we go + from num_fft multiplies per frame (plus around 2*num_fft adds) to around + num_fft^2 multiplies and adds. However, because these are all presumably + accomplished in a single call to np.dot(), it's not clear which approach is + faster in Python. The matrix multiplication has the attraction of being more + general and flexible, and much easier to read. + + Args: + num_mel_bins: How many bands in the resulting mel spectrum. This is + the number of columns in the output matrix. + num_spectrogram_bins: How many bins there are in the source spectrogram + data, which is understood to be fft_size/2 + 1, i.e. the spectrogram + only contains the nonredundant FFT bins. + audio_sample_rate: Samples per second of the audio at the input to the + spectrogram. We need this to figure out the actual frequencies for + each spectrogram bin, which dictates how they are mapped into mel. + lower_edge_hertz: Lower bound on the frequencies to be included in the mel + spectrum. This corresponds to the lower edge of the lowest triangular + band. + upper_edge_hertz: The desired top edge of the highest frequency band. + + Returns: + An np.array with shape (num_spectrogram_bins, num_mel_bins). + + Raises: + ValueError: if frequency edges are incorrectly ordered or out of range. + """ + nyquist_hertz = audio_sample_rate / 2. + if lower_edge_hertz < 0.0: + raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) + if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + if upper_edge_hertz > nyquist_hertz: + raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % + (upper_edge_hertz, nyquist_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + # The i'th mel band (starting from i=1) has center frequency + # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge + # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in + # the band_edges_mel arrays. 
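+ # HTK mel scale: mel(f) = 1127.0 * ln(1 + f / 700.0); band edges are spaced uniformly on the mel axis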
+ band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) + # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins + # of spectrogram values. + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + # Calculate lower and upper slopes for every spectrogram bin. + # Line segments are linear in the *mel* domain, not hertz. + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + # .. then intersect them with each other and zero. + mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, + upper_slope)) + # HTK excludes the spectrogram DC bin; make sure it always gets a zero + # coefficient. + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """Convert waveform to a log magnitude mel-frequency spectrogram. + + Args: + data: 1D np.array of waveform data. + audio_sample_rate: The sampling rate of data. + log_offset: Add this to values when taking log to avoid -Infs. + window_length_secs: Duration of each window to analyze. + hop_length_secs: Advance between successive analysis windows. + **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. + + Returns: + 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank + magnitudes for successive frames. + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + spectrogram = stft_magnitude( + data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( + num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, **kwargs)) + return np.log(mel_spectrogram + log_offset) diff --git a/torchvggish/vggish.py b/torchvggish/vggish.py new file mode 100644 index 0000000000000000000000000000000000000000..f612fc261d8810665da934cef925bb261b69ee36 --- /dev/null +++ b/torchvggish/vggish.py @@ -0,0 +1,205 @@ +import numpy as np +import torch +import torch.nn as nn +from torch import hub + +from . 
import vggish_input, vggish_params + + +class VGG(nn.Module): + + def __init__(self, features): + super(VGG, self).__init__() + self.features = features + # self.embeddings = nn.Sequential( + # nn.Linear(512 * 4 * 6, 4096), + # nn.ReLU(True), + # nn.Linear(4096, 4096), + # nn.ReLU(True), + # nn.Linear(4096, 128), + # nn.ReLU(True)) + self.deconv = nn.ConvTranspose2d(512, 256, (2, 2), stride=(2, 2)) + self.conv1 = nn.Conv2d(512, 256, 1, stride=1) + self.conv2 = nn.Conv2d(256, 128, 1, stride=1) + # self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + # x = self.features(x) + for i, layer in enumerate(self.features): + x = layer(x) + if i == 9: + output4 = x + elif i == 14: + output8 = x + output8 = self.deconv(output8) + cat48 = torch.cat((output4, output8), 1) + output4 = self.conv1(cat48) + output4 = self.conv2(output4) + # res = self.pool(output4) + + # Transpose the output from features to + # remain compatible with vggish embeddings + # x = torch.transpose(x, 1, 3) + # x = torch.transpose(x, 1, 2) + # x = x.contiguous() + # x = x.view(x.size(0), -1) + + # return self.embeddings(x) + return output4 + + +class Postprocessor(nn.Module): + """Post-processes VGGish embeddings. Returns a torch.Tensor instead of a + numpy array in order to preserve the gradient. + + "The initial release of AudioSet included 128-D VGGish embeddings for each + segment of AudioSet. These released embeddings were produced by applying + a PCA transformation (technically, a whitening transform is included as well) + and 8-bit quantization to the raw embedding output from VGGish, in order to + stay compatible with the YouTube-8M project which provides visual embeddings + in the same format for a large set of YouTube videos. This class implements + the same PCA (with whitening) and quantization transformations." + """ + + def __init__(self): + """Constructs a postprocessor.""" + super(Postprocessor, self).__init__() + # Create empty matrix, for user's state_dict to load + self.pca_eigen_vectors = torch.empty( + ( + vggish_params.EMBEDDING_SIZE, + vggish_params.EMBEDDING_SIZE, + ), + dtype=torch.float, + ) + self.pca_means = torch.empty((vggish_params.EMBEDDING_SIZE, 1), dtype=torch.float) + + self.pca_eigen_vectors = nn.Parameter(self.pca_eigen_vectors, requires_grad=False) + self.pca_means = nn.Parameter(self.pca_means, requires_grad=False) + + def postprocess(self, embeddings_batch): + """Applies tensor postprocessing to a batch of embeddings. + + Args: + embeddings_batch: An tensor of shape [batch_size, embedding_size] + containing output from the embedding layer of VGGish. + + Returns: + A tensor of the same shape as the input, containing the PCA-transformed, + quantized, and clipped version of the input. + """ + assert len( + embeddings_batch.shape) == 2, "Expected 2-d batch, got %r" % (embeddings_batch.shape,) + assert (embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE + ), "Bad batch shape: %r" % (embeddings_batch.shape,) + + # Apply PCA. + # - Embeddings come in as [batch_size, embedding_size]. + # - Transpose to [embedding_size, batch_size]. + # - Subtract pca_means column vector from each column. + # - Premultiply by PCA matrix of shape [output_dims, input_dims] + # where both are are equal to embedding_size in our case. + # - Transpose result back to [batch_size, embedding_size]. 
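+        # Concretely: pca_eigen_vectors is (EMBEDDING_SIZE, EMBEDDING_SIZE) =
+        # (128, 128), pca_means is (128, 1) and embeddings_batch.t() is
+        # (128, batch_size), so the subtraction broadcasts the means over the
+        # batch and the trailing .t() restores the (batch_size, 128) layout.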
+ pca_applied = torch.mm(self.pca_eigen_vectors, (embeddings_batch.t() - self.pca_means)).t() + + # Quantize by: + # - clipping to [min, max] range + clipped_embeddings = torch.clamp(pca_applied, vggish_params.QUANTIZE_MIN_VAL, + vggish_params.QUANTIZE_MAX_VAL) + # - convert to 8-bit in range [0.0, 255.0] + quantized_embeddings = torch.round( + (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * + (255.0 / (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) + return torch.squeeze(quantized_embeddings) + + def forward(self, x): + return self.postprocess(x) + + +def make_layers(): + layers = [] + in_channels = 1 + for v in [64, "M", 128, "M", 256, 256, "M", 512, 512]: + if v == "M": + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return nn.Sequential(*layers) + + +def _vgg(): + return VGG(make_layers()) + + +# def _spectrogram(): +# config = dict( +# sr=16000, +# n_fft=400, +# n_mels=64, +# hop_length=160, +# window="hann", +# center=False, +# pad_mode="reflect", +# htk=True, +# fmin=125, +# fmax=7500, +# output_format='Magnitude', +# # device=device, +# ) +# return Spectrogram.MelSpectrogram(**config) + + +class VGGish(VGG): + + def __init__(self, + urls, + device=None, + pretrained=True, + preprocess=True, + postprocess=True, + progress=True): + super().__init__(make_layers()) + if pretrained: + state_dict = hub.load_state_dict_from_url(urls['vggish'], progress=progress) + info = super().load_state_dict(state_dict, strict=False) + + if device is None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = device + self.preprocess = preprocess + self.postprocess = postprocess + if self.postprocess: + self.pproc = Postprocessor() + if pretrained: + state_dict = hub.load_state_dict_from_url(urls['pca'], progress=progress) + # TODO: Convert the state_dict to torch + state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME] = torch.as_tensor( + state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME], dtype=torch.float) + state_dict[vggish_params.PCA_MEANS_NAME] = torch.as_tensor( + state_dict[vggish_params.PCA_MEANS_NAME].reshape(-1, 1), dtype=torch.float) + + self.pproc.load_state_dict(state_dict) + self.to(self.device) + + def forward(self, x, fs=None): + if self.preprocess: + x = self._preprocess(x, fs) + x = x.to(self.device) + x = VGG.forward(self, x) + if self.postprocess: + x = self._postprocess(x) + return x + + def _preprocess(self, x, fs): + if isinstance(x, np.ndarray): + x = vggish_input.waveform_to_examples(x, fs) + elif isinstance(x, str): + x = vggish_input.wavfile_to_examples(x) + else: + raise AttributeError + return x + + def _postprocess(self, x): + return self.pproc(x) diff --git a/torchvggish/vggish_input.py b/torchvggish/vggish_input.py new file mode 100644 index 0000000000000000000000000000000000000000..351be446f588f0e03d12dc454dcad9a05b8eef69 --- /dev/null +++ b/torchvggish/vggish_input.py @@ -0,0 +1,101 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Compute input examples for VGGish from audio waveform.""" + +# Modification: Return torch tensors rather than numpy arrays +import torch + +import numpy as np +import resampy + +from . import mel_features +from . import vggish_params + +import soundfile as sf + + +def waveform_to_examples(data, sample_rate, numFrames, fps, return_tensor=True): + """Converts audio waveform into an array of examples for VGGish. + + Args: + data: np.array of either one dimension (mono) or two dimensions + (multi-channel, with the outer dimension representing channels). + Each sample is generally expected to lie in the range [-1.0, +1.0], + although this is not required. + sample_rate: Sample rate of data. + return_tensor: Return data as a Pytorch tensor ready for VGGish + + Returns: + 3-D np.array of shape [num_examples, num_frames, num_bands] which represents + a sequence of examples, each of which contains a patch of log mel + spectrogram, covering num_frames frames of audio and num_bands mel frequency + bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. + + """ + # Convert to mono. + if len(data.shape) > 1: + data = np.mean(data, axis=1) + # Resample to the rate assumed by VGGish. + if sample_rate != vggish_params.SAMPLE_RATE: + data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) + window_length_seconds = vggish_params.STFT_WINDOW_LENGTH_SECONDS * 25. / fps + hop_length_seconds = vggish_params.STFT_HOP_LENGTH_SECONDS * 25. / fps + + # Compute log mel spectrogram features. + log_mel = mel_features.log_mel_spectrogram(data, + audio_sample_rate=vggish_params.SAMPLE_RATE, + log_offset=vggish_params.LOG_OFFSET, + window_length_secs=window_length_seconds, + hop_length_secs=hop_length_seconds, + num_mel_bins=vggish_params.NUM_MEL_BINS, + lower_edge_hertz=vggish_params.MEL_MIN_HZ, + upper_edge_hertz=vggish_params.MEL_MAX_HZ) + + maxAudio = int(numFrames * 4) + if log_mel.shape[0] < maxAudio: + shortage = maxAudio - log_mel.shape[0] + log_mel = np.pad(log_mel, ((0, shortage), (0, 0)), 'wrap') + log_mel = log_mel[:int(round(numFrames * 4)), :] + + # Frame features into examples. + # features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS + # example_window_length = int(round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) + # example_hop_length = int(round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) + # log_mel_examples = mel_features.frame(log_mel, + # window_length=example_window_length, + # hop_length=example_hop_length) + + if return_tensor: + log_mel_examples = torch.tensor(log_mel_examples, requires_grad=True)[:, None, :, :].float() + + # return log_mel_examples + return log_mel + + +def wavfile_to_examples(wav_file, return_tensor=True): + """Convenience wrapper around waveform_to_examples() for a common WAV format. + + Args: + wav_file: String path to a file, or a file-like object. The file + is assumed to contain WAV audio data with signed 16-bit PCM samples. 
+ torch: Return data as a Pytorch tensor ready for VGGish + + Returns: + See waveform_to_examples. + """ + wav_data, sr = sf.read(wav_file, dtype='int16') + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] + return waveform_to_examples(samples, sr, return_tensor) diff --git a/torchvggish/vggish_params.py b/torchvggish/vggish_params.py new file mode 100644 index 0000000000000000000000000000000000000000..526784bceaa4c9c8b8dc2b8f82e0f3d395d4bec2 --- /dev/null +++ b/torchvggish/vggish_params.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Global parameters for the VGGish model. + +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.025 +STFT_HOP_LENGTH_SECONDS = 0.010 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. +EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. + +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. 
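+# The graph node names below come from the original TensorFlow VGGish
+# release; the PyTorch port in this folder does not appear to reference them,
+# and they seem to be kept only for reference/compatibility.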
+INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/train.py b/train.py new file mode 100755 index 0000000000000000000000000000000000000000..969d9cfb02f63d889f65030159f0edb8ffd424b1 --- /dev/null +++ b/train.py @@ -0,0 +1,197 @@ +import time, os, torch, argparse, warnings, glob, pandas, json + +from utils.tools import * +from dlhammer import bootstrap +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +import torch.multiprocessing as mp +import torch.distributed as dist + +from xxlib.utils.distributed import all_gather, all_reduce +from torch import nn +from dataLoader_multiperson import train_loader, val_loader + +from loconet import loconet + + +class MyCollator(object): + + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = self.cfg.MODEL.CLIP_LENGTH + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat( + [i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
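+        # Resulting shapes (assuming the loader's leading per-item dimension
+        # indexes the candidate speakers, per MODEL.NUM_SPEAKERS):
+        #   padded_audio:  (batch, speakers, T_a, mel bins) with T_a <= 4 * cut_limit
+        #   padded_video:  (batch, speakers, T_v, H, W)     with T_v <= cut_limit
+        #   padded_labels / padded_masks: (batch, speakers, T_v)
+        # Audio keeps four mel frames per video frame (hence the factor 4),
+        # matching the 10 ms hop of the audio front end against 25 fps video.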
+ + return padded_audio, padded_video, padded_labels, padded_masks + + +class DataPrep(): + + def __init__(self, cfg, world_size, rank): + self.cfg = cfg + self.world_size = world_size + self.rank = rank + + def train_dataloader(self): + + loader = train_loader(self.cfg, trialFileName = self.cfg.trainTrialAVA, \ + audioPath = os.path.join(self.cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(self.cfg.visualPathAVA, 'train'), \ + num_speakers=self.cfg.MODEL.NUM_SPEAKERS, + ) + train_sampler = torch.utils.data.distributed.DistributedSampler( + loader, num_replicas=self.world_size, rank=self.rank) + collator = MyCollator(self.cfg) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=self.cfg.TRAIN.BATCH_SIZE, + pin_memory=False, + num_workers=self.cfg.NUM_WORKERS, + collate_fn=collator, + sampler=train_sampler) + return trainLoader + + def val_dataloader(self): + loader = val_loader(self.cfg, trialFileName = self.cfg.evalTrialAVA, \ + audioPath = os.path.join(self.cfg + .audioPathAVA , self.cfg + .evalDataType), \ + visualPath = os.path.join(self.cfg + .visualPathAVA, self.cfg + .evalDataType), \ + num_speakers = self.cfg.MODEL.NUM_SPEAKERS + ) + valLoader = torch.utils.data.DataLoader(loader, + batch_size=self.cfg.VAL.BATCH_SIZE, + shuffle=False, + pin_memory=True, + num_workers=16) + + return valLoader + + +def prepare_context_files(cfg): + path = os.path.join(cfg.DATA.dataPathAVA, "csv") + for phase in ["train", "val", "test"]: + csv_f = f"{phase}_loader.csv" + csv_orig = f"{phase}_orig.csv" + entity_f = os.path.join(path, phase + "_entity.json") + ts_f = os.path.join(path, phase + "_ts.json") + if os.path.exists(entity_f) and os.path.exists(ts_f): + continue + orig_df = pandas.read_csv(os.path.join(path, csv_orig)) + entity_data = {} + ts_to_entity = {} + + for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + + with open(entity_f) as f: + json.dump(entity_data, f) + + with open(ts_f) as f: + json.dump(ts_to_entity, f) + + +def main(gpu, world_size): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + cfg = bootstrap(print_cfg=False) + rank = gpu + dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) + + make_deterministic(seed=int(cfg.SEED)) + torch.cuda.set_device(gpu) + device = torch.device("cuda:{}".format(gpu)) + + warnings.filterwarnings("ignore") + + cfg = init_args(cfg) + + data = DataPrep(cfg, world_size, rank) + + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + prepare_context_files(cfg) + + modelfiles = glob.glob('%s/model_0*.model' % cfg.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!" 
% modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = loconet(cfg, rank, device) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = loconet(cfg, rank, device) + + while (1): + loss, lr = s.train_network(epoch=epoch, loader=data.train_dataloader()) + + s.saveParameters(cfg.modelSavePath + "/model_%04d.model" % epoch) + + if epoch >= cfg.TRAIN.MAX_EPOCH: + quit() + + epoch += 1 + + +if __name__ == '__main__': + + cfg = bootstrap() + world_size = cfg.NUM_GPUS # + os.environ['MASTER_ADDR'] = '127.0.0.1' # + os.environ['MASTER_PORT'] = str(random.randint(4000, 8888)) # + mp.spawn(main, nprocs=cfg.NUM_GPUS, args=(world_size,)) diff --git a/utils/.DS_Store b/utils/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..926c61bce65c727c84bd22125b080e66142dcbdc Binary files /dev/null and b/utils/.DS_Store differ diff --git a/utils/.ipynb_checkpoints/utils-checkpoint.py b/utils/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..24503319d88ebaa2d8eac5460a5549801ff11ce6 --- /dev/null +++ b/utils/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,135 @@ +import time +import os +import sys +import json +import random +import numpy as np +import torch + +def setup_device(gpu_id): + #set up GPUS + os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" + if int(gpu_id)==-2 and os.getenv('CUDA_VISIBLE_DEVICES') is not None: + gpu_id = os.getenv('CUDA_VISIBLE_DEVICES') + elif int(gpu_id) >= 0: + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + print("set CUDA_VISIBLE_DEVICES=",gpu_id) + else: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print("using device %s"%device) + return device + +def setup_seed(seed): + if seed < 0: + if os.getenv('SATOSHI_SEED') is not None and seed == -2: + seed = int(os.getenv('SATOSHI_SEED')) + print("env seed used") + else: + import math + seed = int(10**4*math.modf(time.time())[0]) + seed = seed + print("random seed",seed) + return seed + +def setup_savedir(prefix="",basedir="./experiments",args=None,append_args=[]): + savedir = prefix + if len(append_args) > 0 and args is not None: + for arg_opt in append_args: + arg_value = getattr(args, arg_opt) + savedir +="_"+arg_opt+"-"+str(arg_value) + else: + savedir += "exp" + + savedir = savedir.replace(" ","").replace("'","").replace('"','') + savedir = os.path.join(basedir,savedir) + + #if exists, append _num-[num] + i = 1 + savedir_ori = savedir + while True: + try: + os.makedirs(savedir) + break + except FileExistsError as e: + savedir = savedir_ori+"_num-%d"%i + i+=1 + + print("made the log directory",savedir) + return savedir + +def save_args(savedir,args,name="args.json"): + #save args as "args.json" in the savedir + path = os.path.join(savedir,name) + with open(path, 'w') as f: + json.dump( vars(args), f, sort_keys=True, indent=4) + print("args saved as %s"%path) + +def save_json(dict,path): + with open(path, 'w') as f: + json.dump( dict, f, sort_keys=True, indent=4) + print("log saved at %s"%path) + +def resume_model(model,resume,state_dict_key = "model"): + ''' + model:pytorch model + resume: path to the resume file + state_dict_key: dict key + ''' + print("resuming trained weights from %s"%resume) + + checkpoint = torch.load(resume,map_location='cpu') + if state_dict_key is not None: + pretrained_dict = checkpoint[state_dict_key] + else: + pretrained_dict = checkpoint + + try: + model.load_state_dict(pretrained_dict) + except 
RuntimeError as e: + print(e) + print("can't load the all weights due to error above, trying to load part of them!") + model_dict = model.state_dict() + # 1. filter out unnecessary keys + pretrained_dict_use = {} + pretrained_dict_ignored = {} + for k, v in pretrained_dict.items(): + if k in model_dict: + pretrained_dict_use[k] = v + else: + pretrained_dict_ignored[k] = v + pretrained_dict =pretrained_dict_use + # 2. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + # 3. load the new state dict + model.load_state_dict(model_dict) + print("resumed only",pretrained_dict.keys()) + print("ignored:",pretrained_dict_ignored.keys()) + + return model + +def save_checkpoint(path,model,key="model"): + #save model state dict + checkpoint = {} + checkpoint[key] = model.state_dict() + torch.save(checkpoint, path) + print("checkpoint saved at",path) + + +def make_deterministic(seed,strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print("strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") + + + diff --git a/utils/__pycache__/distributed.cpython-37.pyc b/utils/__pycache__/distributed.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9eac8ce3fb9f5976a24888ecd8ae0d8b0d65fb3 Binary files /dev/null and b/utils/__pycache__/distributed.cpython-37.pyc differ diff --git a/utils/__pycache__/distributed.cpython-38.pyc b/utils/__pycache__/distributed.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08cd33261bf05cbc0bdb2b83c4a55b2dfc6fb3fb Binary files /dev/null and b/utils/__pycache__/distributed.cpython-38.pyc differ diff --git a/utils/__pycache__/model_utils.cpython-38.pyc b/utils/__pycache__/model_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96695a8c7479efef2ca5dc665590b64c98c15e9d Binary files /dev/null and b/utils/__pycache__/model_utils.cpython-38.pyc differ diff --git a/utils/__pycache__/tools.cpython-37.pyc b/utils/__pycache__/tools.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf5aad58d4435ffde1967a30d5eb0c185284b109 Binary files /dev/null and b/utils/__pycache__/tools.cpython-37.pyc differ diff --git a/utils/__pycache__/utils.cpython-38.pyc b/utils/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59ab27101441a0cf7cbf40d353a9fcfae0a5547e Binary files /dev/null and b/utils/__pycache__/utils.cpython-38.pyc differ diff --git a/utils/distributed.py b/utils/distributed.py new file mode 100755 index 0000000000000000000000000000000000000000..b70e16c4c153ffddb78b70d728f39a8a62786641 --- /dev/null +++ b/utils/distributed.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +"""Distributed helpers.""" + +import functools +import logging +import pickle +import torch +import torch.distributed as dist + +_LOCAL_PROCESS_GROUP = None + + +def all_gather(tensors): + """ + All gathers the provided tensors from all processes across machines. 
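+    Each returned tensor is the concatenation, along dim 0, of that tensor's
+    copies from every process, so its leading dimension grows by a factor of
+    the world size.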
+ Args: + tensors (list): tensors to perform all gather across all processes in + all machines. + """ + + gather_list = [] + output_tensor = [] + world_size = dist.get_world_size() + for tensor in tensors: + tensor_placeholder = [ + torch.ones_like(tensor) for _ in range(world_size) + ] + dist.all_gather(tensor_placeholder, tensor, async_op=False) + gather_list.append(tensor_placeholder) + for gathered_tensor in gather_list: + output_tensor.append(torch.cat(gathered_tensor, dim=0)) + return output_tensor + + +def all_reduce(tensors, average=True): + """ + All reduce the provided tensors from all processes across machines. + Args: + tensors (list): tensors to perform all reduce across all processes in + all machines. + average (bool): scales the reduced tensor by the number of overall + processes across all machines. + """ + + for tensor in tensors: + dist.all_reduce(tensor, async_op=False) + if average: + world_size = dist.get_world_size() + for tensor in tensors: + tensor.mul_(1.0 / world_size) + return tensors + + +def init_process_group( + local_rank, + local_world_size, + shard_id, + num_shards, + init_method, + dist_backend="nccl", +): + """ + Initializes the default process group. + Args: + local_rank (int): the rank on the current local machine. + local_world_size (int): the world size (number of processes running) on + the current local machine. + shard_id (int): the shard index (machine rank) of the current machine. + num_shards (int): number of shards for distributed training. + init_method (string): supporting three different methods for + initializing process groups: + "file": use shared file system to initialize the groups across + different processes. + "tcp": use tcp address to initialize the groups across different + dist_backend (string): backend to use for distributed training. Options + includes gloo, mpi and nccl, the details can be found here: + https://pytorch.org/docs/stable/distributed.html + """ + # Sets the GPU to use. + torch.cuda.set_device(local_rank) + # Initialize the process group. + proc_rank = local_rank + shard_id * local_world_size + world_size = local_world_size * num_shards + dist.init_process_group( + backend=dist_backend, + init_method=init_method, + world_size=world_size, + rank=proc_rank, + ) + + +def is_master_proc(num_gpus=8): + """ + Determines if the current process is the master process. + """ + if torch.distributed.is_initialized(): + return dist.get_rank() % num_gpus == 0 + else: + return True + + +def is_root_proc(): + """ + Determines if the current process is the root process. + """ + if torch.distributed.is_initialized(): + return dist.get_rank() == 0 + else: + return True + + +def get_world_size(): + """ + Get the size of the world. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + """ + Get the rank of the current process. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + Returns: + (group): pytorch dist group. 
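+    Note:
+        A gloo group is used so that all_gather_unaligned() can exchange the
+        pickled byte tensors on the CPU rather than moving them to the GPU
+        for the default NCCL backend.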
+ """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + """ + Seriialize the tensor to ByteTensor. Note that only `gloo` and `nccl` + backend is supported. + Args: + data (data): data to be serialized. + group (group): pytorch dist group. + Returns: + tensor (ByteTensor): tensor that serialized. + """ + + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024**3: + logger = logging.getLogger(__name__) + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}". + format(get_rank(), + len(buffer) / (1024**3), device)) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Padding all the tensors from different GPUs to the largest ones. + Args: + tensor (tensor): tensor to pad. + group (group): pytorch dist group. + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], + dtype=torch.int64, + device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros((max_size - local_size,), + dtype=torch.uint8, + device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather_unaligned(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def init_distributed_training(cfg): + """ + Initialize variables needed for distributed training. 
+ """ + if cfg.NUM_GPUS <= 1: + return + num_gpus_per_machine = cfg.NUM_GPUS + num_machines = dist.get_world_size() // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list( + range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) + pg = dist.new_group(ranks_on_i) + if i == cfg.SHARD_ID: + global _LOCAL_PROCESS_GROUP + _LOCAL_PROCESS_GROUP = pg + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + assert _LOCAL_PROCESS_GROUP is not None + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) diff --git a/utils/get_ava_active_speaker_performance.py b/utils/get_ava_active_speaker_performance.py new file mode 100755 index 0000000000000000000000000000000000000000..2e66d1da9b2a06234b2f7afc6f1cecc81b0cf931 --- /dev/null +++ b/utils/get_ava_active_speaker_performance.py @@ -0,0 +1,236 @@ +r"""Compute active speaker detection performance for the AVA dataset. +Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + Precision is modified to ensure that it does not decrease as recall + decrease. + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + Raises: + ValueError: if the input is not of the correct format + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. + """ + if precision is None: + if recall is not None: + raise ValueError("If precision is None, recall must also be None") + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError("precision and recall must be numpy array") + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError("input must be float numpy array.") + if len(precision) != len(recall): + raise ValueError("precision and recall must be of the same size.") + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError("Precision must be in the range of [0, 1].") + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError("recall must be in the range of [0, 1].") + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError("recall must be a non-decreasing array") + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Smooth precision to be monotonically decreasing. 
+ for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def load_csv(filename, column_names): + """Loads CSV from the filename using given column names. + Adds uid column. + Args: + filename: Path to the CSV file to load. + column_names: A list of column names for the data. + Returns: + df: A Pandas DataFrame containing the data. + """ + # Here and elsewhere, df indicates a DataFrame variable. + + df = pd.read_csv(filename, usecols=column_names) + #df = pd.read_csv(filename, header=None, names=column_names) + + # Creates a unique id from frame timestamp and entity id. + df["uid"] = (df["frame_timestamp"].map(str) + ":" + df["entity_id"]) + return df + + +def eq(a, b, tolerance=1e-09): + """Returns true if values are approximately equal.""" + return abs(a - b) <= tolerance + + +def merge_groundtruth_and_predictions(df_groundtruth, df_predictions): + """Merges groundtruth and prediction DataFrames. + The returned DataFrame is merged on uid field and sorted in descending order + by score field. Bounding boxes are checked to make sure they match between + groundtruth and predictions. + Args: + df_groundtruth: A DataFrame with groundtruth data. + df_predictions: A DataFrame with predictions data. + Returns: + df_merged: A merged DataFrame, with rows matched on uid column. + """ + if df_groundtruth["uid"].count() != df_predictions["uid"].count(): + raise ValueError( + "Groundtruth and predictions CSV must have the same number of " + "unique rows.") + # print(df_predictions["label"].unique()) + if df_predictions["label"].unique() != ["SPEAKING_AUDIBLE"]: + raise ValueError( + "Predictions CSV must contain only SPEAKING_AUDIBLE label.") + + if df_predictions["score"].count() < df_predictions["uid"].count(): + raise ValueError("Predictions CSV must contain score value for every row.") + + # Merges groundtruth and predictions on uid, validates that uid is unique + # in both frames, and sorts the resulting frame by the predictions score. + df_merged = df_groundtruth.merge( + df_predictions, + on="uid", + suffixes=("_groundtruth", "_prediction"), + validate="1:1").sort_values( + by=["score"], ascending=False).reset_index() + # Validates that bounding boxes in ground truth and predictions match for the + # same uids. 
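+  # eq() compares each of the four box coordinates element-wise with a 1e-9
+  # tolerance; any row whose groundtruth and predicted boxes disagree beyond
+  # that makes the check below raise a ValueError listing the offending uids.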
+ df_merged["bounding_box_correct"] = np.where( + eq(df_merged["entity_box_x1_groundtruth"], + df_merged["entity_box_x1_prediction"]) + & eq(df_merged["entity_box_x2_groundtruth"], + df_merged["entity_box_x2_prediction"]) + & eq(df_merged["entity_box_y1_groundtruth"], + df_merged["entity_box_y1_prediction"]) + & eq(df_merged["entity_box_y2_groundtruth"], + df_merged["entity_box_y2_prediction"]), True, False) + + if (~df_merged["bounding_box_correct"]).sum() > 0: + raise ValueError( + "Mismatch between groundtruth and predictions bounding boxes found at " + + str(list(df_merged[~df_merged["bounding_box_correct"]]["uid"]))) + + return df_merged + + +def get_all_positives(df_merged): + """Counts all positive examples in the groundtruth dataset.""" + return df_merged[df_merged["label_groundtruth"] == + "SPEAKING_AUDIBLE"]["uid"].count() + + +def calculate_precision_recall(df_merged): + """Calculates precision and recall arrays going through df_merged row-wise.""" + all_positives = get_all_positives(df_merged) + # Populates each row with 1 if this row is a true positive + # (at its score level). + df_merged["is_tp"] = np.where( + (df_merged["label_groundtruth"] == "SPEAKING_AUDIBLE") & + (df_merged["label_prediction"] == "SPEAKING_AUDIBLE"), 1, 0) + + # Counts true positives up to and including that row. + df_merged["tp"] = df_merged["is_tp"].cumsum() + + # Calculates precision for every row counting true positives up to + # and including that row over the index (1-based) of that row. + df_merged["precision"] = df_merged["tp"] / (df_merged.index + 1) + # Calculates recall for every row counting true positives up to + # and including that row over all positives in the groundtruth dataset. + + df_merged["recall"] = df_merged["tp"] / all_positives + logging.info( + "\n%s\n", + df_merged.head(10)[[ + "uid", "score", "label_groundtruth", "is_tp", "tp", "precision", + "recall" + ]]) + + return np.array(df_merged["precision"]), np.array(df_merged["recall"]) + + +def run_evaluation(groundtruth, predictions): + """Runs AVA Active Speaker evaluation, printing average precision result.""" + df_groundtruth = load_csv( + groundtruth, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id" + ]) + df_predictions = load_csv( + predictions, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id", "score" + ]) + df_merged = merge_groundtruth_and_predictions(df_groundtruth, df_predictions) + precision, recall = calculate_precision_recall(df_merged) + mAP = 100 * compute_average_precision(precision, recall) + print("average precision: %2.2f%%"%(mAP)) + return mAP + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + mAP = run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + return mAP + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/utils/get_multiperson_csv.py b/utils/get_multiperson_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6140b0b47576b47ed1a9273e6366275e9bfb30 --- /dev/null +++ b/utils/get_multiperson_csv.py @@ -0,0 +1,49 @@ +import os, pandas +import json + +phase = "test" +path = "/nfs/jolteon/data/ssd/xiziwang/AVA_dataset/csv" + +if phase == "train": + csv_f = "train_loader.csv" + csv_orig = "train_orig.csv" +elif phase == "val": + csv_f = "val_loader.csv" + csv_orig = "val_orig.csv" +else: + csv_f = "test_loader.csv" + csv_orig = "test_orig.csv" + +orig_df = pandas.read_csv(os.path.join(path, csv_orig)) +entity_data = {} +ts_to_entity = {} + +for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + +with open(os.path.join(path, phase + "_entity.json"), 'w') as f: + json.dump(entity_data, f) + +with open(os.path.join(path, phase + "_ts.json"), 'w') as f: + json.dump(ts_to_entity, f) diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1df761634de87bbe5f8d929d8a9768d0331a00d1 --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,31 @@ +def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + +def set_bn_non_trainable(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.weight.requires_grad = False + m.bias.requires_grad = False + + +def freeze_bn_statistics(model): + """freeze the statistic mean and variance in BN + Args: + model (nn.Module): The model to be freezed statistics. + """ + model.apply(set_bn_eval) + + +def freeze_bn_parameters(model): + """ + + Args: + model (nn.Module): The model to be freezed statistics. 
+ + Returns: TODO + + """ + model.apply(set_bn_non_trainable) diff --git a/utils/overall.png b/utils/overall.png new file mode 100755 index 0000000000000000000000000000000000000000..9a1dd5cca29bc80835ace3114fff7b5814a4b0f5 Binary files /dev/null and b/utils/overall.png differ diff --git a/utils/tools.py b/utils/tools.py new file mode 100755 index 0000000000000000000000000000000000000000..5fdfd51cfdc81c961bcfe873051069cb252570f1 --- /dev/null +++ b/utils/tools.py @@ -0,0 +1,217 @@ +import os, subprocess, glob, pandas, tqdm, cv2, numpy +from scipy.io import wavfile +import random +import torch +import numpy as np + + +def init_args(args): + # The details for the following folders/files can be found in the annotation of the function 'preprocess_AVA' below + args.modelSavePath = os.path.join(args.WORKSPACE, 'model') + args.scoreSavePath = os.path.join(args.WORKSPACE, 'score.txt') + args.trialPathAVA = os.path.join(args.DATA.dataPathAVA, 'csv') + args.audioOrigPathAVA = os.path.join(args.DATA.dataPathAVA, 'orig_audios') + args.visualOrigPathAVA = os.path.join(args.DATA.dataPathAVA, 'orig_videos') + args.audioPathAVA = os.path.join(args.DATA.dataPathAVA, 'clips_audios') + args.visualPathAVA = os.path.join(args.DATA.dataPathAVA, 'clips_videos') + args.trainTrialAVA = os.path.join(args.trialPathAVA, 'train_loader.csv') + + if args.evalDataType == 'val': + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'val_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'val_orig.csv') + args.evalCsvSave = os.path.join(args.WORKSPACE, 'val_res.csv') + else: + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'test_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'test_orig.csv') + args.evalCsvSave = os.path.join(args.WORKSPACE, 'test_res.csv') + + os.makedirs(args.modelSavePath, exist_ok=True) + os.makedirs(args.DATA.dataPathAVA, exist_ok=True) + return args + + +def make_deterministic(seed, strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. + # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + # torch.set_deterministic(True) + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print( + "strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") + + +def download_pretrain_model_AVA(): + if os.path.isfile('pretrain_AVA.model') == False: + Link = "1NVIkksrD3zbxbDuDbPc_846bLfPSZcZm" + cmd = "gdown --id %s -O %s" % (Link, 'pretrain_AVA.model') + subprocess.call(cmd, shell=True, stdout=None) + + +def preprocess_AVA(args): + # This preprocesstion is modified based on this [repository](https://github.com/fuankarion/active-speakers-context). + # The required space is 302 G. + # If you do not have enough space, you can delate `orig_videos`(167G) when you get `clips_videos(85G)`. + # also you can delate `orig_audios`(44G) when you get `clips_audios`(6.4G). + # So the final space is less than 100G. 
+ # The AVA dataset will be saved in 'AVApath' folder like the following format: + # ``` + # ├── clips_audios (The audio clips cut from the original movies) + # │   ├── test + # │   ├── train + # │   └── val + # ├── clips_videos (The face clips cut from the original movies, be save in the image format, frame-by-frame) + # │   ├── test + # │   ├── train + # │   └── val + # ├── csv + # │   ├── test_file_list.txt (name of the test videos) + # │   ├── test_loader.csv (The csv file we generated to load data for testing) + # │   ├── test_orig.csv (The combination of the given test csv files) + # │   ├── train_loader.csv (The csv file we generated to load data for training) + # │   ├── train_orig.csv (The combination of the given training csv files) + # │   ├── trainval_file_list.txt (name of the train/val videos) + # │   ├── val_loader.csv (The csv file we generated to load data for validation) + # │   └── val_orig.csv (The combination of the given validation csv files) + # ├── orig_audios (The original audios from the movies) + # │   ├── test + # │   └── trainval + # └── orig_videos (The original movies) + # ├── test + # └── trainval + # ``` + + download_csv(args) # Take 1 minute + download_videos(args) # Take 6 hours + extract_audio(args) # Take 1 hour + extract_audio_clips(args) # Take 3 minutes + extract_video_clips(args) # Take about 2 days + + +def download_csv(args): + # Take 1 minute to download the required csv files + Link = "1C1cGxPHaJAl1NQ2i7IhRgWmdvsPhBCUy" + cmd = "gdown --id %s -O %s" % (Link, args.dataPathAVA + '/csv.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s" % (args.dataPathAVA + '/csv.tar.gz', args.dataPathAVA) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.dataPathAVA + '/csv.tar.gz') + + +def download_videos(args): + # Take 6 hours to download the original movies, follow this repository: https://github.com/cvdfoundation/ava-dataset + for dataType in ['trainval', 'test']: + fileList = open('%s/%s_file_list.txt' % (args.trialPathAVA, dataType)).read().splitlines() + outFolder = '%s/%s' % (args.visualOrigPathAVA, dataType) + for fileName in fileList: + cmd = "wget -P %s https://s3.amazonaws.com/ava-dataset/%s/%s" % (outFolder, dataType, + fileName) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio(args): + # Take 1 hour to extract the audio from movies + for dataType in ['trainval', 'test']: + inpFolder = '%s/%s' % (args.visualOrigPathAVA, dataType) + outFolder = '%s/%s' % (args.audioOrigPathAVA, dataType) + os.makedirs(outFolder, exist_ok=True) + videos = glob.glob("%s/*" % (inpFolder)) + for videoPath in tqdm.tqdm(videos): + audioPath = '%s/%s' % (outFolder, videoPath.split('/')[-1].split('.')[0] + '.wav') + cmd = ( + "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads 8 %s -loglevel panic" + % (videoPath, audioPath)) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio_clips(args): + # Take 3 minutes to extract the audio clips + dic = {'train': 'trainval', 'val': 'trainval', 'test': 'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv' % (dataType)), + engine='python') + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 
'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + audioFeatures = {} + outDir = os.path.join(args.audioPathAVA, dataType) + audioDir = os.path.join(args.audioOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total=len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + start = insData.iloc[0]['frame_timestamp'] + end = insData.iloc[-1]['frame_timestamp'] + entityID = insData.iloc[0]['entity_id'] + insPath = os.path.join(outDir, videoKey, entityID + '.wav') + if videoKey not in audioFeatures.keys(): + audioFile = os.path.join(audioDir, videoKey + '.wav') + sr, audio = wavfile.read(audioFile) + audioFeatures[videoKey] = audio + audioStart = int(float(start) * sr) + audioEnd = int(float(end) * sr) + audioData = audioFeatures[videoKey][audioStart:audioEnd] + wavfile.write(insPath, sr, audioData) + + +def extract_video_clips(args): + # Take about 2 days to crop the face clips. + # You can optimize this code to save time, while this process is one-time. + # If you do not need the data for the test set, you can only deal with the train and val part. That will take 1 day. + # This procession may have many warning info, you can just ignore it. + dic = {'train': 'trainval', 'val': 'trainval', 'test': 'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv' % (dataType))) + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + outDir = os.path.join(args.visualPathAVA, dataType) + audioDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total=len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + entityID = insData.iloc[0]['entity_id'] + videoDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + videoFile = glob.glob(os.path.join(videoDir, '{}.*'.format(videoKey)))[0] + V = cv2.VideoCapture(videoFile) + insDir = os.path.join(os.path.join(outDir, videoKey, entityID)) + if not os.path.isdir(insDir): + os.makedirs(insDir) + j = 0 + for _, row in insData.iterrows(): + imageFilename = os.path.join(insDir, str("%.2f" % row['frame_timestamp']) + '.jpg') + V.set(cv2.CAP_PROP_POS_MSEC, row['frame_timestamp'] * 1e3) + _, frame = V.read() + h = numpy.size(frame, 0) + w = numpy.size(frame, 1) + x1 = int(row['entity_box_x1'] * w) + y1 = int(row['entity_box_y1'] * h) + x2 = int(row['entity_box_x2'] * w) + y2 = int(row['entity_box_y2'] * h) + face = frame[y1:y2, x1:x2, :] + j = j + 1 + cv2.imwrite(imageFilename, face) diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b5241777f01a9a4a716c84970b0044dd6fb94922 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,141 @@ +import time +import os +import sys +import json +import random +import numpy as np +import torch + + +def 
setup_device(gpu_id): + #set up GPUS + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + if int(gpu_id) == -2 and os.getenv('CUDA_VISIBLE_DEVICES') is not None: + gpu_id = os.getenv('CUDA_VISIBLE_DEVICES') + elif int(gpu_id) >= 0: + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + print("set CUDA_VISIBLE_DEVICES=", gpu_id) + else: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print("using device %s" % device) + return device + + +def setup_seed(seed): + if seed < 0: + if os.getenv('SATOSHI_SEED') is not None and seed == -2: + seed = int(os.getenv('SATOSHI_SEED')) + print("env seed used") + else: + import math + seed = int(10**4 * math.modf(time.time())[0]) + seed = seed + print("random seed", seed) + return seed + + +def setup_savedir(prefix="", basedir="./experiments", args=None, append_args=[]): + savedir = prefix + if len(append_args) > 0 and args is not None: + for arg_opt in append_args: + arg_value = getattr(args, arg_opt) + savedir += "_" + arg_opt + "-" + str(arg_value) + else: + savedir += "exp" + + savedir = savedir.replace(" ", "").replace("'", "").replace('"', '') + savedir = os.path.join(basedir, savedir) + + #if exists, append _num-[num] + i = 1 + savedir_ori = savedir + while True: + try: + os.makedirs(savedir) + break + except FileExistsError as e: + savedir = savedir_ori + "_num-%d" % i + i += 1 + + print("made the log directory", savedir) + return savedir + + +def save_args(savedir, args, name="args.json"): + #save args as "args.json" in the savedir + path = os.path.join(savedir, name) + with open(path, 'w') as f: + json.dump(vars(args), f, sort_keys=True, indent=4) + print("args saved as %s" % path) + + +def save_json(dict, path): + with open(path, 'w') as f: + json.dump(dict, f, sort_keys=True, indent=4) + print("log saved at %s" % path) + + +def resume_model(model, resume, state_dict_key="model"): + ''' + model:pytorch model + resume: path to the resume file + state_dict_key: dict key + ''' + print("resuming trained weights from %s" % resume) + + checkpoint = torch.load(resume, map_location='cpu') + if state_dict_key is not None: + pretrained_dict = checkpoint[state_dict_key] + else: + pretrained_dict = checkpoint + + try: + model.load_state_dict(pretrained_dict) + except RuntimeError as e: + print(e) + print("can't load the all weights due to error above, trying to load part of them!") + model_dict = model.state_dict() + # 1. filter out unnecessary keys + pretrained_dict_use = {} + pretrained_dict_ignored = {} + for k, v in pretrained_dict.items(): + if k in model_dict: + pretrained_dict_use[k] = v + else: + pretrained_dict_ignored[k] = v + pretrained_dict = pretrained_dict_use + # 2. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + # 3. load the new state dict + model.load_state_dict(model_dict) + print("resumed only", pretrained_dict.keys()) + print("ignored:", pretrained_dict_ignored.keys()) + + return model + + +def save_checkpoint(path, model, key="model"): + #save model state dict + checkpoint = {} + checkpoint[key] = model.state_dict() + torch.save(checkpoint, path) + print("checkpoint saved at", path) + + +def make_deterministic(seed, strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 
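+    # cudnn.deterministic (set just below) restricts cuDNN to deterministic
+    # kernels. cudnn.benchmark is not forced off here (note the commented-out
+    # line), so if autotuning is enabled elsewhere, bit-exact reproducibility
+    # is no longer guaranteed.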
+ # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + # torch.set_deterministic(True) + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print( + "strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") diff --git a/videoloaders/.DS_Store b/videoloaders/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f06db239bb6f916cf7f04d01a5bc394ab22bcea7 Binary files /dev/null and b/videoloaders/.DS_Store differ diff --git a/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py b/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..861504ec04cf1998403bea8ee067620216c0ca05 --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py @@ -0,0 +1,117 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +import torch + + +def _is_tensor_video_clip(clip): + if not torch.is_tensor(clip): + raise TypeError("clip should be Tesnor. Got %s" % type(clip)) + + if not clip.ndimension() == 4: + raise ValueError("clip should be 4D. Got %dD" % clip.dim()) + + return True + + +def crop(clip, i, j, h, w): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + """ + assert len(clip.size()) == 4, "clip should be a 4D tensor" + return clip[..., i:i + h, j:j + w] + + +def resize(clip, target_size, interpolation_mode): + assert len(target_size) == 2, "target size should be tuple (height, width)" + return torch.nn.functional.interpolate( + clip, size=target_size, mode=interpolation_mode + ) + + +def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): + """ + Do spatial cropping and resizing to the video clip + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + i (int): i in (i,j) i.e coordinates of the upper left corner. + j (int): j in (i,j) i.e coordinates of the upper left corner. + h (int): Height of the cropped region. + w (int): Width of the cropped region. + size (tuple(int, int)): height and width of resized clip + Returns: + clip (torch.tensor): Resized and cropped clip. 
Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + clip = crop(clip, i, j, h, w) + clip = resize(clip, size, interpolation_mode) + return clip + + +def center_crop(clip, crop_size): + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h >= th and w >= tw, "height and width must be no smaller than crop_size" + + i = int(round((h - th) / 2.0)) + j = int(round((w - tw) / 2.0)) + return crop(clip, i, j, th, tw) + +def corner_crop(clip, crop_size, i, j): + assert _is_tensor_video_clip(clip),"clip should be a 4d torch tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h>=th and w>=tw, "height and width must be no smaller than crop_size" + return crop(clip, i, j, th, tw) + + +def to_tensor(clip): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + """ + _is_tensor_video_clip(clip) + if not clip.dtype == torch.uint8: + raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) + return clip.float().permute(3, 0, 1, 2) / 255.0 + + +def normalize(clip, mean, std, inplace=False): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + mean (tuple): pixel RGB mean. Size is (3) + std (tuple): pixel standard deviation. Size is (3) + Returns: + normalized clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + if not inplace: + clip = clip.clone() + mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device) + std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device) + if clip.size(0) == 3: + clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) + elif clip.size(0) == 1: + #make it compatibale with depth image + mean = mean.mean() + std = std.mean() + clip.sub_(mean).div_(std) + else: + raise NotImplementedError() + return clip + + +def hflip(clip): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + Returns: + flipped clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + return clip.flip((-1)) diff --git a/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py b/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..14948b753f81ab5e6fd31b6de3d65b9c59e82348 --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py @@ -0,0 +1,162 @@ +import os +import random +import math + + +def temporal_batching_index(fr,length=16): + ''' + Do padding or half-overlapping clips for video. + + Input: + fr: number of frames + Output: + batch_indices: array for batch where each element is frame index + ''' + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + return [[0]*left + list(range(fr)) + [fr-1]*right] + + batch_indices = [] + last_idx = fr - 1 + assert length%2 == 0 + half = int(length/2) + for i in range(0,fr-half,half): + frame_indices = [0,]*length + for j in range(length): + current_idx = i + j + if current_idx < last_idx: + frame_indices[j] = current_idx + else: + frame_indices[j] = last_idx + batch_indices.append(frame_indices) + + return batch_indices + +def temporal_sliding_window(clip,window = 16): + ''' + Make a batched tensor with 16 frame sliding window with the overlap of 8. + If a clip is not the multiply of 8, it's padded with the last frames. (1,2...,13,14,14,14) for (1,..,14) + If a clip is less than 16 frames, padding is applied like (1,1,....,1,2,3,4,5,5,...,5,5) for (1,2,3,4,5) + This can be used for sliding window evaluation. + + Input: list of image paths + Output: torch tensor of shape of (batch,ch,16,h,w). + ''' + + batch_indices = temporal_batching_index(len(clip),length = window) + + return [[clip[idx] for idx in indices] for indices in batch_indices] + +def temporal_center_crop(clip,length = 16): + ''' + Input: list of image paths + Output: torch tensor of shape of (1,ch,16,h,w). + ''' + fr = len(clip) + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + middle = int(fr/2) + assert length%2 == 0 + half = int(length/2) + start = middle - half + output = clip[start : start+length] + + return output[::2] + + + +def random_temporal_crop(clip,length = 16): + ''' + Just randomly sample 16 consecutive frames + if less than 16 frames, just add padding. + ''' + fr = len(clip) + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + start=random.randint(0,fr-length) + output = clip[start : start+length] + return output[::2] + + +def use_all_frames(clip): + ''' + Just use it as it is :) + ''' + return clip + +def looppadding(clip, length=16): + + + out = clip + + for index in out: + if len(out) >= length: + break + out.append(index) + + return out[::2] + +def temporal_even_crop(clip, length=16, n_samples=1): + + clip = list(clip) + n_frames = len(clip) + indices = list(range(len(clip))) + stride = max( + 1, math.ceil((n_frames - 1 - length) / (n_samples - 1))) + + out = [] + for begin_index in indices[::stride]: + if len(out) >= n_samples: + break + end_index = min(indices[-1] + 1, begin_index + length) + sample = list(range(begin_index, end_index)) + + if len(sample) < length: + out.append([clip[i] for i in looppadding(sample, length=length)]) + # out.append(clip[looppadding(sample, length=length)]) + break + else: + out.append([clip[i] for i in sample[::2]]) + # out.append(clip[sample[::2]]) + + return out + + +class TemporalTransform(object): + def __init__(self,length,mode="center"): + self.mode = mode + self.length = length + #pass dummpy in order to catch incoored mode + self.__call__(range(128)) + + def __call__(self, clip): + if self.mode == "random": + return random_temporal_crop(clip,self.length) + elif self.mode == "center": + return temporal_center_crop(clip,self.length) + elif self.mode == "all" or self.mode == "nocrop": + #note that length cannot be satisfied! + return use_all_frames(clip) + elif self.mode == "slide": + #note that output has one more dimention + return temporal_sliding_window(clip,self.length) + elif self.mode == "even": + return temporal_even_crop(clip, self.length, n_samples=5) + else: + raise NotImplementedError("this option is not defined:",self.mode) \ No newline at end of file diff --git a/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py b/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..156cf463abf94932421cb2dd7d500fdcdb998bbe --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py @@ -0,0 +1,312 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +#!/usr/bin/env python3 + +import numbers +import random +import torch + +try: + import accimage +except: + pass + +from torchvision.transforms import ( + RandomResizedCrop, +) + +from . import functional_video as F + +def _get_image_size(img): + if isinstance(img, torch.Tensor) and img.dim() > 2: + return img.shape[-2:][::-1] + else: + raise TypeError("Unexpected type {}".format(type(img))) + +class RandomCrop(object): + """Crop the given PIL Image at a random location. + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None, i.e no padding. 
If a sequence of length + 4 is provided, it is used to pad left, top, right, bottom borders + respectively. If a sequence of length 2 is provided, it is used to + pad left/right, top/bottom borders, respectively. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill: Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. + - constant: pads with a constant value, this value is specified with fill + - edge: pads with the last value on the edge of the image + - reflect: pads with reflection of image (without repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + - symmetric: pads with reflection of image (repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img (PIL Image): Image to be cropped. + output_size (tuple): Expected output size of the crop. + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + w, h = _get_image_size(img) + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. + """ + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + # pad the width if needed + if self.pad_if_needed and img.size[0] < self.size[1]: + img = F.pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode) + # pad the height if needed + if self.pad_if_needed and img.size[1] < self.size[0]: + img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) + + i, j, h, w = self.get_params(img, self.size) + + return F.crop(img, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding) + + + + + +class RandomCropVideo(RandomCrop): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. 
+ size is (C, T, OH, OW) + """ + i, j, h, w = self.get_params(clip, self.size) + return F.crop(clip, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0})'.format(self.size) + + +class RandomResizedCropVideo(RandomResizedCrop): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): + if isinstance(size, tuple): + assert len(size) == 2, "size should be tuple (height, width)" + self.size = size + else: + self.size = (size, size) + + self.interpolation_mode = interpolation_mode + self.scale = scale + self.ratio = ratio + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, H, W) + """ + i, j, h, w = self.get_params(clip, self.scale, self.ratio) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) + + def __repr__(self): + return self.__class__.__name__ + \ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio + ) + + +class CenterCropVideo(object): + def __init__(self, crop_size): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + + return F.center_crop(clip, self.crop_size) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + +class CornerCropVideo(object): + def __init__(self, crop_size, loc="tr"): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + def __call__(self, clip, loc="tr"): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + if loc == "tr": + i = 0 + j = 0 + elif loc == "center": + return F.corner_crop(clip, self.crop_size) + else: + i = clip.size(-2) - self.crop_size + j = clip.size(-1) - self.crop_size + return F.corner_crop(clip, self.crop_size, i, j) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + + +class NormalizeVideo(object): + """ + Normalize the video clip by mean subtraction and division by standard deviation + Args: + mean (3-tuple): pixel RGB mean + std (3-tuple): pixel RGB standard deviation + inplace (boolean): whether do in-place normalization + """ + + def __init__(self, mean, std, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): video clip to be normalized. 
Size is (C, T, H, W) + """ + return F.normalize(clip, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1}, inplace={2})'.format( + self.mean, self.std, self.inplace) + + +class ToTensorVideo(object): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + """ + + def __init__(self): + pass + + def __call__(self, clip): + """ + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + return F.to_tensor(clip) + + def __repr__(self): + return self.__class__.__name__ + + +class RandomHorizontalFlipVideo(object): + """ + Flip the video clip along the horizonal direction with a given probability + Args: + p (float): probability of the clip being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + self.p = p + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, H, W) + """ + if random.random() < self.p: + clip = F.hflip(clip) + return clip + + def __repr__(self): + return self.__class__.__name__ + "(p={0})".format(self.p) + + + +class ResizeVideo(object): + """ + Resize the video clip + """ + def __init__(self, w,h): + self.w = w + self.h = h + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, h, w) + """ + #interpolare needs (T,C, H, W) order while clip is (C, T, H, W) + return torch.nn.functional.interpolate( + clip.permute(1,0,2,3),(self.h,self.w),mode="bilinear",align_corners=False).permute(1,0,2,3) + + def __repr__(self): + return self.__class__.__name__ + "(w=%d,h=%d)"%(self.w,self.h) \ No newline at end of file diff --git a/videoloaders/README.md b/videoloaders/README.md new file mode 100755 index 0000000000000000000000000000000000000000..d54e8f32beb7313158a7e79f45dc3b2c76799d63 --- /dev/null +++ b/videoloaders/README.md @@ -0,0 +1,15 @@ +# How to process video as data loader + +We assume that video is preprocessed in to image files in advance. Usually, we do not use all frames in a clip but sample a certain duration (e.g. 16 frames). The pipline we assume for each chunk is the following. + +- Get a list of images paths of clips e.g. ["./video/clip1/frame0.jpg",...,"./video/clip1/frame101.jpg"] +- Sample a certain duration we want to use e.g. ["./video/clip1/frame11.jpg",...,"./video/clip1/frame26.jpg"] +- Load each frames into a tensor shaped as (T, H, W, C). HW can be changed later. +- Use torchvision builtin utilities to crop, flip, etc. For example, + - ToTensorVideo() from (T, H, W, C) to (C, T, H, W)), from 0-255 to 0-1 (devide by 225), and from uint8 to float. + - CenterCropVideo + - RandomHorizontalFlipVideo + - NormalizeVideo with kinetics mean and std + -See more https://github.com/pytorch/vision/blob/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py + +Note that the first part is different from what official pytorch repository ( https://github.com/pytorch/vision/tree/master/references/video_classification ) does. We don't use VideoClip class. 
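
As a concrete illustration of the pipeline described above, here is a minimal sketch that samples 16 frames with `TemporalTransform` and then applies the spatial transforms from `transforms_video.py`. It assumes the repository root is on `PYTHONPATH` (so `videoloaders` imports as a package), replaces real frame decoding with a random uint8 tensor, and uses torchvision's published Kinetics mean/std; it is illustrative and not part of the original code.

```
import torch
from torchvision.transforms import Compose

from videoloaders import transforms_video as T
from videoloaders.transform_temporal import TemporalTransform

# Frame paths for one clip (dummy names; real code would decode these images).
paths = [f"./video/clip1/frame{i}.jpg" for i in range(101)]
paths = TemporalTransform(16, mode="center")(paths)   # 16-frame centre window, strided by 2 -> 8 frames

# Stand-in for the decoded frames: (T, H, W, C) uint8, as assumed by ToTensorVideo.
clip = torch.randint(0, 256, (len(paths), 128, 171, 3), dtype=torch.uint8)

spatial = Compose([
    T.ToTensorVideo(),                        # (T, H, W, C) uint8 -> (C, T, H, W) float in [0, 1]
    T.RandomHorizontalFlipVideo(p=0.5),
    T.CenterCropVideo(112),
    T.NormalizeVideo(mean=(0.43216, 0.394666, 0.37645),
                     std=(0.22803, 0.22145, 0.216989)),   # Kinetics statistics
])
print(spatial(clip).shape)                    # torch.Size([3, 8, 112, 112])
```

The "slide" mode of `TemporalTransform` returns several half-overlapping windows per clip instead of one, which is why the collate function further down concatenates along the batch dimension rather than stacking.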
\ No newline at end of file diff --git a/videoloaders/__pycache__/functional_video.cpython-36.pyc b/videoloaders/__pycache__/functional_video.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..f256a41ce9f8584dd65e0f86e7c5154e56bce323 Binary files /dev/null and b/videoloaders/__pycache__/functional_video.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/functional_video.cpython-38.pyc b/videoloaders/__pycache__/functional_video.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f3499fe8f1cd7e25cf41147062fd973f1d59dfe Binary files /dev/null and b/videoloaders/__pycache__/functional_video.cpython-38.pyc differ diff --git a/videoloaders/__pycache__/transform_temporal.cpython-36.pyc b/videoloaders/__pycache__/transform_temporal.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..8518f65c3f3446994991e5bb1d3b43d41df266d3 Binary files /dev/null and b/videoloaders/__pycache__/transform_temporal.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/transform_temporal.cpython-38.pyc b/videoloaders/__pycache__/transform_temporal.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93747c49628499d828f57f0572d1e6260e5c9e08 Binary files /dev/null and b/videoloaders/__pycache__/transform_temporal.cpython-38.pyc differ diff --git a/videoloaders/__pycache__/transforms_video.cpython-36.pyc b/videoloaders/__pycache__/transforms_video.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..bab5b115399399841ee2d17579ef61feb7e8773b Binary files /dev/null and b/videoloaders/__pycache__/transforms_video.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/transforms_video.cpython-38.pyc b/videoloaders/__pycache__/transforms_video.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31d0eaae05905c76c44e5d98198f6d79f501abef Binary files /dev/null and b/videoloaders/__pycache__/transforms_video.cpython-38.pyc differ diff --git a/videoloaders/collate_functions.py b/videoloaders/collate_functions.py new file mode 100755 index 0000000000000000000000000000000000000000..837239cf6ad0208c77ba518a1632966f3233b03e --- /dev/null +++ b/videoloaders/collate_functions.py @@ -0,0 +1,15 @@ +import torch +from torch.utils.data.dataloader import default_collate +def collate_video(batch): + ''' + Our video is (temporal_crops, C, T, H, W) where temporal_crops differes from clip to clip + We can't use standard collate function. + Instead of stacking, let's do cat + Keep in mind that this will also need list of frame length in order to restore each videos later. 
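+    Example (illustrative shapes):
+        batch = [{'input': torch.rand(2, 3, 16, 112, 112), 'label': 0},
+                 {'input': torch.rand(3, 3, 16, 112, 112), 'label': 1}]
+        out = collate_video(batch)
+        out['input'].shape  -> torch.Size([5, 3, 16, 112, 112])   (temporal crops concatenated)
+        out['label']        -> tensor([0, 1])                     (default_collate on the other keys)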
+ ''' + elem = batch[0] + assert isinstance(elem,dict) + output = {key: default_collate([d[key] for d in batch]) for key in elem if key!='input'} + output["input"] = torch.cat([d["input"] for d in batch]) + return output + \ No newline at end of file diff --git a/videoloaders/functional_video.py b/videoloaders/functional_video.py new file mode 100755 index 0000000000000000000000000000000000000000..861504ec04cf1998403bea8ee067620216c0ca05 --- /dev/null +++ b/videoloaders/functional_video.py @@ -0,0 +1,117 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +import torch + + +def _is_tensor_video_clip(clip): + if not torch.is_tensor(clip): + raise TypeError("clip should be Tesnor. Got %s" % type(clip)) + + if not clip.ndimension() == 4: + raise ValueError("clip should be 4D. Got %dD" % clip.dim()) + + return True + + +def crop(clip, i, j, h, w): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + """ + assert len(clip.size()) == 4, "clip should be a 4D tensor" + return clip[..., i:i + h, j:j + w] + + +def resize(clip, target_size, interpolation_mode): + assert len(target_size) == 2, "target size should be tuple (height, width)" + return torch.nn.functional.interpolate( + clip, size=target_size, mode=interpolation_mode + ) + + +def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): + """ + Do spatial cropping and resizing to the video clip + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + i (int): i in (i,j) i.e coordinates of the upper left corner. + j (int): j in (i,j) i.e coordinates of the upper left corner. + h (int): Height of the cropped region. + w (int): Width of the cropped region. + size (tuple(int, int)): height and width of resized clip + Returns: + clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + clip = crop(clip, i, j, h, w) + clip = resize(clip, size, interpolation_mode) + return clip + + +def center_crop(clip, crop_size): + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h >= th and w >= tw, "height and width must be no smaller than crop_size" + + i = int(round((h - th) / 2.0)) + j = int(round((w - tw) / 2.0)) + return crop(clip, i, j, th, tw) + +def corner_crop(clip, crop_size, i, j): + assert _is_tensor_video_clip(clip),"clip should be a 4d torch tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h>=th and w>=tw, "height and width must be no smaller than crop_size" + return crop(clip, i, j, th, tw) + + +def to_tensor(clip): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + """ + _is_tensor_video_clip(clip) + if not clip.dtype == torch.uint8: + raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) + return clip.float().permute(3, 0, 1, 2) / 255.0 + + +def normalize(clip, mean, std, inplace=False): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + mean (tuple): pixel RGB mean. 
Size is (3) + std (tuple): pixel standard deviation. Size is (3) + Returns: + normalized clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + if not inplace: + clip = clip.clone() + mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device) + std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device) + if clip.size(0) == 3: + clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) + elif clip.size(0) == 1: + #make it compatibale with depth image + mean = mean.mean() + std = std.mean() + clip.sub_(mean).div_(std) + else: + raise NotImplementedError() + return clip + + +def hflip(clip): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + Returns: + flipped clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + return clip.flip((-1)) diff --git a/videoloaders/transform_temporal.py b/videoloaders/transform_temporal.py new file mode 100755 index 0000000000000000000000000000000000000000..14948b753f81ab5e6fd31b6de3d65b9c59e82348 --- /dev/null +++ b/videoloaders/transform_temporal.py @@ -0,0 +1,162 @@ +import os +import random +import math + + +def temporal_batching_index(fr,length=16): + ''' + Do padding or half-overlapping clips for video. + + Input: + fr: number of frames + Output: + batch_indices: array for batch where each element is frame index + ''' + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + return [[0]*left + list(range(fr)) + [fr-1]*right] + + batch_indices = [] + last_idx = fr - 1 + assert length%2 == 0 + half = int(length/2) + for i in range(0,fr-half,half): + frame_indices = [0,]*length + for j in range(length): + current_idx = i + j + if current_idx < last_idx: + frame_indices[j] = current_idx + else: + frame_indices[j] = last_idx + batch_indices.append(frame_indices) + + return batch_indices + +def temporal_sliding_window(clip,window = 16): + ''' + Make a batched tensor with 16 frame sliding window with the overlap of 8. + If a clip is not the multiply of 8, it's padded with the last frames. (1,2...,13,14,14,14) for (1,..,14) + If a clip is less than 16 frames, padding is applied like (1,1,....,1,2,3,4,5,5,...,5,5) for (1,2,3,4,5) + This can be used for sliding window evaluation. + + Input: list of image paths + Output: torch tensor of shape of (batch,ch,16,h,w). + ''' + + batch_indices = temporal_batching_index(len(clip),length = window) + + return [[clip[idx] for idx in indices] for indices in batch_indices] + +def temporal_center_crop(clip,length = 16): + ''' + Input: list of image paths + Output: torch tensor of shape of (1,ch,16,h,w). + ''' + fr = len(clip) + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + middle = int(fr/2) + assert length%2 == 0 + half = int(length/2) + start = middle - half + output = clip[start : start+length] + + return output[::2] + + + +def random_temporal_crop(clip,length = 16): + ''' + Just randomly sample 16 consecutive frames + if less than 16 frames, just add padding. + ''' + fr = len(clip) + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + start=random.randint(0,fr-length) + output = clip[start : start+length] + return output[::2] + + +def use_all_frames(clip): + ''' + Just use it as it is :) + ''' + return clip + +def looppadding(clip, length=16): + + + out = clip + + for index in out: + if len(out) >= length: + break + out.append(index) + + return out[::2] + +def temporal_even_crop(clip, length=16, n_samples=1): + + clip = list(clip) + n_frames = len(clip) + indices = list(range(len(clip))) + stride = max( + 1, math.ceil((n_frames - 1 - length) / (n_samples - 1))) + + out = [] + for begin_index in indices[::stride]: + if len(out) >= n_samples: + break + end_index = min(indices[-1] + 1, begin_index + length) + sample = list(range(begin_index, end_index)) + + if len(sample) < length: + out.append([clip[i] for i in looppadding(sample, length=length)]) + # out.append(clip[looppadding(sample, length=length)]) + break + else: + out.append([clip[i] for i in sample[::2]]) + # out.append(clip[sample[::2]]) + + return out + + +class TemporalTransform(object): + def __init__(self,length,mode="center"): + self.mode = mode + self.length = length + #pass dummpy in order to catch incoored mode + self.__call__(range(128)) + + def __call__(self, clip): + if self.mode == "random": + return random_temporal_crop(clip,self.length) + elif self.mode == "center": + return temporal_center_crop(clip,self.length) + elif self.mode == "all" or self.mode == "nocrop": + #note that length cannot be satisfied! + return use_all_frames(clip) + elif self.mode == "slide": + #note that output has one more dimention + return temporal_sliding_window(clip,self.length) + elif self.mode == "even": + return temporal_even_crop(clip, self.length, n_samples=5) + else: + raise NotImplementedError("this option is not defined:",self.mode) \ No newline at end of file diff --git a/videoloaders/transforms_video.py b/videoloaders/transforms_video.py new file mode 100755 index 0000000000000000000000000000000000000000..156cf463abf94932421cb2dd7d500fdcdb998bbe --- /dev/null +++ b/videoloaders/transforms_video.py @@ -0,0 +1,312 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +#!/usr/bin/env python3 + +import numbers +import random +import torch + +try: + import accimage +except: + pass + +from torchvision.transforms import ( + RandomResizedCrop, +) + +from . import functional_video as F + +def _get_image_size(img): + if isinstance(img, torch.Tensor) and img.dim() > 2: + return img.shape[-2:][::-1] + else: + raise TypeError("Unexpected type {}".format(type(img))) + +class RandomCrop(object): + """Crop the given PIL Image at a random location. + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None, i.e no padding. If a sequence of length + 4 is provided, it is used to pad left, top, right, bottom borders + respectively. 
If a sequence of length 2 is provided, it is used to + pad left/right, top/bottom borders, respectively. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill: Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. + - constant: pads with a constant value, this value is specified with fill + - edge: pads with the last value on the edge of the image + - reflect: pads with reflection of image (without repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + - symmetric: pads with reflection of image (repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img (PIL Image): Image to be cropped. + output_size (tuple): Expected output size of the crop. + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + w, h = _get_image_size(img) + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. + """ + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + # pad the width if needed + if self.pad_if_needed and img.size[0] < self.size[1]: + img = F.pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode) + # pad the height if needed + if self.pad_if_needed and img.size[1] < self.size[0]: + img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) + + i, j, h, w = self.get_params(img, self.size) + + return F.crop(img, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding) + + + + + +class RandomCropVideo(RandomCrop): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. 
+ size is (C, T, OH, OW) + """ + i, j, h, w = self.get_params(clip, self.size) + return F.crop(clip, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0})'.format(self.size) + + +class RandomResizedCropVideo(RandomResizedCrop): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): + if isinstance(size, tuple): + assert len(size) == 2, "size should be tuple (height, width)" + self.size = size + else: + self.size = (size, size) + + self.interpolation_mode = interpolation_mode + self.scale = scale + self.ratio = ratio + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, H, W) + """ + i, j, h, w = self.get_params(clip, self.scale, self.ratio) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) + + def __repr__(self): + return self.__class__.__name__ + \ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio + ) + + +class CenterCropVideo(object): + def __init__(self, crop_size): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + + return F.center_crop(clip, self.crop_size) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + +class CornerCropVideo(object): + def __init__(self, crop_size, loc="tr"): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + def __call__(self, clip, loc="tr"): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + if loc == "tr": + i = 0 + j = 0 + elif loc == "center": + return F.corner_crop(clip, self.crop_size) + else: + i = clip.size(-2) - self.crop_size + j = clip.size(-1) - self.crop_size + return F.corner_crop(clip, self.crop_size, i, j) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + + +class NormalizeVideo(object): + """ + Normalize the video clip by mean subtraction and division by standard deviation + Args: + mean (3-tuple): pixel RGB mean + std (3-tuple): pixel RGB standard deviation + inplace (boolean): whether do in-place normalization + """ + + def __init__(self, mean, std, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): video clip to be normalized. 
Size is (C, T, H, W) + """ + return F.normalize(clip, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1}, inplace={2})'.format( + self.mean, self.std, self.inplace) + + +class ToTensorVideo(object): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + """ + + def __init__(self): + pass + + def __call__(self, clip): + """ + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + return F.to_tensor(clip) + + def __repr__(self): + return self.__class__.__name__ + + +class RandomHorizontalFlipVideo(object): + """ + Flip the video clip along the horizonal direction with a given probability + Args: + p (float): probability of the clip being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + self.p = p + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, H, W) + """ + if random.random() < self.p: + clip = F.hflip(clip) + return clip + + def __repr__(self): + return self.__class__.__name__ + "(p={0})".format(self.p) + + + +class ResizeVideo(object): + """ + Resize the video clip + """ + def __init__(self, w,h): + self.w = w + self.h = h + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, h, w) + """ + #interpolare needs (T,C, H, W) order while clip is (C, T, H, W) + return torch.nn.functional.interpolate( + clip.permute(1,0,2,3),(self.h,self.w),mode="bilinear",align_corners=False).permute(1,0,2,3) + + def __repr__(self): + return self.__class__.__name__ + "(w=%d,h=%d)"%(self.w,self.h) \ No newline at end of file
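
To round off the files above, here is a minimal usage sketch for the experiment helpers in `utils/utils.py` (seeding, run-directory creation, and checkpointing). It assumes the repository root is on `PYTHONPATH` so that `utils.utils` is importable; the `--lr` argument and the `torch.nn.Linear` model are placeholders for illustration only.

```
import argparse
import torch
from utils.utils import (setup_seed, make_deterministic, setup_savedir,
                         save_args, save_checkpoint)

parser = argparse.ArgumentParser()
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--seed", type=int, default=-1)
args = parser.parse_args([])            # empty list -> just use the defaults for this sketch

args.seed = setup_seed(args.seed)       # negative seed -> derive one from the clock (or SATOSHI_SEED)
make_deterministic(args.seed)           # seed python/numpy/torch and force deterministic cuDNN

savedir = setup_savedir(prefix="loconet", basedir="./experiments",
                        args=args, append_args=["lr", "seed"])
save_args(savedir, args)                # writes args.json into the run directory

model = torch.nn.Linear(10, 2)          # stand-in for the real network
save_checkpoint(f"{savedir}/checkpoint.pth", model)   # stores {"model": model.state_dict()}
```

If a later run resumes from this checkpoint with a slightly different architecture, `resume_model` falls back to loading only the overlapping parameter names and prints the keys it ignored.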