diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bccf5dc23d508df8a405a594f1c0a2bf826a2395
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+## LoCoNet: Long-Short Context Network for Active Speaker Detection
+
+
+
+### Dependencies
+
+Start by building the environment:
+```
+conda env create -f requirements.yml
+conda activate loconet
+```
+Then add `dlhammer` to your Python path:
+```
+export PYTHONPATH=**project_dir**/dlhammer:$PYTHONPATH
+```
+Replace `**project_dir**` with the location of your code base.
+
+
+
+### Data preparation
+
+We follow TalkNet's data preparation script to download and prepare the AVA dataset:
+
+```
+python train.py --dataPathAVA AVADataPath --download
+```
+
+`AVADataPath` is the folder where the AVA dataset and its preprocessing outputs will be saved; the details can be found [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34). Please read them carefully.
+
+After the AVA dataset is downloaded, please update the `DATA.dataPathAVA` entry in the config file.
+
+#### Training script
+```
+python -W ignore::UserWarning train.py --cfg configs/multi.yaml OUTPUT_DIR {output directory}
+```
+
+
+
+#### Pretrained model
+
+Please download the LoCoNet weights trained on the AVA dataset [here](https://drive.google.com/file/d/1EX-V464jCD6S-wg68yGuAa-UcsMrw8mK/view?usp=sharing), then run:
+
+```
+python -W ignore::UserWarning test_multicard.py --cfg configs/multi.yaml RESUME_PATH {model download path}
+```
+
+### Citation
+
+Please cite the following if our paper or code is helpful to your research.
+```
+@article{wang2023loconet,
+  title={LoCoNet: Long-Short Context Network for Active Speaker Detection},
+  author={Wang, Xizi and Cheng, Feng and Bertasius, Gedas and Crandall, David},
+  journal={arXiv preprint arXiv:2301.08237},
+  year={2023}
+}
+```
+
+
+### Acknowledgement
+
+The code base of this project is adapted from [TalkNet](https://github.com/TaoRuijie/TalkNet-ASD), a very easy-to-use ASD pipeline.
+
+
diff --git a/__pycache__/dataLoader_multiperson.cpython-37.pyc b/__pycache__/dataLoader_multiperson.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7185ff267eaf7ed6976127efe3ef7b35ae06946a
Binary files /dev/null and b/__pycache__/dataLoader_multiperson.cpython-37.pyc differ
diff --git a/__pycache__/loconet.cpython-37.pyc b/__pycache__/loconet.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..033f1b172f74594300b087bc45d9682e429cf061
Binary files /dev/null and b/__pycache__/loconet.cpython-37.pyc differ
diff --git a/__pycache__/loss_multi.cpython-37.pyc b/__pycache__/loss_multi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7115764cd6c3c548362dc5583b586a6679a4798
Binary files /dev/null and b/__pycache__/loss_multi.cpython-37.pyc differ
diff --git a/__pycache__/talkNet_config_multi.cpython-37.pyc b/__pycache__/talkNet_config_multi.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58a6a8ef9aa50f0fb0dc7a67eaa5d51dd92f5bc0
Binary files /dev/null and b/__pycache__/talkNet_config_multi.cpython-37.pyc differ
diff --git a/builder.py b/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6afeb8375a5b1b3fc83948d230af2cb6039f745d
--- /dev/null
+++ b/builder.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+#================================================================
+# Don't go gently into that good night.
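+# Registry-based model builders (BACKBONES / NECKS / HEADS / RECOGNIZERS /
+# LOSSES / LOCALIZERS, plus DETECTORS when mmdet is installed) following the
+# mmcv Registry pattern; build_model() below dispatches on the config's `type`.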
+# +# author: klaus +# description: +# +#================================================================ + +import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry + +from mmaction.utils import import_module_error_func + +MODELS = Registry('models', parent=MMCV_MODELS) +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +RECOGNIZERS = MODELS +LOSSES = MODELS +LOCALIZERS = MODELS + +try: + from mmdet.models.builder import DETECTORS, build_detector +except (ImportError, ModuleNotFoundError): + # Define an empty registry and building func, so that can import + DETECTORS = MODELS + + @import_module_error_func('mmdet') + def build_detector(cfg, train_cfg, test_cfg): + pass + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_recognizer(cfg, train_cfg=None, test_cfg=None): + """Build recognizer.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model. Details see this ' + 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning) + assert cfg.get( + 'train_cfg' + ) is None or train_cfg is None, 'train_cfg specified in both outer field and model field' # noqa: E501 + assert cfg.get( + 'test_cfg' + ) is None or test_cfg is None, 'test_cfg specified in both outer field and model field ' # noqa: E501 + return RECOGNIZERS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_localizer(cfg): + """Build localizer.""" + return LOCALIZERS.build(cfg) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """Build model.""" + args = cfg.copy() + obj_type = args.pop('type') + if obj_type in LOCALIZERS: + return build_localizer(cfg) + if obj_type in RECOGNIZERS: + return build_recognizer(cfg, train_cfg, test_cfg) + if obj_type in DETECTORS: + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model. 
Details see this ' + 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning) + return build_detector(cfg, train_cfg, test_cfg) + model_in_mmdet = ['FastRCNN'] + if obj_type in model_in_mmdet: + raise ImportError('Please install mmdet for spatial temporal detection tasks.') + raise ValueError(f'{obj_type} is not registered in ' 'LOCALIZERS, RECOGNIZERS or DETECTORS') + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) diff --git a/configs/multi.yaml b/configs/multi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..75234312e9c71c496307bf7d1756b782a5c67324 --- /dev/null +++ b/configs/multi.yaml @@ -0,0 +1,51 @@ +SEED: "20210617" +NUM_GPUS: 4 +NUM_WORKERS: 6 +LOG_NAME: 'config.txt' +OUTPUT_DIR: '/nfs/joltik/data/ssd/xiziwang/TalkNet_models/' # savePath +evalDataType: "val" +downloadAVA: False +evaluation: False +RESUME: False +RESUME_PATH: "" +RESUME_EPOCH: 0 + +DATA: + dataPathAVA: '/nfs/jolteon/data/ssd/xiziwang/AVA_dataset/' + +DATALOADER: + nDataLoaderThread: 4 + + +SOLVER: + OPTIMIZER: "adam" + BASE_LR: 5e-5 + SCHEDULER: + NAME: "multistep" + GAMMA: 0.95 + +MODEL: + NUM_SPEAKERS: 3 + CLIP_LENGTH: 200 + AV: "speaker_temporal" + AV_layers: 3 + ADJUST_ATTENTION: 0 + +TRAIN: + BATCH_SIZE: 1 + MAX_EPOCH: 25 + AUDIO_AUG: 1 + TEST_INTERVAL: 1 + TRAINER_GPU: 4 + + +VAL: + BATCH_SIZE: 1 + +TEST: + BATCH_SIZE: 1 + DATASET: 'seen' + MODEL: 'unseen' + + + diff --git a/dataLoaderTalkSet.py b/dataLoaderTalkSet.py new file mode 100644 index 0000000000000000000000000000000000000000..0ef5bc173f3aefee330297e0425b04bc4b6c4bf0 --- /dev/null +++ b/dataLoaderTalkSet.py @@ -0,0 +1,182 @@ +import os, torch, numpy, cv2, imageio, random, python_speech_features +import matplotlib.pyplot as plt +from scipy.io import wavfile +from glob import glob +from torchvision.transforms import RandomCrop +from scipy import signal + +def get_noise_list(musanPath, rirPath): + augment_files = glob(os.path.join(musanPath, '*/*/*/*.wav')) + noiselist = {} + rir = numpy.load(rirPath) + for file in augment_files: + if not file.split('/')[-4] in noiselist: + noiselist[file.split('/')[-4]] = [] + noiselist[file.split('/')[-4]].append(file) + return rir, noiselist + +def augment_wav(audio, aug_type, rir, noiselist): + if aug_type == 'rir': + rir_gains = numpy.random.uniform(-7,3,1) + rir_filts = random.choice(rir) + rir = numpy.multiply(rir_filts, pow(10, 0.1 * rir_gains)) + audio = signal.convolve(audio, rir, mode='full')[:len(audio)] + else: + noisecat = aug_type + noisefile = random.choice(noiselist[noisecat].copy()) + snr = [random.uniform({'noise':[0,15],'music':[5,15]}[noisecat][0], {'noise':[0,15],'music':[5,15]}[noisecat][1])] + _, noiseaudio = wavfile.read(noisefile) + if len(noiseaudio) < len(audio): + shortage = len(audio) - len(noiseaudio) + noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap') + else: + noiseaudio = noiseaudio[:len(audio)] + + noise_db = 10 * numpy.log10(numpy.mean(abs(noiseaudio ** 2)) + 1e-4) + clean_db = 10 * numpy.log10(numpy.mean(abs(audio ** 2)) + 1e-4) + noise = numpy.sqrt(10 ** ((clean_db - noise_db - snr) / 10)) * noiseaudio + audio = audio + noise + return audio.astype(numpy.int16) + +def load_audio(data, data_path, length, start, end, audio_aug, rirlist = None, noiselist = None): + # Find the path of the audio data + data_type = data[0] + id_name = data[1][:8] + file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \ + '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + 
data[2].split('/')[2] + '.wav' + audio_file_path = os.path.join(data_path, data_type, id_name, file_name) + # Load audio, compute MFCC, cut it to the required length + _, audio = wavfile.read(audio_file_path) + + if audio_aug == True: + augtype = random.randint(0,3) + if augtype == 1: # rir + audio = augment_wav(audio, 'rir', rirlist, noiselist) + elif augtype == 2: + audio = augment_wav(audio, 'noise', rirlist, noiselist) + elif augtype == 3: + audio = augment_wav(audio, 'music', rirlist, noiselist) + else: + audio = audio + + feature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010) + length_audio = int(round(length * 100)) + if feature.shape[0] < length_audio: + shortage = length_audio - feature.shape[0] + feature = numpy.pad(feature, ((0, shortage), (0,0)), 'wrap') + feature = feature[int(round(start * 100)):int(round(end * 100)),:] + return feature + +def load_video(data, data_path, length, start, end, visual_aug): + # Find the path of the visual data + data_type = data[0] + id_name = data[1][:8] + file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \ + '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + data[2].split('/')[2] + '.mp4' + video_file_path = os.path.join(data_path, data_type, id_name, file_name) + # Load visual frame-by-frame, cut it to the required length + length_video = int(round((end - start) * 25)) + video = cv2.VideoCapture(video_file_path) + faces = [] + augtype = 'orig' + + if visual_aug == True: + new = int(112*random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, 112 - new), numpy.random.randint(0, 112 - new) + M = cv2.getRotationMatrix2D((112/2,112/2), random.uniform(-15, 15), 1) + augtype = random.choice(['orig', 'flip', 'crop', 'rotate']) + + num_frame = 0 + while video.isOpened(): + ret, frames = video.read() + if ret == True: + num_frame += 1 + if num_frame >= int(round(start * 25)) and num_frame < int(round(end * 25)): + face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (224,224)) + face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))] + if augtype == 'orig': + faces.append(face) + elif augtype == 'flip': + faces.append(cv2.flip(face, 1)) + elif augtype == 'crop': + faces.append(cv2.resize(face[y:y+new, x:x+new] , (112,112))) + elif augtype == 'rotate': + faces.append(cv2.warpAffine(face, M, (112,112))) + else: + break + video.release() + faces = numpy.array(faces) + if faces.shape[0] < length_video: + shortage = length_video - faces.shape[0] + faces = numpy.pad(faces, ((0,shortage), (0,0),(0,0)), 'wrap') + # faces = numpy.array(faces)[int(round(start * 25)):int(round(end * 25)),:,:] + return faces + +def load_label(data, length, start, end): + labels_all = [] + labels = [] + data_type = data[0] + start_T, end_T, start_F, end_F = float(data[4]), float(data[5]), float(data[6]), float(data[7]) + for i in range(int(round(length * 100))): + if data_type == 'TAudio': + labels_all.append(1) + elif data_type == 'FAudio' or data_type == 'FSilence': + labels_all.append(0) + else: + if i >= int(round(start_T * 100)) and i <= int(round(end_T * 100)): + labels_all.append(1) + else: + labels_all.append(0) + for i in range(int(round(length * 25))): + labels.append(int(round(sum(labels_all[i*4: (i+1)*4]) / 4))) + return labels[round(start*25): round(end*25)] + +class loader_TalkSet(object): + def __init__(self, trial_file_name, data_path, audio_aug, visual_aug, musanPath, rirPath,**kwargs): + self.data_path = data_path + 
self.audio_aug = audio_aug + self.visual_aug = visual_aug + self.minibatch = [] + self.rir, self.noiselist = get_noise_list(musanPath, rirPath) + mix_lst = open(trial_file_name).read().splitlines() + mix_lst = list(filter(lambda x: float(x.split()[3]) >= 1, mix_lst)) # filter the video less than 1s + # mix_lst = list(filter(lambda x: x.split()[0] == 'TSilence', mix_lst)) + sorted_mix_lst = sorted(mix_lst, key=lambda data: (float(data.split()[3]), int(data.split()[-1])), reverse=True) + start = 0 + while True: + length_total = float(sorted_mix_lst[start].split()[3]) + batch_size = int(250 / length_total) + end = min(len(sorted_mix_lst), start + batch_size) + self.minibatch.append(sorted_mix_lst[start:end]) + if end == len(sorted_mix_lst): + break + start = end + # self.minibatch = self.minibatch[0:5] + + def __getitem__(self, index): + batch_lst = self.minibatch[index] + length_total = float(batch_lst[-1].split()[3]) + length_total = (int(round(length_total * 100)) - int(round(length_total * 100)) % 4) / 100 + audio_feature, video_feature, labels = [], [], [] + duration = random.choice([1,2,4,6]) + #duration = 6 + length = min(length_total, duration) + if length == duration: + start = int(round(random.randint(0, round(length_total * 25) - round(length * 25)) * 0.04 * 100)) / 100 + end = int(round((start + length) * 100)) / 100 + else: + start, end = 0, length + + for line in batch_lst: + data = line.split() + audio_feature.append(load_audio(data, self.data_path, length_total, start, end, audio_aug = self.audio_aug, rirlist = self.rir, noiselist = self.noiselist)) + video_feature.append(load_video(data, self.data_path, length_total, start, end, visual_aug = self.visual_aug)) + labels.append(load_label(data, length_total, start, end)) + + return torch.FloatTensor(numpy.array(audio_feature)), \ + torch.FloatTensor(numpy.array(video_feature)), \ + torch.LongTensor(numpy.array(labels)) + + def __len__(self): + return len(self.minibatch) \ No newline at end of file diff --git a/dataLoader_multiperson.py b/dataLoader_multiperson.py new file mode 100755 index 0000000000000000000000000000000000000000..1e643ea3722f16734e0880dd404730b314b44a98 --- /dev/null +++ b/dataLoader_multiperson.py @@ -0,0 +1,402 @@ +import os, torch, numpy, cv2, random, glob, python_speech_features, json, math +from scipy.io import wavfile +from torchvision.transforms import RandomCrop +from operator import itemgetter +from torchvggish import vggish_input, vggish_params, mel_features + + +def overlap(audio, noiseAudio): + snr = [random.uniform(-5, 5)] + if len(noiseAudio) < len(audio): + shortage = len(audio) - len(noiseAudio) + noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap') + else: + noiseAudio = noiseAudio[:len(audio)] + noiseDB = 10 * numpy.log10(numpy.mean(abs(noiseAudio**2)) + 1e-4) + cleanDB = 10 * numpy.log10(numpy.mean(abs(audio**2)) + 1e-4) + noiseAudio = numpy.sqrt(10**((cleanDB - noiseDB - snr) / 10)) * noiseAudio + audio = audio + noiseAudio + return audio.astype(numpy.int16) + + +def load_audio(data, dataPath, numFrames, audioAug, audioSet=None): + dataName = data[0] + fps = float(data[2]) + audio = audioSet[dataName] + if audioAug == True: + augType = random.randint(0, 1) + if augType == 1: + audio = overlap(dataName, audio, audioSet) + else: + audio = audio + # fps is not always 25, in order to align the visual, we modify the window and step in MFCC extraction process based on fps + audio = python_speech_features.mfcc(audio, + 16000, + numcep=13, + winlen=0.025 * 25 / fps, + winstep=0.010 * 25 / fps) 
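+    # With winstep = 0.010 * 25 / fps seconds, the MFCC extractor produces exactly
+    # 4 feature frames per video frame, so numFrames video frames correspond to
+    # numFrames * 4 audio frames below.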
+ maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap') + audio = audio[:int(round(numFrames * 4)), :] + return audio + + +def load_single_audio(audio, fps, numFrames, audioAug=False): + audio = python_speech_features.mfcc(audio, + 16000, + numcep=13, + winlen=0.025 * 25 / fps, + winstep=0.010 * 25 / fps) + maxAudio = int(numFrames * 4) + if audio.shape[0] < maxAudio: + shortage = maxAudio - audio.shape[0] + audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap') + audio = audio[:int(round(numFrames * 4)), :] + return audio + + +def load_visual(data, dataPath, numFrames, visualAug): + dataName = data[0] + videoName = data[0][:11] + faceFolderPath = os.path.join(dataPath, videoName, dataName) + faceFiles = glob.glob("%s/*.jpg" % faceFolderPath) + sortedFaceFiles = sorted(faceFiles, + key=lambda data: (float(data.split('/')[-1][:-4])), + reverse=False) + faces = [] + H = 112 + if visualAug == True: + new = int(H * random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + for faceFile in sortedFaceFiles[:numFrames]: + face = cv2.imread(faceFile) + + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H, H))) + faces = numpy.array(faces) + return faces + + +def load_label(data, numFrames): + res = [] + labels = data[3].replace('[', '').replace(']', '') + labels = labels.split(',') + for label in labels: + res.append(int(label)) + res = numpy.array(res[:numFrames]) + return res + + +class train_loader(object): + + def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers): + self.cfg = cfg + self.audioPath = audioPath + self.visualPath = visualPath + self.candidate_speakers = num_speakers + self.path = os.path.join(cfg.DATA.dataPathAVA, "csv") + self.entity_data = json.load(open(os.path.join(self.path, 'train_entity.json'))) + self.ts_to_entity = json.load(open(os.path.join(self.path, 'train_ts.json'))) + self.mixLst = open(trialFileName).read().splitlines() + self.list_length = len(self.mixLst) + random.shuffle(self.mixLst) + + def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None): + if audioAug: + augType = random.randint(0, 1) + if augType == 1: + audio = overlap(audio, aug_audio) + else: + audio = audio + + res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False) + return res + + def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts, visualAug=True): + + faceFolderPath = os.path.join(self.visualPath, videoName, entityName) + + faces = [] + H = 112 + if visualAug == True: + new = int(H * random.uniform(0.7, 1)) + x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new) + M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1) + augType = random.choice(['orig', 'flip', 'crop', 'rotate']) + else: + augType = 'orig' + labels_dict = self.entity_data[videoName][entityName] + labels = numpy.zeros(len(target_ts)) + mask = numpy.zeros(len(target_ts)) + + for i, time in 
enumerate(target_ts): + if time not in context_ts: + faces.append(numpy.zeros((H, H))) + else: + labels[i] = labels_dict[time] + mask[i] = 1 + time = "%.2f" % float(time) + faceFile = os.path.join(faceFolderPath, str(time) + '.jpg') + + face = cv2.imread(faceFile) + + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + if augType == 'orig': + faces.append(face) + elif augType == 'flip': + faces.append(cv2.flip(face, 1)) + elif augType == 'crop': + faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H))) + elif augType == 'rotate': + faces.append(cv2.warpAffine(face, M, (H, H))) + faces = numpy.array(faces) + return faces, labels, mask + + def get_speaker_context(self, videoName, target_entity, all_ts, center_ts): + + context_speakers = list(self.ts_to_entity[videoName][center_ts]) + context = {} + chosen_speakers = [] + context[target_entity] = all_ts + context_speakers.remove(target_entity) + num_frames = len(all_ts) + for candidate in context_speakers: + candidate_ts = self.entity_data[videoName][candidate] + shared_ts = set(all_ts).intersection(set(candidate_ts)) + if (len(shared_ts) > (num_frames / 2)): + context[candidate] = shared_ts + chosen_speakers.append(candidate) + context_speakers = chosen_speakers + random.shuffle(context_speakers) + if not context_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers)) + elif len(context_speakers) < self.candidate_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers[1:])) + else: + context_speakers.insert(0, target_entity) # make sure is at 0 + context_speakers = context_speakers[:self.candidate_speakers] + + assert set(context_speakers).issubset(set(list(context.keys()))), target_entity + assert target_entity in context_speakers, target_entity + + return context_speakers, context + + def __getitem__(self, index): + + target_video = self.mixLst[index] + data = target_video.split('\t') + fps = float(data[2]) + videoName = data[0][:11] + target_entity = data[0] + all_ts = list(self.entity_data[videoName][target_entity].keys()) + numFrames = int(data[1]) + assert numFrames == len(all_ts) + + center_ts = all_ts[math.floor(numFrames / 2)] + + # get context speakers which have more than half time overlapped with target speaker + context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts, + center_ts) + + if self.cfg.TRAIN.AUDIO_AUG: + other_indices = list(range(0, index)) + list(range(index + 1, self.list_length)) + augment_entity = self.mixLst[random.choice(other_indices)] + augment_data = augment_entity.split('\t') + augment_entity = augment_data[0] + augment_videoname = augment_data[0][:11] + aug_sr, aug_audio = wavfile.read( + os.path.join(self.audioPath, augment_videoname, augment_entity + '.wav')) + else: + aug_audio = None + + audio_path = os.path.join(self.audioPath, videoName, target_entity + '.wav') + sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav')) + audio = self.load_single_audio(audio, + fps, + numFrames, + audioAug=self.cfg.TRAIN.AUDIO_AUG, + aug_audio=aug_audio) + + visualFeatures, labels, masks = [], [], [] + + # target_label = list(self.entity_data[videoName][target_entity].values()) + visual, target_labels, target_masks = self.load_visual_label_mask( + videoName, 
target_entity, all_ts, all_ts) + + for idx, context_entity in enumerate(context_speakers): + if context_entity == target_entity: + label = target_labels + visualfeat = visual + mask = target_masks + else: + visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity, + all_ts, + context[context_entity]) + visualFeatures.append(visualfeat) + labels.append(label) + masks.append(mask) + + audio = torch.FloatTensor(audio)[None, :, :] + visualFeatures = torch.FloatTensor(numpy.array(visualFeatures)) + audio_t = audio.shape[1] + video_t = visualFeatures.shape[1] + if audio_t != video_t * 4: + print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames) + labels = torch.LongTensor(numpy.array(labels)) + masks = torch.LongTensor(numpy.array(masks)) + print(audio.shape) + return audio, visualFeatures, labels, masks + + def __len__(self): + return len(self.mixLst) + + +class val_loader(object): + + def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers): + self.cfg = cfg + self.audioPath = audioPath + self.visualPath = visualPath + self.candidate_speakers = num_speakers + self.path = os.path.join(cfg.DATA.dataPathAVA, "csv") + self.entity_data = json.load(open(os.path.join(self.path, 'val_entity.json'))) + self.ts_to_entity = json.load(open(os.path.join(self.path, 'val_ts.json'))) + self.mixLst = open(trialFileName).read().splitlines() + + def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None): + + res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False) + return res + + def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts): + + faceFolderPath = os.path.join(self.visualPath, videoName, entityName) + + faces = [] + H = 112 + labels_dict = self.entity_data[videoName][entityName] + labels = numpy.zeros(len(target_ts)) + mask = numpy.zeros(len(target_ts)) + + for i, time in enumerate(target_ts): + if time not in context_ts: + faces.append(numpy.zeros((H, H))) + else: + labels[i] = labels_dict[time] + mask[i] = 1 + time = "%.2f" % float(time) + faceFile = os.path.join(faceFolderPath, str(time) + '.jpg') + + face = cv2.imread(faceFile) + face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY) + face = cv2.resize(face, (H, H)) + faces.append(face) + faces = numpy.array(faces) + return faces, labels, mask + + def get_speaker_context(self, videoName, target_entity, all_ts, center_ts): + + context_speakers = list(self.ts_to_entity[videoName][center_ts]) + context = {} + chosen_speakers = [] + context[target_entity] = all_ts + context_speakers.remove(target_entity) + num_frames = len(all_ts) + for candidate in context_speakers: + candidate_ts = self.entity_data[videoName][candidate] + shared_ts = set(all_ts).intersection(set(candidate_ts)) + context[candidate] = shared_ts + chosen_speakers.append(candidate) + # if (len(shared_ts) > (num_frames / 2)): + # context[candidate] = shared_ts + # chosen_speakers.append(candidate) + context_speakers = chosen_speakers + random.shuffle(context_speakers) + if not context_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers)) + elif len(context_speakers) < self.candidate_speakers: + context_speakers.insert(0, target_entity) # make sure is at 0 + while len(context_speakers) < self.candidate_speakers: + context_speakers.append(random.choice(context_speakers[1:])) + else: + context_speakers.insert(0, 
target_entity) # make sure is at 0 + context_speakers = context_speakers[:self.candidate_speakers] + + assert set(context_speakers).issubset(set(list(context.keys()))), target_entity + + return context_speakers, context + + def __getitem__(self, index): + + target_video = self.mixLst[index] + data = target_video.split('\t') + fps = float(data[2]) + videoName = data[0][:11] + target_entity = data[0] + all_ts = list(self.entity_data[videoName][target_entity].keys()) + numFrames = int(data[1]) + # print(numFrames, len(all_ts)) + assert numFrames == len(all_ts) + + center_ts = all_ts[math.floor(numFrames / 2)] + + # get context speakers which have more than half time overlapped with target speaker + context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts, + center_ts) + + sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav')) + audio = self.load_single_audio(audio, fps, numFrames, audioAug=False) + + visualFeatures, labels, masks = [], [], [] + + # target_label = list(self.entity_data[videoName][target_entity].values()) + target_visual, target_labels, target_masks = self.load_visual_label_mask( + videoName, target_entity, all_ts, all_ts) + + for idx, context_entity in enumerate(context_speakers): + if context_entity == target_entity: + label = target_labels + visualfeat = target_visual + mask = target_masks + else: + visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity, + all_ts, + context[context_entity]) + visualFeatures.append(visualfeat) + labels.append(label) + masks.append(mask) + + audio = torch.FloatTensor(audio)[None, :, :] + visualFeatures = torch.FloatTensor(numpy.array(visualFeatures)) + audio_t = audio.shape[1] + video_t = visualFeatures.shape[1] + if audio_t != video_t * 4: + print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames) + labels = torch.LongTensor(numpy.array(labels)) + masks = torch.LongTensor(numpy.array(masks)) + + return audio, visualFeatures, labels, masks + + def __len__(self): + return len(self.mixLst) diff --git a/dlhammer/.gitignore b/dlhammer/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6819b69e3e5ed7811a3d6ecb0290ba1175601955 --- /dev/null +++ b/dlhammer/.gitignore @@ -0,0 +1,3 @@ +*.log +.vim-arsync +__pycache__/ diff --git a/dlhammer/LICENSE b/dlhammer/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/dlhammer/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/dlhammer/README.md b/dlhammer/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ab4a733d103adef0e242dad6e7435270b4f1dfb4 --- /dev/null +++ b/dlhammer/README.md @@ -0,0 +1,2 @@ +# dl-hammer +tools for deep learning coding. 
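+
+A minimal usage sketch (assuming the package directory is on `PYTHONPATH`):
+
+```python
+from dlhammer import bootstrap, logger, CONFIG
+
+# Parses --cfg <yaml> plus trailing KEY VALUE overrides from the command line,
+# creates the workspace directory, and sets up file logging; returns an EasyDict.
+config = bootstrap()
+logger.info(config.OUTPUT_DIR)  # CONFIG is a module-level EasyDict holding the same values
+```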
diff --git a/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py b/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1cc3fee70e5695cce6305c56bc59af32bcdb113b --- /dev/null +++ b/dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import argparse +import datetime +from functools import partial +import yaml +from easydict import EasyDict + +# from .utils import get_vacant_gpu +from .logger import bootstrap_logger, logger +from .utils.system import get_available_gpuids +from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf + +CONFIG = EasyDict() + +BASE_CONFIG = { + 'OUTPUT_DIR': './workspace', + 'SESSION': 'base', + 'NUM_GPUS': 1, + 'LOG_NAME': 'log.txt' +} + + +def bootstrap_args(default_params=None): + """get the params from yaml file and args. The args will override arguemnts in the yaml file. + Returns: EasyDict instance. + + """ + parser = define_default_arg_parser() + cfg = update_config(parser, default_params) + create_workspace(cfg) #create workspace + + CONFIG.update(cfg) + bootstrap_logger(get_logfile(CONFIG)) # setup logger + setup_gpu(CONFIG.NUM_GPUS) #setup gpu + + return cfg + + +def setup_gpu(ngpu): + gpuids = get_available_gpuids() + # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]]) + + +def get_logfile(config): + return os.path.join(config.WORKSPACE, config.LOG_NAME) + + +def define_default_arg_parser(): + """Define a default arg_parser. + + Returns: + A argparse.ArgumentParser. More arguments can be added. + + """ + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str) + parser.add_argument('opts', + default=None, + nargs='*', + help='modify config options using the command-line') + + return parser + + +def update_config(arg_parser, default_config=None): + """ update argparser to args. + + Args: + arg_parser: argparse.ArgumentParser. 
+ """ + + parsed, unknown = arg_parser.parse_known_args() + if default_config and parsed.cfg == "" and "cfg" in default_config: + parsed.cfg = default_config["cfg"] + + config = EasyDict(BASE_CONFIG.copy()) + config['cfg'] = parsed.cfg + # update default config + if default_config is not None: + config.update(default_config) + + # merge config from yaml + if os.path.isfile(config.cfg): + with open(config.cfg, 'r') as f: + yml_config = yaml.full_load(f) + config = merge_dict(config, yml_config) + + # merge opts + config = merge_opts(config, parsed.opts) + + # eval values + config = eval_dict_leaf(config) + + return config + + +def create_workspace(cfg): + cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg)) + workspace = os.path.join(cfg.OUTPUT_DIR, cfg_name, cfg.SESSION) + os.makedirs(workspace, exist_ok=True) + cfg.WORKSPACE = workspace diff --git a/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py b/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2df2fcae6b6867afb06c186271805142c3ca245f --- /dev/null +++ b/dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import sys +import logging + +from .logger import bootstrap_logger, logger +from .argparser import bootstrap_args, CONFIG +from .utils.misc import to_string + +__all__ = ['bootstrap', 'logger', 'CONFIG'] + + +def bootstrap(default_cfg=None, print_cfg=True): + """TODO: Docstring for bootstrap. + + Kwargs: + use_argparser (TODO): TODO + use_logger (TODO): TODO + + Returns: TODO + + """ + config = bootstrap_args(default_cfg) + if print_cfg: + logger.info(to_string(config)) + return config diff --git a/dlhammer/dlhammer/__init__.py b/dlhammer/dlhammer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1253e3e11333d1f9a40940a17eb3e37c1d76f763 --- /dev/null +++ b/dlhammer/dlhammer/__init__.py @@ -0,0 +1 @@ +from .bootstrap import * diff --git a/dlhammer/dlhammer/argparser.py b/dlhammer/dlhammer/argparser.py new file mode 100644 index 0000000000000000000000000000000000000000..72702608063d7a97b020e24ad55aa0ce55a7ed5e --- /dev/null +++ b/dlhammer/dlhammer/argparser.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import argparse +import datetime +from functools import partial +import yaml +from easydict import EasyDict + +# from .utils import get_vacant_gpu +from .logger import bootstrap_logger, logger +from .utils.system import get_available_gpuids +from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf + +CONFIG = EasyDict() + +BASE_CONFIG = { + 'OUTPUT_DIR': './workspace', + 'NUM_GPUS': 1, + 'LOG_NAME': 'log.txt' +} + + +def bootstrap_args(default_params=None): + """get the params from yaml file and args. The args will override arguemnts in the yaml file. + Returns: EasyDict instance. 
+ + """ + parser = define_default_arg_parser() + cfg = update_config(parser, default_params) + create_workspace(cfg) #create workspace + + CONFIG.update(cfg) + bootstrap_logger(get_logfile(CONFIG)) # setup logger + setup_gpu(CONFIG.NUM_GPUS) #setup gpu + + return cfg + + +def setup_gpu(ngpu): + gpuids = get_available_gpuids() + # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]]) + + +def get_logfile(config): + return os.path.join(config.WORKSPACE, config.LOG_NAME) + + +def define_default_arg_parser(): + """Define a default arg_parser. + + Returns: + A argparse.ArgumentParser. More arguments can be added. + + """ + parser = argparse.ArgumentParser() + parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str) + parser.add_argument('opts', + default=None, + nargs='*', + help='modify config options using the command-line') + + return parser + + +def update_config(arg_parser, default_config=None): + """ update argparser to args. + + Args: + arg_parser: argparse.ArgumentParser. + """ + + parsed, unknown = arg_parser.parse_known_args() + if default_config and parsed.cfg == "" and "cfg" in default_config: + parsed.cfg = default_config["cfg"] + + config = EasyDict(BASE_CONFIG.copy()) + config['cfg'] = parsed.cfg + # update default config + if default_config is not None: + config.update(default_config) + + # merge config from yaml + if os.path.isfile(config.cfg): + with open(config.cfg, 'r') as f: + yml_config = yaml.full_load(f) + config = merge_dict(config, yml_config) + + # merge opts + config = merge_opts(config, parsed.opts) + + # eval values + config = eval_dict_leaf(config) + + return config + + +def create_workspace(cfg): + cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg)) + workspace = os.path.join(cfg.OUTPUT_DIR) + os.makedirs(workspace, exist_ok=True) + cfg.WORKSPACE = workspace diff --git a/dlhammer/dlhammer/bootstrap.py b/dlhammer/dlhammer/bootstrap.py new file mode 100644 index 0000000000000000000000000000000000000000..2df2fcae6b6867afb06c186271805142c3ca245f --- /dev/null +++ b/dlhammer/dlhammer/bootstrap.py @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import sys +import logging + +from .logger import bootstrap_logger, logger +from .argparser import bootstrap_args, CONFIG +from .utils.misc import to_string + +__all__ = ['bootstrap', 'logger', 'CONFIG'] + + +def bootstrap(default_cfg=None, print_cfg=True): + """TODO: Docstring for bootstrap. + + Kwargs: + use_argparser (TODO): TODO + use_logger (TODO): TODO + + Returns: TODO + + """ + config = bootstrap_args(default_cfg) + if print_cfg: + logger.info(to_string(config)) + return config diff --git a/dlhammer/dlhammer/logger.py b/dlhammer/dlhammer/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..2c9854b0254aec23d18e2eff17831859278ca36d --- /dev/null +++ b/dlhammer/dlhammer/logger.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys +import logging + +logger = logging.getLogger('DLHammer') + + +def bootstrap_logger(logfile=None, fmt=None): + """TODO: Docstring for bootstrap_logger. 
+ + Args: + logfile (str): file path logging to. + + Kwargs: + fmt (TODO): TODO + + Returns: TODO + + """ + if fmt is None: + # fmt = '%(asctime)s - %(levelname)-5s - [%(filename)s:%(lineno)d] %(message)s' + fmt = '%(message)s' + logging.basicConfig(level=logging.DEBUG, format=fmt) + + #log to file + if logfile is not None: + formatter = logging.Formatter(fmt) + fh = logging.FileHandler(logfile) + fh.setLevel(logging.DEBUG) + fh.setFormatter(formatter) + logger.addHandler(fh) + + # sys.stdout = LoggerWriter(sys.stdout, logger.info) + # sys.stderr = LoggerWriter(sys.stderr, logger.error) + return + + +class LoggerWriter(object): + + def __init__(self, stream, logfct): + self.terminal = stream + self.logfct = logfct + self.buf = [] + + def write(self, msg): + if msg.endswith('\n'): + self.buf.append(msg.rstrip('\n')) + + message = ''.join(self.buf) + self.logfct(message) + + self.buf = [] + else: + self.buf.append(msg) + + def flush(self): + pass diff --git a/dlhammer/dlhammer/test/config.yml b/dlhammer/dlhammer/test/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..53fe053b489f3e373e7a26dd9b2f20733c0f61e0 --- /dev/null +++ b/dlhammer/dlhammer/test/config.yml @@ -0,0 +1,32 @@ +a_int: 12 +a_float: 1e-2 +a_list: [0,1,2] +eval_list: eval(list(range(10))) +DATA: + PATH_TO_DATA_DIR: /home/ubuntu/data/kinetics/Mini-Kinetics-200 + PATH_PREFIX: /home/ubuntu/data/kinetics/k400_ver3 + NUM_FRAMES: 16 + SAMPLING_RATE: 8 + TARGET_FPS: 25 + TRAIN_JITTER_SCALES: [256, 320] + TRAIN_CROP_SIZE: 224 + TEST_CROP_SIZE: 224 + INPUT_CHANNEL_NUM: [3] +SOLVER: + BACKBONE: + OPTIMIZER: sgd + MOMENTUM: 0.9 + BASE_LR: 1e-3 + SCHEDULER: + NAME: warmup_multistep + MILESTONES: [13, 24] + WARMUP_EPOCHS: 0.5 + GAMMA: 0.1 + TEMPORAL_MODEL: + OPTIMIZER: sgd + MOMENTUM: 0.9 + BASE_LR: 1e-3 + SCHEDULER: + NAME: multistep + MILESTONES: [13, 24] + GAMMA: 0.1 diff --git a/dlhammer/dlhammer/test/test_args.py b/dlhammer/dlhammer/test/test_args.py new file mode 100644 index 0000000000000000000000000000000000000000..18c1faed5c369d7f6c6fa05ea28c39b21ecc6f62 --- /dev/null +++ b/dlhammer/dlhammer/test/test_args.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys + +CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..')) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.')) + +from dlhammer import bootstrap, CONFIG +from dlhammer import logger + +config = bootstrap(print_cfg=True) diff --git a/dlhammer/dlhammer/test/test_logger.py b/dlhammer/dlhammer/test/test_logger.py new file mode 100644 index 0000000000000000000000000000000000000000..7911831f9a03e7a2fa0115277be95fdb124583a8 --- /dev/null +++ b/dlhammer/dlhammer/test/test_logger.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. 
+# +# author: klaus +# description: +# +#================================================================ + +import os +import sys + +CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__)) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..')) +sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.')) + +from dlhammer import bootstrap, logger +bootstrap() + +logger.info('dummy output') + +raise Exception('dummy error') diff --git a/dlhammer/dlhammer/utils/__init__.py b/dlhammer/dlhammer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dlhammer/dlhammer/utils/misc.py b/dlhammer/dlhammer/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..1ceacd75cd6013cf9940525514cb4f7c5d965876 --- /dev/null +++ b/dlhammer/dlhammer/utils/misc.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import ast + + +def merge_dict(a, b, path=None): + """merge b into a. The values in b will override values in a. + + Args: + a (dict): dict to merge to. + b (dict): dict to merge from. + + Returns: dict1 with values merged from b. + + """ + if path is None: path = [] + for key in b: + if key in a: + if isinstance(a[key], dict) and isinstance(b[key], dict): + merge_dict(a[key], b[key], path + [str(key)]) + else: + a[key] = b[key] + else: + a[key] = b[key] + return a + + +def merge_opts(d, opts): + """merge opts + Args: + d (dict): The dict. + opts (list): The opts to merge. format: [key1, name1, key2, name2,...] + Returns: d. the input dict `d` with merged opts. + + """ + assert len(opts) % 2 == 0, f'length of opts must be even. Got: {opts}' + for i in range(0, len(opts), 2): + full_k, v = opts[i], opts[i + 1] + keys = full_k.split('.') + sub_d = d + for i, k in enumerate(keys): + if not hasattr(sub_d, k): + raise ValueError(f'The key {k} not exist in the dict. Full key:{full_k}') + if i != len(keys) - 1: + sub_d = sub_d[k] + else: + sub_d[k] = v + return d + + +def to_string(params, indent=2): + """format params to a string + + Args: + params (EasyDict): the params. + + Returns: The string to display. + + """ + msg = '{\n' + for i, (k, v) in enumerate(params.items()): + if isinstance(v, dict): + v = to_string(v, indent + 4) + spaces = ' ' * indent + msg += spaces + '{}: {}'.format(k, v) + if i == len(params) - 1: + msg += ' }' + else: + msg += '\n' + return msg + + +def eval_dict_leaf(d): + """eval values of dict leaf. + + Args: + d (dict): The dict to eval. + + Returns: dict. + + """ + for k, v in d.items(): + if not isinstance(v, dict): + d[k] = eval_string(v) + else: + eval_dict_leaf(v) + return d + + +def eval_string(string): + """automatically evaluate string to corresponding types. + + For example: + not a string -> return the original input + '0' -> 0 + '0.2' -> 0.2 + '[0, 1, 2]' -> [0,1,2] + 'eval(1+2)' -> 3 + 'eval(range(5))' -> [0,1,2,3,4] + + + Args: + value : string. 
+ + Returns: the corresponding type + + """ + if not isinstance(string, str): + return string + if len(string) > 1 and string[0] == '[' and string[-1] == ']': + return eval(string) + if string[0:5] == 'eval(': + return eval(string[5:-1]) + try: + v = ast.literal_eval(string) + except: + v = string + return v diff --git a/dlhammer/dlhammer/utils/system.py b/dlhammer/dlhammer/utils/system.py new file mode 100644 index 0000000000000000000000000000000000000000..d59df5266db2a5d675400c917c81b0dbbfd1d6c1 --- /dev/null +++ b/dlhammer/dlhammer/utils/system.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +#================================================================ +# Don't go gently into that good night. +# +# author: klaus +# description: +# +#================================================================ + +import os +import sys +import subprocess +import numpy as np + + +def get_available_gpuids(): + """ + Returns: the gpu ids sorted in descending order w.r.t occupied memory. + """ + com = "nvidia-smi|sed -n '/%/p'|sed 's/|/\\n/g'|sed -n '/MiB/p'|sed 's/ //g'|sed 's/MiB/\\n/'|sed '/\\//d'" + gpum = subprocess.check_output(com, shell=True) + gpum = gpum.decode('utf-8').split('\n') + gpum = gpum[:-1] + sorted_gpuid = np.argsort(gpum) + return sorted_gpuid diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000000000000000000000000000000000000..a9c704118304cd651e3e1d2c3d24450c48d50a8a --- /dev/null +++ b/environment.yml @@ -0,0 +1,298 @@ +name: loconet +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_gnu + - alsa-lib=1.2.3=h516909a_0 + - anyio=3.5.0=py37h89c1867_0 + - argon2-cffi=21.3.0=pyhd8ed1ab_0 + - argon2-cffi-bindings=21.2.0=py37h5e8e339_1 + - aria2=1.36.0=h319415d_2 + - attrs=21.4.0=pyhd8ed1ab_0 + - babel=2.9.1=pyh44b312d_0 + - backcall=0.2.0=pyh9f0ad1d_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 + - bleach=4.1.0=pyhd8ed1ab_0 + - bottleneck=1.3.4=py37h6c7ee08_0 + - brotli=1.0.9=h7f98852_6 + - brotli-bin=1.0.9=h7f98852_6 + - brotlipy=0.7.0=py37h5e8e339_1003 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.5.18.1=ha878542_0 + - cffi=1.14.6=py37hc58025e_0 + - configparser=5.2.0=pyhd8ed1ab_0 + - cryptography=36.0.1=py37hf1a17b8_0 + - cycler=0.11.0=pyhd8ed1ab_0 + - cython=0.29.27=py37hcd2ae1e_0 + - dbus=1.13.6=h48d8840_2 + - debugpy=1.5.1=py37hcd2ae1e_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - easydict=1.9=py_0 + - entrypoints=0.4=pyhd8ed1ab_0 + - expat=2.4.6=h27087fc_0 + - flit-core=3.7.0=pyhd8ed1ab_0 + - fontconfig=2.13.96=ha180cfb_0 + - fonttools=4.29.1=py37h5e8e339_0 + - freetype=2.10.4=h0708190_1 + - gettext=0.19.8.1=h0b5b191_1005 + - giflib=5.2.1=h36c2ea0_2 + - glib=2.68.4=h9c3ff4c_0 + - glib-tools=2.68.4=h9c3ff4c_0 + - gst-plugins-base=1.18.5=hf529b03_0 + - gstreamer=1.18.5=h76c114f_0 + - icu=68.2=h9c3ff4c_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib_resources=5.4.0=pyhd8ed1ab_0 + - ipykernel=6.9.1=py37h6531663_0 + - ipython=7.31.1=py37h89c1867_0 + - ipython_genutils=0.2.0=py_1 + - jbig=2.1=h7f98852_2003 + - jedi=0.18.1=py37h89c1867_0 + - jinja2=3.0.3=pyhd8ed1ab_0 + - jpeg=9e=h7f98852_0 + - json5=0.9.5=pyh9f0ad1d_0 + - jsonschema=4.4.0=pyhd8ed1ab_0 + - jupyter_client=7.1.2=pyhd8ed1ab_0 + - jupyter_core=4.9.2=py37h89c1867_0 + - jupyter_server=1.13.5=pyhd8ed1ab_1 + - jupyterlab=3.2.9=pyhd8ed1ab_0 + - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0 + - jupyterlab_server=2.10.3=pyhd8ed1ab_0 + - kiwisolver=1.3.2=py37h2527ec5_1 + - krb5=1.19.2=hcc1bbae_3 + - lcms2=2.12=hddcbb42_0 
+ - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - lerc=3.0=h9c3ff4c_0 + - libblas=3.9.0=13_linux64_openblas + - libbrotlicommon=1.0.9=h7f98852_6 + - libbrotlidec=1.0.9=h7f98852_6 + - libbrotlienc=1.0.9=h7f98852_6 + - libcblas=3.9.0=13_linux64_openblas + - libclang=11.1.0=default_ha53f305_1 + - libdeflate=1.10=h7f98852_0 + - libedit=3.1.20191231=he28a2e2_2 + - libevent=2.1.10=h9b69904_4 + - libffi=3.3=h58526e2_2 + - libgcc-ng=11.2.0=h1d223b6_12 + - libgfortran-ng=11.2.0=h69a702a_12 + - libgfortran5=11.2.0=h5c6108e_12 + - libglib=2.68.4=h3e27bee_0 + - libgomp=11.2.0=h1d223b6_12 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=13_linux64_openblas + - libllvm11=11.1.0=hf817b99_3 + - libogg=1.3.4=h7f98852_1 + - libopenblas=0.3.18=pthreads_h8fe5266_0 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.37=h21135ba_2 + - libpq=13.5=hd57d9b9_1 + - libsodium=1.0.18=h36c2ea0_1 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=11.2.0=he4da1e4_12 + - libtiff=4.3.0=h542a066_3 + - libuuid=2.32.1=h7f98852_1000 + - libvorbis=1.3.7=h9c3ff4c_0 + - libwebp=1.2.2=h3452ae3_0 + - libwebp-base=1.2.2=h7f98852_1 + - libxcb=1.13=h7f98852_1004 + - libxkbcommon=1.0.3=he3ba5ed_0 + - libxml2=2.9.12=h72842e0_0 + - libzlib=1.2.11=h36c2ea0_1013 + - llvmlite=0.38.0=py37h0761922_1 + - lz4-c=1.9.3=h9c3ff4c_1 + - markupsafe=2.1.0=py37h540881e_0 + - matplotlib=3.5.1=py37h89c1867_0 + - matplotlib-base=3.5.1=py37h1058ff1_0 + - matplotlib-inline=0.1.3=pyhd8ed1ab_0 + - mistune=0.8.4=py37h5e8e339_1005 + - munkres=1.1.4=pyh9f0ad1d_0 + - mysql-common=8.0.28=ha770c72_0 + - mysql-libs=8.0.28=hfa10184_0 + - nbclassic=0.3.5=pyhd8ed1ab_0 + - nbclient=0.5.11=pyhd8ed1ab_0 + - nbconvert=6.4.2=py37h89c1867_0 + - nbformat=5.1.3=pyhd8ed1ab_0 + - ncurses=6.2=h58526e2_4 + - nest-asyncio=1.5.4=pyhd8ed1ab_0 + - nomkl=1.0=h5ca1d4c_0 + - notebook=6.4.8=pyha770c72_0 + - nspr=4.32=h9c3ff4c_1 + - nss=3.74=hb5efdd6_0 + - numba=0.55.1=py37h2d894fd_0 + - numexpr=2.8.0=py37hfe5f03c_101 + - numpy=1.21.5=py37hf2998dd_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1o=h166bdaf_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandas=1.3.5=py37h8c16a72_0 + - pandoc=2.17.1.1=ha770c72_0 + - pandocfilters=1.5.0=pyhd8ed1ab_0 + - parso=0.8.3=pyhd8ed1ab_0 + - patsy=0.5.2=pyhd8ed1ab_0 + - pcre=8.45=h9c3ff4c_0 + - pexpect=4.8.0=pyh9f0ad1d_2 + - pickleshare=0.7.5=py_1003 + - pip=22.0.3=pyhd8ed1ab_0 + - prometheus_client=0.13.1=pyhd8ed1ab_0 + - prompt-toolkit=3.0.27=pyha770c72_0 + - pthread-stubs=0.4=h36c2ea0_1001 + - ptyprocess=0.7.0=pyhd3deb0d_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.11.2=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.7=pyhd8ed1ab_0 + - pyqt=5.12.3=py37h89c1867_8 + - pyqt-impl=5.12.3=py37hac37412_8 + - pyqt5-sip=4.19.18=py37hcd2ae1e_8 + - pyqtchart=5.12=py37he336c9b_8 + - pyqtwebengine=5.12.1=py37he336c9b_8 + - pyrsistent=0.18.1=py37h5e8e339_0 + - pysocks=1.7.1=py37h89c1867_4 + - python=3.7.9=hffdb5ce_100_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.7=2_cp37m + - pytz=2021.3=pyhd8ed1ab_0 + - pyzmq=22.3.0=py37h336d617_1 + - qt=5.12.9=hda022c4_4 + - readline=8.1=h46c0cb4_0 + - resampy=0.2.2=py_0 + - scipy=1.7.3=py37hf2a6cf1_0 + - seaborn=0.11.2=hd8ed1ab_0 + - seaborn-base=0.11.2=pyhd8ed1ab_0 + - send2trash=1.8.0=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 + - sniffio=1.2.0=py37h89c1867_2 + - sqlite=3.37.0=h9cd32fc_0 + - statsmodels=0.13.2=py37hb1e94ed_0 + - terminado=0.13.1=py37h89c1867_0 + - testpath=0.5.0=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - tornado=6.1=py37h5e8e339_2 + - traitlets=5.1.1=pyhd8ed1ab_0 + - 
typing_extensions=4.1.1=pyha770c72_0 + - unicodedata2=14.0.0=py37h5e8e339_0 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - webencodings=0.5.1=py_1 + - websocket-client=1.2.3=pyhd8ed1ab_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xz=5.2.5=h516909a_1 + - zeromq=4.3.4=h9c3ff4c_1 + - zlib=1.2.11=h36c2ea0_1013 + - zstd=1.5.2=ha95c52a_0 + - pip: + - absl-py==1.0.0 + - addict==2.4.0 + - aiohttp==3.8.1 + - aiosignal==1.2.0 + - analytics-python==1.4.0 + - appdirs==1.4.4 + - asgiref==3.5.2 + - async-timeout==4.0.2 + - asynctest==0.13.0 + - audioread==2.1.9 + - backoff==1.10.0 + - bcrypt==3.2.2 + - beautifulsoup4==4.10.0 + - cachetools==4.2.4 + - certifi==2021.10.8 + - charset-normalizer==2.0.9 + - click==8.0.3 + - decorator==4.4.2 + - decord==0.6.0 + - einops==0.4.0 + - fastapi==0.78.0 + - ffmpeg==1.4 + - ffmpy==0.3.0 + - filelock==3.4.0 + - frozenlist==1.3.0 + - fsspec==2022.1.0 + - future==0.18.2 + - fvcore==0.1.5.post20221221 + - gdown==4.2.0 + - google-auth==2.3.3 + - google-auth-oauthlib==0.4.6 + - gradio==3.0.2 + - grpcio==1.43.0 + - h11==0.13.0 + - imageio==2.23.0 + - imageio-ffmpeg==0.4.7 + - importlib-metadata==4.10.0 + - iopath==0.1.10 + - ipywidgets==8.0.4 + - joblib==1.1.0 + - jupyterlab-widgets==3.0.5 + - librosa==0.9.1 + - linkify-it-py==1.0.3 + - lmdb==1.4.1 + - markdown==3.3.6 + - markdown-it-py==2.1.0 + - mdit-py-plugins==0.3.0 + - mdurl==0.1.1 + - mmaction2==0.24.1 + - mmcv==1.7.0 + - mmcv-full==1.4.6 + - monotonic==1.6 + - moviepy==1.0.3 + - multidict==5.2.0 + - oauthlib==3.1.1 + - opencv-contrib-python==4.7.0.68 + - opencv-python==4.5.5.62 + - orjson==3.6.8 + - paramiko==2.11.0 + - pillow==8.3.2 + - pooch==1.6.0 + - portalocker==2.7.0 + - proglog==0.1.10 + - protobuf==3.19.3 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pycryptodome==3.14.1 + - pydantic==1.9.0 + - pydeprecate==0.3.1 + - pydub==0.25.1 + - pynacl==1.5.0 + - python-box==6.0.2 + - python-multipart==0.0.5 + - python-speech-features==0.6 + - pytorch-lightning==1.5.8 + - pyyaml==6.0 + - requests==2.26.0 + - requests-oauthlib==1.3.0 + - rsa==4.8 + - scenedetect==0.5.6.1 + - scikit-learn==1.0.1 + - setuptools==60.9.3 + - soundfile==0.10.3.post1 + - soupsieve==2.3.1 + - starlette==0.19.1 + - tabulate==0.9.0 + - tensorboard==2.7.0 + - tensorboard-data-server==0.6.1 + - tensorboard-plugin-wit==1.8.1 + - termcolor==2.2.0 + - threadpoolctl==3.0.0 + - timm==0.4.5 + - torch==1.10.1 + - torchaudio==0.10.1 + - torchlibrosa==0.0.9 + - torchmetrics==0.7.0 + - torchvision==0.11.2 + - tqdm==4.62.3 + - typing-extensions==4.0.1 + - uc-micro-py==1.0.1 + - urllib3==1.26.7 + - uvicorn==0.17.6 + - warmup-scheduler-pytorch==0.1.2 + - werkzeug==2.0.2 + - wget==3.2 + - widgetsnbextension==4.0.5 + - yacs==0.1.8 + - yapf==0.32.0 + - yarl==1.7.2 + - youtube-dl==2021.12.17 + - zipp==3.6.0 diff --git a/legacy/talkNet_multi_multicard.py b/legacy/talkNet_multi_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..bb60a1796690dad773b7d46e005fb17bb35c13b4 --- /dev/null +++ b/legacy/talkNet_multi_multicard.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss_multi import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + +import pytorch_lightning as pl +from torch import distributed as dist + + +class talkNet(pl.LightningModule): + + def __init__(self, cfg): + super(talkNet, self).__init__() + self.model = talkNetModel().cuda() + self.cfg = cfg + 
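In the multi-speaker `training_step` that follows, each batch carries `S` face tracks per clip but only one audio stream, so the audio is broadcast to every track and the batch and speaker axes are folded together before the encoders run. A shape-only sketch with dummy tensors (the MFCC and crop sizes here are placeholders, not the exact dataloader output):

```
import torch

B, S, T = 2, 3, 100                     # clips, speaker tracks per clip, video frames
audio = torch.randn(B, 1, 4 * T, 13)    # one shared audio stream per clip
video = torch.randn(B, S, T, 112, 112)  # one face-crop track per speaker
labels = torch.randint(0, 2, (B, S, T))

audio = audio.repeat(1, S, 1, 1).view(B * S, 4 * T, 13)  # copy audio to every track
video = video.view(B * S, T, 112, 112)                   # fold (B, S) into one axis
labels = labels.view(B * S, T).reshape(-1)               # one label per track frame

print(audio.shape, video.shape, labels.shape)
# torch.Size([6, 400, 13]) torch.Size([6, 100, 112, 112]) torch.Size([600])
```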
self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + return {"optimizer": optimizer, "lr_scheduler": scheduler} + + def training_step(self, batch, batch_idx): + audioFeature, visualFeature, labels, masks = batch + b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2] + audioFeature = audioFeature.repeat(1, s, 1, 1) + audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:]) + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + + audioEmbed = self.model.forward_audio_frontend(audioFeature) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels.reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks) + nlossA = self.lossA.forward(outsA, labels, masks) + nlossV = self.lossV.forward(outsV, labels, masks) + loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + return loss + + def training_epoch_end(self, training_step_outputs): + self.saveParameters( + os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch))) + + def evaluate_network(self, loader): + self.eval() + predScores = [] + self.model = self.model.cuda() + self.lossAV = self.lossAV.cuda() + self.lossA = self.lossA.cuda() + self.lossV = self.lossV.cuda() + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader): + with torch.no_grad(): + b, s = visualFeature.shape[0], visualFeature.shape[1] + t = visualFeature.shape[2] + audioFeature = audioFeature.repeat(1, s, 1, 1) + audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:]) + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + audioEmbed = self.model.forward_audio_frontend(audioFeature.cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature.cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels.reshape((-1)).cuda() + outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1) + labels = labels.view(b, s, t)[:, 0, :].view(b * t) + masks = masks.view(b, s, t)[:, 0, :].view(b * t) + _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks) + predScore = predScore.detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = 
scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) diff --git a/legacy/talkNet_multicard.py b/legacy/talkNet_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..9ac7303feed93cc2595b32c9f6cd0a306229e2be --- /dev/null +++ b/legacy/talkNet_multicard.py @@ -0,0 +1,146 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + +import pytorch_lightning as pl +from torch import distributed as dist + + +class talkNet(pl.LightningModule): + + def __init__(self, cfg): + super(talkNet, self).__init__() + self.cfg = cfg + self.model = talkNetModel() + self.lossAV = lossAV() + self.lossA = lossA() + self.lossV = lossV() + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def configure_optimizers(self): + optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR) + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + return {"optimizer": optimizer, "lr_scheduler": scheduler} + + def training_step(self, batch, batch_idx): + audioFeature, visualFeature, labels = batch + audioEmbed = self.model.forward_audio_frontend(audioFeature[0]) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature[0]) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels[0].reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels) + nlossA = self.lossA.forward(outsA, labels) + nlossV = self.lossV.forward(outsV, labels) + loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + + return loss + + def training_epoch_end(self, training_step_outputs): + self.saveParameters( + os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch))) + + def validation_step(self, batch, batch_idx): + audioFeature, visualFeature, labels, indices = batch + audioEmbed = self.model.forward_audio_frontend(audioFeature[0]) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0]) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, 
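The mAP above is recovered by slicing the stringified `stdout` at fixed offsets (`split(' ')[2][:5]`), which silently yields garbage if the evaluation script's output format shifts. A slightly more defensive sketch, assuming only that `get_ava_active_speaker_performance.py` prints the mAP as a decimal number somewhere in its output:

```
import re
import subprocess


def run_ava_eval(eval_orig, eval_csv):
    """Run the AVA evaluation script and pull the first decimal number out of its output."""
    cmd = ["python", "-O", "utils/get_ava_active_speaker_performance.py",
           "-g", eval_orig, "-p", eval_csv]
    out = subprocess.run(cmd, capture_output=True, text=True).stdout
    match = re.search(r"\d+\.\d+", out)
    if match is None:
        raise RuntimeError("could not find an mAP value in: %r" % out)
    return float(match.group(0))
```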
visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)) + loss, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, -1:].detach().cpu().numpy() + # self.log("val_loss", loss) + + return predScore + + def validation_epoch_end(self, validation_step_outputs): + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + predScores = [] + + for out in validation_step_outputs: # batch size =1 + predScores.extend(out) + + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + print(len(evalRes), len(predScores), len(evalLines)) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + print("validation mAP: {}".format(mAP)) + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path, map_location='cpu') + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) + + def evaluate_network(self, loader): + self.eval() + self.model = self.model.cuda() + self.lossAV = self.lossAV.cuda() + self.lossA = self.lossA.cuda() + self.lossV = self.lossV.cuda() + predScores = [] + evalCsvSave = self.cfg.evalCsvSave + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP diff --git a/legacy/talkNet_orig.py b/legacy/talkNet_orig.py new file mode 100755 index 
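`loadParameters` in these modules copies tensors key by key and falls back to stripping a `module.` prefix, the usual mismatch when a checkpoint was saved from a `DataParallel`/`DistributedDataParallel` wrapper. A compact sketch of the same remapping idea (the prefix and `strict=False` are illustrative choices, not a fixed rule of this codebase):

```
import torch


def remap_state_dict(path, model, prefix="module."):
    """Load a checkpoint and drop a wrapper prefix from its keys before loading."""
    state = torch.load(path, map_location="cpu")
    state = {k[len(prefix):] if k.startswith(prefix) else k: v for k, v in state.items()}
    missing, unexpected = model.load_state_dict(state, strict=False)
    print("missing:", missing, "unexpected:", unexpected)
    return model
```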
0000000000000000000000000000000000000000..43d4d8d1190e5852429ebd58b848e41d91af528b --- /dev/null +++ b/legacy/talkNet_orig.py @@ -0,0 +1,102 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss import lossAV, lossA, lossV +from model.talkNetModel import talkNetModel + + +class talkNet(nn.Module): + + def __init__(self, lr=0.0001, lrDecay=0.95, **kwargs): + super(talkNet, self).__init__() + self.model = talkNetModel().cuda() + self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + self.optim = torch.optim.Adam(self.parameters(), lr=lr) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size=1, gamma=lrDecay) + print( + time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def train_network(self, loader, epoch, **kwargs): + self.train() + self.scheduler.step(epoch - 1) + index, top1, loss = 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1): + self.zero_grad() + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + labels = labels[0].reshape((-1)).cuda() # Loss + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels) + nlossA = self.lossA.forward(outsA, labels) + nlossV = self.lossV.forward(outsV, labels) + nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + loss += nloss.detach().cpu().numpy() + top1 += prec + nloss.backward() + self.optim.step() + index += len(labels) + sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \ + " [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \ + " Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), 100 * (top1/index))) + sys.stderr.flush() + sys.stdout.write("\n") + return loss / num, lr + + def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs): + self.eval() + predScores = [] + for audioFeature, visualFeature, labels in tqdm.tqdm(loader): + with torch.no_grad(): + audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) + visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda()) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed) + labels = labels[0].reshape((-1)).cuda() + _, predScore, _, _ = self.lossAV.forward(outsAV, labels) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, 
capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path) + for name, param in loadedState.items(): + origName = name + if name not in selfState: + name = name.replace("module.", "") + if name not in selfState: + print("%s is not in the model." % origName) + continue + if selfState[name].size() != loadedState[origName].size(): + sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" % + (origName, selfState[name].size(), loadedState[origName].size())) + continue + selfState[name].copy_(param) diff --git a/legacy/trainTalkNet_multicard.py b/legacy/trainTalkNet_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..5f698d5a76f0f7ae0c8f2f31d776678f3ef8be8d --- /dev/null +++ b/legacy/trainTalkNet_multicard.py @@ -0,0 +1,171 @@ +import time, os, torch, argparse, warnings, glob + +from utils.tools import * +from dlhammer import bootstrap +import pytorch_lightning as pl +from pytorch_lightning import Trainer, seed_everything +from pytorch_lightning.callbacks import ModelCheckpoint +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + + +class MyCollator(object): + + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = self.cfg.MODEL.CLIP_LENGTH + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat( + [i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
+ return padded_audio, padded_video, padded_labels, padded_masks + + +class DataPrep(pl.LightningDataModule): + + def __init__(self, cfg): + self.cfg = cfg + + def train_dataloader(self): + cfg = self.cfg + + if self.cfg.MODEL.NAME == "baseline": + from dataLoader import train_loader, val_loader + loader = train_loader(trialFileName = cfg.trainTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(cfg.visualPathAVA, 'train'), \ + batchSize=2500 + ) + elif self.cfg.MODEL.NAME == "multi": + from dataLoader_multiperson import train_loader, val_loader + loader = train_loader(trialFileName = cfg.trainTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(cfg.visualPathAVA, 'train'), \ + num_speakers=cfg.MODEL.NUM_SPEAKERS, + ) + if cfg.MODEL.NAME == "baseline": + trainLoader = torch.utils.data.DataLoader( + loader, + batch_size=1, + shuffle=True, + num_workers=4, + ) + elif cfg.MODEL.NAME == "multi": + collator = MyCollator(cfg) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=1, + shuffle=True, + num_workers=4, + collate_fn=collator) + + return trainLoader + + def val_dataloader(self): + cfg = self.cfg + loader = val_loader(trialFileName = cfg.evalTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , cfg.evalDataType), \ + visualPath = os.path.join(cfg.visualPathAVA, cfg.evalDataType), \ + ) + valLoader = torch.utils.data.DataLoader(loader, + batch_size=cfg.VAL.BATCH_SIZE, + shuffle=False, + num_workers=16) + return valLoader + + +def main(): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + cfg = bootstrap(print_cfg=False) + print(cfg) + + warnings.filterwarnings("ignore") + seed_everything(42, workers=True) + + cfg = init_args(cfg) + + # checkpoint_callback = ModelCheckpoint(dirpath=os.path.join(cfg.WORKSPACE, "model"), + # save_top_k=-1, + # filename='{epoch}') + + data = DataPrep(cfg) + + trainer = Trainer( + gpus=int(cfg.TRAIN.TRAINER_GPU), + precision=32, + # callbacks=[checkpoint_callback], + max_epochs=25, + replace_sampler_ddp=True) + # val_trainer = Trainer(deterministic=True, num_sanity_val_steps=-1, gpus=1) + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + # if cfg.RESUME: + # modelfiles = glob.glob('%s/model_0*.model' % cfg.modelSavePath) + # modelfiles.sort() + # if len(modelfiles) >= 1: + # print("Model %s loaded from previous state!" 
% modelfiles[-1]) + # epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + # s = talkNet(cfg) + # s.loadParameters(modelfiles[-1]) + # else: + # epoch = 1 + # s = talkNet(cfg) + epoch = 1 + if cfg.MODEL.NAME == "baseline": + from talkNet_multicard import talkNet + elif cfg.MODEL.NAME == "multi": + from talkNet_multi import talkNet + + s = talkNet(cfg) + + # scoreFile = open(cfg.scoreSavePath, "a+") + + trainer.fit(s, train_dataloaders=data.train_dataloader()) + + modelfiles = glob.glob('%s/*.pth' % os.path.join(cfg.WORKSPACE, "model")) + + modelfiles.sort() + for path in modelfiles: + s.loadParameters(path) + prec = trainer.validate(s, data.val_dataloader()) + + # if epoch % cfg.testInterval == 0: + # s.saveParameters(cfg.modelSavePath + "/model_%04d.model" % epoch) + # trainer.validate(dataloaders=valLoader) + # print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, mAP %2.2f%%" % (epoch, mAPs[-1])) + # scoreFile.write("%d epoch, LOSS %f, mAP %2.2f%%\n" % (epoch, loss, mAPs[-1])) + # scoreFile.flush() + + +if __name__ == '__main__': + main() diff --git a/legacy/train_multi.py b/legacy/train_multi.py new file mode 100755 index 0000000000000000000000000000000000000000..951f163f6bcd37b93748b222a84a9f9c1d34648e --- /dev/null +++ b/legacy/train_multi.py @@ -0,0 +1,156 @@ +import time, os, torch, argparse, warnings, glob + +from dataLoader_multiperson import train_loader, val_loader +from utils.tools import * +from talkNet_multi import talkNet + + +def collate_fn_padding(data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = 200 + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
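Both collators in this diff pad every clip in a batch to the longest clip, keep a 0/1 mask marking real frames, and cap the result at `cut_limit` video frames (audio is capped at `cut_limit * 4` feature frames). A self-contained sketch of the padding-plus-mask idea on dummy per-speaker label tensors; the mask is built inside the function here for illustration, whereas the loaders above get it from the dataset:

```
import torch


def pad_and_mask(label_clips, cut_limit=200):
    """Pad (num_speakers, T_i) tensors to a common length and return a validity mask."""
    max_len = min(max(t.shape[1] for t in label_clips), cut_limit)
    padded, masks = [], []
    for lab in label_clips:
        lab = lab[:, :max_len]                                   # truncate long clips
        pad = lab.new_zeros((lab.shape[0], max_len - lab.shape[1]))
        padded.append(torch.cat([lab, pad], dim=1))              # zero-pad short clips
        mask = torch.zeros_like(padded[-1])
        mask[:, :lab.shape[1]] = 1                               # 1 = real frame, 0 = padding
        masks.append(mask)
    return torch.stack(padded), torch.stack(masks)


labels = [torch.ones(3, 120), torch.ones(3, 80)]                 # two clips, 3 speakers each
padded_labels, padded_masks = pad_and_mask(labels)
print(padded_labels.shape, padded_masks.sum(dim=-1))             # (2, 3, 120), valid frames per track
```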
+ # print(padded_audio.shape, padded_video.shape, padded_labels.shape, padded_masks.shape) + return padded_audio, padded_video, padded_labels, padded_masks + + +def main(): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + warnings.filterwarnings("ignore") + + parser = argparse.ArgumentParser(description="TalkNet Training") + # Training details + parser.add_argument('--lr', type=float, default=0.0001, help='Learning rate') + parser.add_argument('--lrDecay', type=float, default=0.95, help='Learning rate decay rate') + parser.add_argument('--maxEpoch', type=int, default=25, help='Maximum number of epochs') + parser.add_argument('--testInterval', + type=int, + default=1, + help='Test and save every [testInterval] epochs') + parser.add_argument( + '--batchSize', + type=int, + default=2500, + help= + 'Dynamic batch size, default is 2500 frames, other batchsize (such as 1500) will not affect the performance' + ) + parser.add_argument('--batch_size', type=int, default=1, help='batch_size') + parser.add_argument('--num_speakers', type=int, default=5, help='num_speakers') + parser.add_argument('--nDataLoaderThread', type=int, default=4, help='Number of loader threads') + # Data path + parser.add_argument('--dataPathAVA', + type=str, + default="/data08/AVA", + help='Save path of AVA dataset') + parser.add_argument('--savePath', type=str, default="exps/exp1") + # Data selection + parser.add_argument('--evalDataType', + type=str, + default="val", + help='Only for AVA, to choose the dataset for evaluation, val or test') + # For download dataset only, for evaluation only + parser.add_argument('--downloadAVA', + dest='downloadAVA', + action='store_true', + help='Only download AVA dataset and do related preprocess') + parser.add_argument('--evaluation', + dest='evaluation', + action='store_true', + help='Only do evaluation by using pretrained model [pretrain_AVA.model]') + args = parser.parse_args() + # Data loader + args = init_args(args) + + if args.downloadAVA == True: + preprocess_AVA(args) + quit() + + loader = train_loader(trialFileName = args.trainTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , 'train'), \ + visualPath = os.path.join(args.visualPathAVA, 'train'), \ + # num_speakers = args.num_speakers, \ + **vars(args)) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=args.batch_size, + shuffle=True, + num_workers=args.nDataLoaderThread, + collate_fn=collate_fn_padding) + + loader = val_loader(trialFileName = args.evalTrialAVA, \ + audioPath = os.path.join(args.audioPathAVA , args.evalDataType), \ + visualPath = os.path.join(args.visualPathAVA, args.evalDataType), \ + # num_speakers = args.num_speakers, \ + **vars(args)) + valLoader = torch.utils.data.DataLoader(loader, batch_size=1, shuffle=False, num_workers=16) + + if args.evaluation == True: + download_pretrain_model_AVA() + s = talkNet(**vars(args)) + s.loadParameters('pretrain_AVA.model') + print("Model %s loaded from previous state!" % ('pretrain_AVA.model')) + mAP = s.evaluate_network(loader=valLoader, **vars(args)) + print("mAP %2.2f%%" % (mAP)) + quit() + + modelfiles = glob.glob('%s/model_0*.model' % args.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!" 
% modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = talkNet(epoch=epoch, **vars(args)) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = talkNet(epoch=epoch, **vars(args)) + + mAPs = [] + scoreFile = open(args.scoreSavePath, "a+") + + while (1): + loss, lr = s.train_network(epoch=epoch, loader=trainLoader, **vars(args)) + + if epoch % args.testInterval == 0: + s.saveParameters(args.modelSavePath + "/model_%04d.model" % epoch) + mAPs.append(s.evaluate_network(epoch=epoch, loader=valLoader, **vars(args))) + print(time.strftime("%Y-%m-%d %H:%M:%S"), + "%d epoch, mAP %2.2f%%, bestmAP %2.2f%%" % (epoch, mAPs[-1], max(mAPs))) + scoreFile.write("%d epoch, LR %f, LOSS %f, mAP %2.2f%%, bestmAP %2.2f%%\n" % + (epoch, lr, loss, mAPs[-1], max(mAPs))) + scoreFile.flush() + + if epoch >= args.maxEpoch: + quit() + + epoch += 1 + + +if __name__ == '__main__': + main() diff --git a/loconet.py b/loconet.py new file mode 100755 index 0000000000000000000000000000000000000000..835826dd4b777d4a4d79f3d66e76191f03434da0 --- /dev/null +++ b/loconet.py @@ -0,0 +1,182 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import sys, time, numpy, os, subprocess, pandas, tqdm + +from loss_multi import lossAV, lossA, lossV +from model.loconet_encoder import locoencoder + +import torch.distributed as dist +from xxlib.utils.distributed import all_gather, all_reduce + + +class Loconet(nn.Module): + + def __init__(self, cfg): + super(Loconet, self).__init__() + self.cfg = cfg + self.model = locoencoder(cfg) + self.lossAV = lossAV() + self.lossA = lossA() + self.lossV = lossV() + + def forward(self, audioFeature, visualFeature, labels, masks): + b, s, t = visualFeature.shape[:3] + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + + audioEmbed = self.model.forward_audio_frontend(audioFeature) # B, C, T, 4 + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed = audioEmbed.repeat(s, 1, 1) + + audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s) + outsA = self.model.forward_audio_backend(audioEmbed) + outsV = self.model.forward_visual_backend(visualEmbed) + + labels = labels.reshape((-1)) + masks = masks.reshape((-1)) + nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks) + nlossA = self.lossA.forward(outsA, labels, masks) + nlossV = self.lossV.forward(outsV, labels, masks) + + nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV + + num_frames = masks.sum() + return nloss, prec, num_frames + + +class loconet(nn.Module): + + def __init__(self, cfg, rank=None, device=None): + super(loconet, self).__init__() + self.cfg = cfg + self.rank = rank + if rank != None: + self.rank = rank + self.device = device + + self.model = Loconet(cfg).to(device) + self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model) + self.model = nn.parallel.DistributedDataParallel(self.model, + device_ids=[rank], + output_device=rank, + find_unused_parameters=False) + self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.SOLVER.BASE_LR) + self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, + step_size=1, + gamma=self.cfg.SOLVER.SCHEDULER.GAMMA) + else: + self.model = locoencoder(cfg).cuda() + self.lossAV = lossAV().cuda() + self.lossA = lossA().cuda() + self.lossV = lossV().cuda() + + print( + 
time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" % + (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024)) + + def train_network(self, epoch, loader): + self.model.train() + self.scheduler.step(epoch - 1) + index, top1, loss = 0, 0, 0 + lr = self.optim.param_groups[0]['lr'] + loader.sampler.set_epoch(epoch) + device = self.device + + pbar = enumerate(loader, start=1) + if self.rank == 0: + pbar = tqdm.tqdm(pbar, total=loader.__len__()) + + for num, (audioFeature, visualFeature, labels, masks) in pbar: + + audioFeature = audioFeature.to(device) + visualFeature = visualFeature.to(device) + labels = labels.to(device) + masks = masks.to(device) + nloss, prec, num_frames = self.model( + audioFeature, + visualFeature, + labels, + masks, + ) + + self.optim.zero_grad() + nloss.backward() + self.optim.step() + + [nloss, prec, num_frames] = all_reduce([nloss, prec, num_frames], average=False) + top1 += prec.detach().cpu().numpy() + loss += nloss.detach().cpu().numpy() + index += int(num_frames.detach().cpu().item()) + if self.rank == 0: + pbar.set_postfix( + dict(epoch=epoch, + lr=lr, + loss=loss / (num * self.cfg.NUM_GPUS), + acc=(top1 / index))) + dist.barrier() + return loss / num, lr + + def evaluate_network(self, epoch, loader): + self.eval() + predScores = [] + evalCsvSave = os.path.join(self.cfg.WORKSPACE, "{}_res.csv".format(epoch)) + evalOrig = self.cfg.evalOrig + for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader): + with torch.no_grad(): + audioFeature = audioFeature.cuda() + visualFeature = visualFeature.cuda() + labels = labels.cuda() + masks = masks.cuda() + b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2] + visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:]) + labels = labels.view(b * s, *labels.shape[2:]) + masks = masks.view(b * s, *masks.shape[2:]) + audioEmbed = self.model.forward_audio_frontend(audioFeature) + visualEmbed = self.model.forward_visual_frontend(visualFeature) + audioEmbed = audioEmbed.repeat(s, 1, 1) + audioEmbed, visualEmbed = self.model.forward_cross_attention( + audioEmbed, visualEmbed) + outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s) + labels = labels.reshape((-1)) + masks = masks.reshape((-1)) + outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1) + labels = labels.view(b, s, t)[:, 0, :].view(b * t).cuda() + masks = masks.view(b, s, t)[:, 0, :].view(b * t) + _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks) + predScore = predScore[:, 1].detach().cpu().numpy() + predScores.extend(predScore) + evalLines = open(evalOrig).read().splitlines()[1:] + labels = [] + labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines]) + scores = pandas.Series(predScores) + evalRes = pandas.read_csv(evalOrig) + evalRes['score'] = scores + evalRes['label'] = labels + evalRes.drop(['label_id'], axis=1, inplace=True) + evalRes.drop(['instance_id'], axis=1, inplace=True) + evalRes.to_csv(evalCsvSave, index=False) + cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig, + evalCsvSave) + mAP = float( + str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5]) + return mAP + + def saveParameters(self, path): + torch.save(self.state_dict(), path) + + def loadParameters(self, path): + selfState = self.state_dict() + loadedState = torch.load(path, map_location='cpu') + if self.rank != None: + info = self.load_state_dict(loadedState) + else: + new_state = {} + + for k, v in 
loadedState.items(): + new_state[k.replace("model.module.", "")] = v + info = self.load_state_dict(new_state, strict=False) + print(info) diff --git a/loss_multi.py b/loss_multi.py new file mode 100755 index 0000000000000000000000000000000000000000..47edcba1e3251d9cff47efd6e6cc2a54dde88795 --- /dev/null +++ b/loss_multi.py @@ -0,0 +1,72 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import utils.distributed as du + + +class lossAV(nn.Module): + + def __init__(self): + super(lossAV, self).__init__() + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(256, 2) + + def forward(self, x, labels=None, masks=None): + x = x.squeeze(1) + x = self.FC(x) + if labels == None: + predScore = x[:, 1] + predScore = predScore.t() + predScore = predScore.view(-1).detach().cpu().numpy() + return predScore + else: + nloss = self.criterion(x, labels) * masks + + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + + predScore = F.softmax(x, dim=-1) + predLabel = torch.round(F.softmax(x, dim=-1))[:, 1] + correctNum = ((predLabel == labels) * masks).sum().float() + return nloss, predScore, predLabel, correctNum + + +class lossA(nn.Module): + + def __init__(self): + super(lossA, self).__init__() + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels, masks=None): + x = x.squeeze(1) + x = self.FC(x) + nloss = self.criterion(x, labels) * masks + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + #nloss = torch.sum(nloss) / torch.sum(masks) + return nloss + + +class lossV(nn.Module): + + def __init__(self): + super(lossV, self).__init__() + + self.criterion = nn.CrossEntropyLoss(reduction='none') + self.FC = nn.Linear(128, 2) + + def forward(self, x, labels, masks=None): + x = x.squeeze(1) + x = self.FC(x) + nloss = self.criterion(x, labels) * masks + # nloss = torch.sum(nloss) / torch.sum(masks) + num_valid = masks.sum().float() + if self.training: + [num_valid] = du.all_reduce([num_valid],average=True) + nloss = torch.sum(nloss) / num_valid + return nloss diff --git a/metrics/AverageMeter.py b/metrics/AverageMeter.py new file mode 100755 index 0000000000000000000000000000000000000000..d5b1bc57d204e76690d92878a46584de98a4f1bd --- /dev/null +++ b/metrics/AverageMeter.py @@ -0,0 +1,18 @@ +#taken from pytorch imagenet example +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + diff --git a/metrics/__pycache__/.nfs000000035f4a8257000000eb b/metrics/__pycache__/.nfs000000035f4a8257000000eb new file mode 100644 index 0000000000000000000000000000000000000000..5c481da38ae427d717e3fae5b8a12b0efc12ae3b Binary files /dev/null and b/metrics/__pycache__/.nfs000000035f4a8257000000eb differ diff --git a/metrics/__pycache__/AverageMeter.cpython-36.pyc b/metrics/__pycache__/AverageMeter.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..d928a415daff76824c9e873f38db2928dd1dc58d Binary files /dev/null and b/metrics/__pycache__/AverageMeter.cpython-36.pyc differ diff --git a/metrics/__pycache__/AverageMeter.cpython-38.pyc 
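The three losses above share one masked pattern: per-frame cross-entropy with `reduction='none'`, zeroed on padded frames, then normalised by the number of valid frames (which is all-reduced across GPUs during training so every rank divides by the same count). A single-process sketch of that normalisation:

```
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(reduction="none")

logits = torch.randn(10, 2)                    # 10 frames, 2 classes (not speaking / speaking)
labels = torch.randint(0, 2, (10,))
masks = torch.tensor([1.] * 7 + [0.] * 3)      # last 3 frames are padding

per_frame = criterion(logits, labels) * masks  # padded frames contribute zero loss
loss = per_frame.sum() / masks.sum()           # average over valid frames only
```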
b/metrics/__pycache__/AverageMeter.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..90592f04e63826430507fd72bdb195cf923e2c7f Binary files /dev/null and b/metrics/__pycache__/AverageMeter.cpython-38.pyc differ diff --git a/metrics/__pycache__/accuracy.cpython-36.pyc b/metrics/__pycache__/accuracy.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..4f5c7021ffc19382a03dd31b665e2e1ed66ea090 Binary files /dev/null and b/metrics/__pycache__/accuracy.cpython-36.pyc differ diff --git a/metrics/__pycache__/accuracy.cpython-38.pyc b/metrics/__pycache__/accuracy.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28af79a9c47d1d10af6a5a364c21f8331be772ee Binary files /dev/null and b/metrics/__pycache__/accuracy.cpython-38.pyc differ diff --git a/metrics/accuracy.py b/metrics/accuracy.py new file mode 100755 index 0000000000000000000000000000000000000000..03cc9ef95d02130276d02b2a68526a8e30baa1ab --- /dev/null +++ b/metrics/accuracy.py @@ -0,0 +1,20 @@ +import torch + +accuracy = lambda output,target : acc_topk(output, target)[0] + +#taken from pytorch imagenet example +def acc_topk(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(1.0 / batch_size)) + return res \ No newline at end of file diff --git a/model/.DS_Store b/model/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..e734ea84fafb31cbe2b4f456dc438a531432a91e Binary files /dev/null and b/model/.DS_Store differ diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59239dd9719a28db4a512938c678f4d804630c2d --- /dev/null +++ b/model/__init__.py @@ -0,0 +1,5 @@ +from model.transformer.position_encoding import PositionalEncoding +from model.transformer.transformer import Transformer +from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer +from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer +from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask diff --git a/model/__pycache__/__init__.cpython-36.pyc b/model/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eccec07dc8794bd26846b65adc7193f2fc477490 Binary files /dev/null and b/model/__pycache__/__init__.cpython-36.pyc differ diff --git a/model/__pycache__/__init__.cpython-37.pyc b/model/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27a223bf75d5e07c4bf849e58178a865f155dc0a Binary files /dev/null and b/model/__pycache__/__init__.cpython-37.pyc differ diff --git a/model/__pycache__/attentionLayer.cpython-37.pyc b/model/__pycache__/attentionLayer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d40e2eb05278a0f315581e737313adacb3540e72 Binary files /dev/null and b/model/__pycache__/attentionLayer.cpython-37.pyc differ diff --git a/model/__pycache__/convLayer.cpython-37.pyc b/model/__pycache__/convLayer.cpython-37.pyc new file mode 100644 index 
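`acc_topk` and `AverageMeter` above are the stock ImageNet-example utilities; a quick usage sketch (the import paths assume the repository root is on `PYTHONPATH`):

```
import torch
from metrics.accuracy import acc_topk          # as defined above
from metrics.AverageMeter import AverageMeter

meter = AverageMeter()
logits = torch.randn(32, 2)
target = torch.randint(0, 2, (32,))
top1, = acc_topk(logits, target, topk=(1,))    # fraction of correct top-1 predictions
meter.update(top1.item(), n=target.size(0))
print(meter.avg)
```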
0000000000000000000000000000000000000000..10f90442a4c8e2d5a29476edf4620b016738711b Binary files /dev/null and b/model/__pycache__/convLayer.cpython-37.pyc differ diff --git a/model/__pycache__/loconet_encoder.cpython-37.pyc b/model/__pycache__/loconet_encoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a222c2bfc64a7f9f5ef1d84917c1ce66f5f80c96 Binary files /dev/null and b/model/__pycache__/loconet_encoder.cpython-37.pyc differ diff --git a/model/__pycache__/position_encoding.cpython-36.pyc b/model/__pycache__/position_encoding.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0e58bf2dada185cabe183810dfc4162b67eddcd8 Binary files /dev/null and b/model/__pycache__/position_encoding.cpython-36.pyc differ diff --git a/model/__pycache__/talkNetModel.cpython-37.pyc b/model/__pycache__/talkNetModel.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..925e49a9132958b0f48da11987a3cedfeadf95a0 Binary files /dev/null and b/model/__pycache__/talkNetModel.cpython-37.pyc differ diff --git a/model/__pycache__/transformer.cpython-36.pyc b/model/__pycache__/transformer.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..692c8b1f538d5202e7b297df3c611775c944d2e5 Binary files /dev/null and b/model/__pycache__/transformer.cpython-36.pyc differ diff --git a/model/__pycache__/utils.cpython-36.pyc b/model/__pycache__/utils.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66e2d2ad6fab6f4851e6eb2875ca6353334caa7b Binary files /dev/null and b/model/__pycache__/utils.cpython-36.pyc differ diff --git a/model/__pycache__/visualEncoder.cpython-37.pyc b/model/__pycache__/visualEncoder.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e82303ff2b2f0155a2cd1888a4ccb8c5faa308e Binary files /dev/null and b/model/__pycache__/visualEncoder.cpython-37.pyc differ diff --git a/model/attentionLayer.py b/model/attentionLayer.py new file mode 100755 index 0000000000000000000000000000000000000000..f4f1efd8da6dcccfd133aaddeb415ab5b38ab5d3 --- /dev/null +++ b/model/attentionLayer.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.nn import MultiheadAttention + + +class attentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.1): + super(attentionLayer, self).__init__() + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + self.linear1 = nn.Linear(d_model, d_model * 4) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_model * 4, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = F.relu + + def forward(self, src, tar, adjust=False, attn_mask=None): + # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor + src = src.transpose(0, 1) # B, T, C -> T, B, C + tar = tar.transpose(0, 1) # B, T, C -> T, B, C + if adjust: + src2 = self.self_attn(src, tar, tar, attn_mask=None, key_padding_mask=None)[0] + else: + src2 = self.self_attn(tar, src, src, attn_mask=None, key_padding_mask=None)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + src = src.transpose(0, 1) # T, B, C -> B, T, C + return src diff --git a/model/audioEncoder.py b/model/audioEncoder.py new 
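`attentionLayer` above is a cross-attention block over `(B, T, C)` tensors (transposed to `(T, B, C)` internally for `nn.MultiheadAttention`); by default the second argument supplies the queries while the first supplies the keys, values and the residual path. A quick shape check with dummy audio/visual embeddings of equal length (the import path assumes the repository root is on `PYTHONPATH`):

```
import torch
from model.attentionLayer import attentionLayer   # file defined above

layer = attentionLayer(d_model=128, nhead=8)
audio = torch.randn(4, 100, 128)    # (B, T, C)
visual = torch.randn(4, 100, 128)

out = layer(audio, visual)          # queries from `visual`; keys/values and residual from `audio`
print(out.shape)                    # torch.Size([4, 100, 128])
```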
file mode 100755 index 0000000000000000000000000000000000000000..6aaaf66b29d9453662bd20a918ebff35229f2966 --- /dev/null +++ b/model/audioEncoder.py @@ -0,0 +1,108 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SEBasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, reduction=8): + super(SEBasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se = SELayer(planes, reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.relu(out) + out = self.bn1(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + return out + +class SELayer(nn.Module): + def __init__(self, channel, reduction=8): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + +class audioEncoder(nn.Module): + def __init__(self, layers, num_filters, **kwargs): + super(audioEncoder, self).__init__() + block = SEBasicBlock + self.inplanes = num_filters[0] + + self.conv1 = nn.Conv2d(1, num_filters[0] , kernel_size=7, stride=(2, 1), padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(num_filters[0]) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, num_filters[0], layers[0]) + self.layer2 = self._make_layer(block, num_filters[1], layers[1], stride=(2, 2)) + self.layer3 = self._make_layer(block, num_filters[2], layers[2], stride=(2, 2)) + self.layer4 = self._make_layer(block, num_filters[3], layers[3], stride=(1, 1)) + out_dim = num_filters[3] * block.expansion + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = torch.mean(x, dim=2, keepdim=True) + x = x.view((x.size()[0], x.size()[1], -1)) + x = x.transpose(1, 2) + + return x \ No newline at end of file diff --git a/model/convLayer.py b/model/convLayer.py new file mode 100755 index 0000000000000000000000000000000000000000..827d83e61e208bc0acfb9bb587ec4f07e1d1104b --- /dev/null +++ 
b/model/convLayer.py @@ -0,0 +1,42 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + +class ConvLayer(nn.Module): + + def __init__(self, cfg): + super(ConvLayer, self).__init__() + self.cfg = cfg + self.s = cfg.MODEL.NUM_SPEAKERS + self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (self.s, 7), padding=(0, 3)) + # below line is speaker parallel 93.88 code + # self.conv2d = torch.nn.Conv2d(256, 256 * self.s, (3, 7), padding=(0, 3)) + self.ln = torch.nn.LayerNorm(256) + self.conv2d_1x1 = torch.nn.Conv2d(256, 512, (1, 1), padding=(0, 0)) + self.conv2d_1x1_2 = torch.nn.Conv2d(512, 256, (1, 1), padding=(0, 0)) + self.gelu = nn.GELU() + + def forward(self, x, b, s): + + identity = x # b*s, t, c + t = x.shape[1] + c = x.shape[2] + out = x.view(b, s, t, c) + out = out.permute(0, 3, 1, 2) # b, c, s, t + + out = self.conv2d(out) # b, s*c, 1, t + out = out.view(b, c, s, t) + out = out.permute(0, 2, 3, 1) # b, s, t, c + out = self.ln(out) + out = out.permute(0, 3, 1, 2) + out = self.conv2d_1x1(out) + out = self.gelu(out) + out = self.conv2d_1x1_2(out) # b, c, s, t + + out = out.permute(0, 2, 3, 1) # b, s, t, c + out = out.view(b * s, t, c) + + out += identity + + return out, b, s diff --git a/model/faceDetector/README.md b/model/faceDetector/README.md new file mode 100755 index 0000000000000000000000000000000000000000..f5a8d4feb007f86f8c60075d8538f9ee5e93b325 --- /dev/null +++ b/model/faceDetector/README.md @@ -0,0 +1,3 @@ +# Face detector + +This face detector is adapted from `https://github.com/cs-giung/face-detection-pytorch`. diff --git a/model/faceDetector/__init__.py b/model/faceDetector/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..059d49bf0b8e8a17f641984e7d889e5b008257b9 --- /dev/null +++ b/model/faceDetector/__init__.py @@ -0,0 +1 @@ +from .s3fd import S3FD \ No newline at end of file diff --git a/model/faceDetector/s3fd/__init__.py b/model/faceDetector/s3fd/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..943292ad2afae6ba9eebd03b6f9bb684a7de5ca5 --- /dev/null +++ b/model/faceDetector/s3fd/__init__.py @@ -0,0 +1,66 @@ +import time, os, sys, subprocess +import numpy as np +import cv2 +import torch +from torchvision import transforms +from .nets import S3FDNet +from .box_utils import nms_ + +PATH_WEIGHT = 'model/faceDetector/s3fd/sfd_face.pth' +if os.path.isfile(PATH_WEIGHT) == False: + Link = "1KafnHz7ccT-3IyddBsL5yi2xGtxAKypt" + cmd = "gdown --id %s -O %s"%(Link, PATH_WEIGHT) + subprocess.call(cmd, shell=True, stdout=None) +img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32') + + +class S3FD(): + + def __init__(self, device='cuda'): + + tstamp = time.time() + self.device = device + + # print('[S3FD] loading with', self.device) + self.net = S3FDNet(device=self.device).to(self.device) + PATH = os.path.join(os.getcwd(), PATH_WEIGHT) + state_dict = torch.load(PATH, map_location=self.device) + self.net.load_state_dict(state_dict) + self.net.eval() + # print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp)) + + def detect_faces(self, image, conf_th=0.8, scales=[1]): + + w, h = image.shape[1], image.shape[0] + + bboxes = np.empty(shape=(0, 5)) + + with torch.no_grad(): + for s in scales: + scaled_img = cv2.resize(image, dsize=(0, 0), fx=s, fy=s, interpolation=cv2.INTER_LINEAR) + + scaled_img = np.swapaxes(scaled_img, 1, 2) + scaled_img = np.swapaxes(scaled_img, 1, 0) + scaled_img = scaled_img[[2, 1, 0], :, :] + scaled_img = scaled_img.astype('float32') + 
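Returning to `model/convLayer.py` above: `ConvLayer` unflattens `(B*S, T, C)` features back to a `(B, C, S, T)` grid, applies a `Conv2d` whose kernel spans all `S` speaker tracks at once (its `S*C` output channels are reshaped back onto the speaker axis), and finishes with LayerNorm, a 1x1 feed-forward (256 to 512 to 256) and a residual connection, so the output keeps the flattened `(B*S, T, C)` shape. A shape check with a minimal stand-in config (the real code passes the project's full config object):

```
import torch
from types import SimpleNamespace
from model.convLayer import ConvLayer   # file defined above

cfg = SimpleNamespace(MODEL=SimpleNamespace(NUM_SPEAKERS=3))   # minimal stand-in config
layer = ConvLayer(cfg)

B, S, T, C = 2, 3, 100, 256
x = torch.randn(B * S, T, C)            # flattened per-speaker features
out, _, _ = layer(x, B, S)
print(out.shape)                        # torch.Size([6, 100, 256]): same shape, speakers mixed
```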
scaled_img -= img_mean + scaled_img = scaled_img[[2, 1, 0], :, :] + x = torch.from_numpy(scaled_img).unsqueeze(0).to(self.device) + y = self.net(x) + + detections = y.data + scale = torch.Tensor([w, h, w, h]) + + for i in range(detections.size(1)): + j = 0 + while detections[0, i, j, 0] > conf_th: + score = detections[0, i, j, 0] + pt = (detections[0, i, j, 1:] * scale).cpu().numpy() + bbox = (pt[0], pt[1], pt[2], pt[3], score) + bboxes = np.vstack((bboxes, bbox)) + j += 1 + + keep = nms_(bboxes, 0.1) + bboxes = bboxes[keep] + + return bboxes diff --git a/model/faceDetector/s3fd/box_utils.py b/model/faceDetector/s3fd/box_utils.py new file mode 100755 index 0000000000000000000000000000000000000000..0779bcd58062510a2979f5673f14189c4c817e92 --- /dev/null +++ b/model/faceDetector/s3fd/box_utils.py @@ -0,0 +1,217 @@ +import numpy as np +from itertools import product as product +import torch +from torch.autograd import Function + + +def nms_(dets, thresh): + """ + Courtesy of Ross Girshick + [https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py] + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1) * (y2 - y1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(int(i)) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return np.array(keep).astype(np.int) + + +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat(( + priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. 
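+ Also returns count, the number of valid entries in keep.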
+ """ + + keep = scores.new(scores.size(0)).zero_().long() + if boxes.numel() == 0: + return keep, 0 + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count + + +class Detect(object): + + def __init__(self, num_classes=2, + top_k=750, nms_thresh=0.3, conf_thresh=0.05, + variance=[0.1, 0.2], nms_top_k=5000): + + self.num_classes = num_classes + self.top_k = top_k + self.nms_thresh = nms_thresh + self.conf_thresh = conf_thresh + self.variance = variance + self.nms_top_k = nms_top_k + + def forward(self, loc_data, conf_data, prior_data): + + num = loc_data.size(0) + num_priors = prior_data.size(0) + + conf_preds = conf_data.view(num, num_priors, self.num_classes).transpose(2, 1) + batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) + batch_priors = batch_priors.contiguous().view(-1, 4) + + decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.variance) + decoded_boxes = decoded_boxes.view(num, num_priors, 4) + + output = torch.zeros(num, self.num_classes, self.top_k, 5) + + for i in range(num): + boxes = decoded_boxes[i].clone() + conf_scores = conf_preds[i].clone() + + for cl in range(1, self.num_classes): + c_mask = conf_scores[cl].gt(self.conf_thresh) + scores = conf_scores[cl][c_mask] + + if scores.dim() == 0: + continue + l_mask = c_mask.unsqueeze(1).expand_as(boxes) + boxes_ = boxes[l_mask].view(-1, 4) + ids, count = nms(boxes_, scores, self.nms_thresh, self.nms_top_k) + count = count if count < self.top_k else self.top_k + + output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) + + return output + + +class PriorBox(object): + + def __init__(self, input_size, feature_maps, + variance=[0.1, 0.2], + min_sizes=[16, 32, 64, 128, 256, 512], + steps=[4, 8, 16, 32, 64, 128], + clip=False): + + super(PriorBox, self).__init__() + + self.imh = input_size[0] + self.imw = input_size[1] + self.feature_maps = feature_maps + + self.variance = variance + self.min_sizes = min_sizes + self.steps = steps + self.clip = clip + + def forward(self): + mean = [] + for k, fmap in enumerate(self.feature_maps): + feath = fmap[0] + 
featw = fmap[1] + for i, j in product(range(feath), range(featw)): + f_kw = self.imw / self.steps[k] + f_kh = self.imh / self.steps[k] + + cx = (j + 0.5) / f_kw + cy = (i + 0.5) / f_kh + + s_kw = self.min_sizes[k] / self.imw + s_kh = self.min_sizes[k] / self.imh + + mean += [cx, cy, s_kw, s_kh] + + output = torch.FloatTensor(mean).view(-1, 4) + + if self.clip: + output.clamp_(max=1, min=0) + + return output diff --git a/model/faceDetector/s3fd/nets.py b/model/faceDetector/s3fd/nets.py new file mode 100755 index 0000000000000000000000000000000000000000..85b5c82c142f02cef75c1e03557b2a1a748c32b0 --- /dev/null +++ b/model/faceDetector/s3fd/nets.py @@ -0,0 +1,174 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init +from .box_utils import Detect, PriorBox + + +class L2Norm(nn.Module): + + def __init__(self, n_channels, scale): + super(L2Norm, self).__init__() + self.n_channels = n_channels + self.gamma = scale or None + self.eps = 1e-10 + self.weight = nn.Parameter(torch.Tensor(self.n_channels)) + self.reset_parameters() + + def reset_parameters(self): + init.constant_(self.weight, self.gamma) + + def forward(self, x): + norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps + x = torch.div(x, norm) + out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x + return out + + +class S3FDNet(nn.Module): + + def __init__(self, device='cuda'): + super(S3FDNet, self).__init__() + self.device = device + + self.vgg = nn.ModuleList([ + nn.Conv2d(3, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(64, 64, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(64, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(128, 128, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(128, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2, ceil_mode=True), + + nn.Conv2d(256, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(512, 512, 3, 1, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + + nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 1, 1), + nn.ReLU(inplace=True), + ]) + + self.L2Norm3_3 = L2Norm(256, 10) + self.L2Norm4_3 = L2Norm(512, 8) + self.L2Norm5_3 = L2Norm(512, 5) + + self.extras = nn.ModuleList([ + nn.Conv2d(1024, 256, 1, 1), + nn.Conv2d(256, 512, 3, 2, padding=1), + nn.Conv2d(512, 128, 1, 1), + nn.Conv2d(128, 256, 3, 2, padding=1), + ]) + + self.loc = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(1024, 4, 3, 1, padding=1), + nn.Conv2d(512, 4, 3, 1, padding=1), + nn.Conv2d(256, 4, 3, 1, padding=1), + ]) + + self.conf = nn.ModuleList([ + nn.Conv2d(256, 4, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(1024, 2, 3, 1, padding=1), + nn.Conv2d(512, 2, 3, 1, padding=1), + nn.Conv2d(256, 2, 3, 1, padding=1), + ]) + + self.softmax = nn.Softmax(dim=-1) + self.detect = Detect() + + def forward(self, x): + size = x.size()[2:] 
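+ # collect multi-scale feature maps in sources, then predict box offsets (loc) and class confidences (conf) from each of them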
+ sources = list() + loc = list() + conf = list() + + for k in range(16): + x = self.vgg[k](x) + s = self.L2Norm3_3(x) + sources.append(s) + + for k in range(16, 23): + x = self.vgg[k](x) + s = self.L2Norm4_3(x) + sources.append(s) + + for k in range(23, 30): + x = self.vgg[k](x) + s = self.L2Norm5_3(x) + sources.append(s) + + for k in range(30, len(self.vgg)): + x = self.vgg[k](x) + sources.append(x) + + # apply extra layers and cache source layer outputs + for k, v in enumerate(self.extras): + x = F.relu(v(x), inplace=True) + if k % 2 == 1: + sources.append(x) + + # apply multibox head to source layers + loc_x = self.loc[0](sources[0]) + conf_x = self.conf[0](sources[0]) + + max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) + conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) + + loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) + conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) + + for i in range(1, len(sources)): + x = sources[i] + conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) + loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) + + features_maps = [] + for i in range(len(loc)): + feat = [] + feat += [loc[i].size(1), loc[i].size(2)] + features_maps += [feat] + + loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) + conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) + + with torch.no_grad(): + self.priorbox = PriorBox(size, features_maps) + self.priors = self.priorbox.forward() + + output = self.detect.forward( + loc.view(loc.size(0), -1, 4), + self.softmax(conf.view(conf.size(0), -1, 2)), + self.priors.type(type(x.data)).to(self.device) + ) + + return output diff --git a/model/loconet_encoder.py b/model/loconet_encoder.py new file mode 100755 index 0000000000000000000000000000000000000000..5437007285b1e9ea495478e95a5697e9a54ca799 --- /dev/null +++ b/model/loconet_encoder.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn + +# from model.visualEncoder import visualFrontend, visualTCN, visualConv1D +from model.attentionLayer import attentionLayer +from model.convLayer import ConvLayer +from torchvggish import vggish +from model.visualEncoder import visualFrontend, visualConv1D, visualTCN + + +class locoencoder(nn.Module): + + def __init__(self, cfg): + super(locoencoder, self).__init__() + self.cfg = cfg + # Visual Temporal Encoder + self.visualFrontend = visualFrontend(cfg) # Visual Frontend + self.visualTCN = visualTCN() # Visual Temporal Network TCN + self.visualConv1D = visualConv1D() # Visual Temporal Network Conv1d + + urls = { + 'vggish': + "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" + } + self.audioEncoder = vggish.VGGish(urls, preprocess=False, postprocess=False) + self.audio_pool = nn.AdaptiveAvgPool1d(1) + + # Audio-visual Cross Attention + self.crossA2V = attentionLayer(d_model=128, nhead=8) + self.crossV2A = attentionLayer(d_model=128, nhead=8) + + # Audio-visual Self Attention + + num_layers = self.cfg.MODEL.AV_layers + layers = nn.ModuleList() + for i in range(num_layers): + layers.append(ConvLayer(cfg)) + layers.append(attentionLayer(d_model=256, nhead=8)) + self.convAV = layers + + def forward_visual_frontend(self, x): + + B, T, W, H = x.shape + x = x.view(B * T, 1, 1, W, H) + x = (x / 255 - 0.4161) / 0.1688 + x = self.visualFrontend(x) + x = x.view(B, T, 512) + x = x.transpose(1, 2) + x = self.visualTCN(x) + x = self.visualConv1D(x) + x = x.transpose(1, 2) + return x + + def forward_audio_frontend(self, x): + t = x.shape[-2] + numFrames = t // 4 + pad = 
8 - (t % 8) + x = torch.nn.functional.pad(x, (0, 0, 0, pad), "constant") + # x = x.unsqueeze(1).transpose(2, 3) + x = self.audioEncoder(x) + + b, c, t2, freq = x.shape + x = x.view(b * c, t2, freq) + x = self.audio_pool(x) + x = x.view(b, c, t2)[:, :, :numFrames] + x = x.permute(0, 2, 1) + return x + + def forward_cross_attention(self, x1, x2): + x1_c = self.crossA2V(src=x1, tar=x2, adjust=self.cfg.MODEL.ADJUST_ATTENTION) + x2_c = self.crossV2A(src=x2, tar=x1, adjust=self.cfg.MODEL.ADJUST_ATTENTION) + return x1_c, x2_c + + def forward_audio_visual_backend(self, x1, x2, b=1, s=1): + x = torch.cat((x1, x2), 2) # B*S, T, 2C + for i, layer in enumerate(self.convAV): + if i % 2 == 0: + x, b, s = layer(x, b, s) + else: + x = layer(src=x, tar=x) + + x = torch.reshape(x, (-1, 256)) + return x + + def forward_audio_backend(self, x): + x = torch.reshape(x, (-1, 128)) + return x + + def forward_visual_backend(self, x): + x = torch.reshape(x, (-1, 128)) + return x diff --git a/model/transformer/__pycache__/position_encoding.cpython-37.pyc b/model/transformer/__pycache__/position_encoding.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d3bb438aaecc9285935804e22e24a85ef38e9d4 Binary files /dev/null and b/model/transformer/__pycache__/position_encoding.cpython-37.pyc differ diff --git a/model/transformer/__pycache__/transformer.cpython-37.pyc b/model/transformer/__pycache__/transformer.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb3498e405b0429759bc61a8dc23b2ecd7d7b13c Binary files /dev/null and b/model/transformer/__pycache__/transformer.cpython-37.pyc differ diff --git a/model/transformer/__pycache__/utils.cpython-37.pyc b/model/transformer/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..aff02afd5a033a716de5a956fac03810ec6fb80d Binary files /dev/null and b/model/transformer/__pycache__/utils.cpython-37.pyc differ diff --git a/model/transformer/position_encoding.py b/model/transformer/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..07b8ad68c878ab5a8c0e4a4de95e0bdcdc68fcfe --- /dev/null +++ b/model/transformer/position_encoding.py @@ -0,0 +1,28 @@ +########################################################################## +# We adopt the positional encoding method from PyTorch Turorial. 
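+# PE(pos, 2i) = sin(pos / 10000^(2i / d_model)), PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))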
+# Source: https://pytorch.org/tutorials/beginner/transformer_tutorial.html +########################################################################## +import math + +import torch +import torch.nn as nn + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model, dropout=0.1, max_len=5000): + super(PositionalEncoding, self).__init__() + + self.dropout = nn.Dropout(p=dropout) + + pe = torch.zeros(max_len, d_model) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0).transpose(0, 1) + self.register_buffer('pe', pe) + + def forward(self, x, padding=0): + x = x + self.pe[padding:padding + x.shape[0], :] + return self.dropout(x) diff --git a/model/transformer/transformer.py b/model/transformer/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..3c3faa13db57d78fa13a964b89582b4b02728d05 --- /dev/null +++ b/model/transformer/transformer.py @@ -0,0 +1,334 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DotProductAttention(nn.Module): + + def __init__(self, dropout=0.0): + super(DotProductAttention, self).__init__() + + self.dropout = dropout + + def forward(self, q, k, v, attn_mask=None): + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + + if attn_mask is not None: + attn_output_weights += attn_mask + + attn_output_weights = F.softmax(attn_output_weights, dim=-1) + attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training) + attn_output = torch.bmm(attn_output_weights, v) + return attn_output + + +class MultiheadAttention(nn.Module): + + def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, kdim=None, vdim=None): + super(MultiheadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + if self._qkv_same_embed_dim: + self.in_proj_weight = nn.Parameter(torch.empty(3 * embed_dim, embed_dim)) + else: + raise RuntimeError('Do not support q, k, v have different dimensions') + + if bias: + self.in_proj_bias = nn.Parameter(torch.empty(3 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) + + self.out_proj = nn.Linear(embed_dim, embed_dim) + + if self._qkv_same_embed_dim: + nn.init.xavier_uniform_(self.in_proj_weight) + + if self.in_proj_bias is not None: + nn.init.constant_(self.in_proj_bias, 0.) + nn.init.constant_(self.out_proj.bias, 0.) 
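+ # shared scaled dot-product attention core; the query is pre-scaled by head_dim ** -0.5 in forward()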
+ + self.dotproductattention = DotProductAttention(dropout) + + def forward(self, q, k, v, attn_mask=None, key_padding_mask=None): + tsz, bsz, embed_dim = q.shape[0], q.shape[1], q.shape[2] + + head_dim = embed_dim // self.num_heads + assert head_dim * self.num_heads == embed_dim, \ + 'embed_dim must be divisible by num_heads' + scaling = float(head_dim)**-0.5 + + _b = self.in_proj_bias + _start = None + _end = embed_dim + _w = self.in_proj_weight[:_end, :] + if _b is not None: + _b = _b[:_end] + q = F.linear(q, _w, _b) + + _b = self.in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = self.in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = F.linear(k, _w, _b) + + _b = self.in_proj_bias + _start = embed_dim * 2 + _end = None + _w = self.in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = F.linear(v, _w, _b) + + q = q * scaling + + q = q.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + k = k.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + v = v.contiguous().view(-1, bsz * self.num_heads, head_dim).transpose(0, 1) + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0).repeat(bsz, 1, 1) + attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1) + attn_mask = attn_mask.reshape(-1, *attn_mask.shape[2:]) + + if key_padding_mask is not None: + key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, tsz, 1) + key_padding_mask = key_padding_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1) + key_padding_mask = key_padding_mask.reshape(-1, *key_padding_mask.shape[2:]) + + if attn_mask is not None and key_padding_mask is not None: + mask = attn_mask + key_padding_mask + elif attn_mask is not None: + mask = attn_mask + elif key_padding_mask is not None: + mask = key_padding_mask + else: + mask = None + + attn_output = self.dotproductattention(q, k, v, mask) + attn_output = attn_output.transpose(0, 1).contiguous().view(tsz, bsz, self.embed_dim) + return self.out_proj(attn_output), None + + +class Transformer(nn.Module): + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation='relu', + custom_encoder=None, + custom_decoder=None): + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, + activation) + encoder_norm = nn.LayerNorm(d_model) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, + activation) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) + + self.d_model = d_model + self.nhead = nhead + + def forward(self, + src, + tgt, + src_mask=None, + tgt_mask=None, + memory_mask=None, + src_key_padding_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + if src.size(1) != tgt.size(1): + raise RuntimeError('the batch number of src and tgt must be equal') + + if src.size(2) != self.d_model or tgt.size(2) != self.d_model: + raise RuntimeError('the feature number of src and tgt must be equal to d_model') + + memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask) + output = self.decoder(tgt, + memory, + tgt_mask=tgt_mask, + 
memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + return output + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + output = src + + for mod in self.layers: + output = mod(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + output = tgt + + for mod in self.layers: + output = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'): + super(TransformerEncoderLayer, self).__init__() + + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerEncoderLayer, self).__setstate__(state) + + def forward(self, src, src_mask=None, src_key_padding_mask=None): + src2 = self.self_attn(src, + src, + src, + attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'): + super(TransformerDecoderLayer, self).__init__() + + self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout) + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = F.relu + super(TransformerDecoderLayer, self).__setstate__(state) + + def forward(self, + tgt, + memory, + 
tgt_mask=None, + memory_mask=None, + tgt_key_padding_mask=None, + memory_key_padding_mask=None): + tgt2 = self.self_attn(tgt, + tgt, + tgt, + attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(tgt, + memory, + memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + if activation == 'relu': + return F.relu + elif activation == 'gelu': + return F.gelu + + raise RuntimeError('activation should be relu/gelu, not {}'.format(activation)) diff --git a/model/transformer/utils.py b/model/transformer/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..17a5463e62ba77f80f5a89e8a338f2658187f3f8 --- /dev/null +++ b/model/transformer/utils.py @@ -0,0 +1,22 @@ +import torch +assert torch.__version__ >= '1.6.0' +import torch.nn as nn +import numpy as np + + +def layer_norm(d_model, condition=True): + return nn.LayerNorm(d_model) if condition else None + + +def generate_square_subsequent_mask(sz): + mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + return mask + + +def generate_proposal_mask(T, B): + mask = torch.zeros(T, (T + 1) * T // 2) + for sz, idx in zip(range(1, T + 1), np.cumsum(range(T))): + mask[:sz, idx: idx + sz] = torch.fliplr(torch.tril(torch.ones(sz, sz))) + mask = mask.unsqueeze(1).repeat(1, B, 1) + return mask diff --git a/model/visualEncoder.py b/model/visualEncoder.py new file mode 100755 index 0000000000000000000000000000000000000000..4b66c06b6b8b3b192fb9d86435dd178fc7150e18 --- /dev/null +++ b/model/visualEncoder.py @@ -0,0 +1,199 @@ +## +# ResNet18 Pretrained network to extract lip embedding +# This code is modified based on https://github.com/lordmartian/deep_avsr +## + +import torch +import torch.nn as nn +import torch.nn.functional as F +from model.attentionLayer import attentionLayer + + +class ResNetLayer(nn.Module): + """ + A ResNet layer used to build the ResNet network. 
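+ Each layer stacks two residual blocks; the first block downsamples the input with a strided 1x1 convolution when stride != 1.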
+ Architecture: + --> conv-bn-relu -> conv -> + -> bn-relu -> conv-bn-relu -> conv -> + -> bn-relu --> + | | | | + -----> downsample ------> -------------------------------------> + """ + + def __init__(self, inplanes, outplanes, stride): + super(ResNetLayer, self).__init__() + self.conv1a = nn.Conv2d(inplanes, + outplanes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn1a = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2a = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.stride = stride + if self.stride != 1: + self.downsample = nn.Conv2d(inplanes, + outplanes, + kernel_size=(1, 1), + stride=stride, + bias=False) + self.outbna = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + + self.conv1b = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.bn1b = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + self.conv2b = nn.Conv2d(outplanes, + outplanes, + kernel_size=3, + stride=1, + padding=1, + bias=False) + self.outbnb = nn.BatchNorm2d(outplanes, momentum=0.01, eps=0.001) + return + + def forward(self, inputBatch): + batch = F.relu(self.bn1a(self.conv1a(inputBatch))) + batch = self.conv2a(batch) + if self.stride == 1: + residualBatch = inputBatch + else: + residualBatch = self.downsample(inputBatch) + batch = batch + residualBatch + intermediateBatch = batch + batch = F.relu(self.outbna(batch)) + + batch = F.relu(self.bn1b(self.conv1b(batch))) + batch = self.conv2b(batch) + residualBatch = intermediateBatch + batch = batch + residualBatch + outputBatch = F.relu(self.outbnb(batch)) + return outputBatch + + +class ResNet(nn.Module): + """ + An 18-layer ResNet architecture. + """ + + def __init__(self): + super(ResNet, self).__init__() + self.layer1 = ResNetLayer(64, 64, stride=1) + self.layer2 = ResNetLayer(64, 128, stride=2) + self.layer3 = ResNetLayer(128, 256, stride=2) + self.layer4 = ResNetLayer(256, 512, stride=2) + self.avgpool = nn.AvgPool2d(kernel_size=(4, 4), stride=(1, 1)) + + return + + def forward(self, inputBatch): + batch = self.layer1(inputBatch) + batch = self.layer2(batch) + batch = self.layer3(batch) + batch = self.layer4(batch) + outputBatch = self.avgpool(batch) + return outputBatch + + +class GlobalLayerNorm(nn.Module): + + def __init__(self, channel_size): + super(GlobalLayerNorm, self).__init__() + self.gamma = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.beta = nn.Parameter(torch.Tensor(1, channel_size, 1)) # [1, N, 1] + self.reset_parameters() + + def reset_parameters(self): + self.gamma.data.fill_(1) + self.beta.data.zero_() + + def forward(self, y): + mean = y.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) #[M, 1, 1] + var = (torch.pow(y - mean, 2)).mean(dim=1, keepdim=True).mean(dim=2, keepdim=True) + gLN_y = self.gamma * (y - mean) / torch.pow(var + 1e-8, 0.5) + self.beta + return gLN_y + + +class visualFrontend(nn.Module): + """ + A visual feature extraction module. Generates a 512-dim feature vector per video frame. + Architecture: A 3D convolution block followed by an 18-layer ResNet. 
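+ The 3D convolution aggregates short-term lip motion over a 5-frame window; the ResNet then encodes each frame into a 512-dim vector.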
+ """ + + def __init__(self, cfg): + self.cfg = cfg + super(visualFrontend, self).__init__() + self.frontend3D = nn.Sequential( + nn.Conv3d(1, 64, kernel_size=(5, 7, 7), stride=(1, 2, 2), padding=(2, 3, 3), + bias=False), nn.BatchNorm3d(64, momentum=0.01, eps=0.001), nn.ReLU(), + nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1))) + self.resnet = ResNet() + return + + def forward(self, inputBatch): + inputBatch = inputBatch.transpose(0, 1).transpose(1, 2) + batchsize = inputBatch.shape[0] + batch = self.frontend3D(inputBatch) + + batch = batch.transpose(1, 2) + batch = batch.reshape(batch.shape[0] * batch.shape[1], batch.shape[2], batch.shape[3], + batch.shape[4]) + outputBatch = self.resnet(batch) + outputBatch = outputBatch.reshape(batchsize, -1, 512) + outputBatch = outputBatch.transpose(1, 2) + outputBatch = outputBatch.transpose(1, 2).transpose(0, 1) + return outputBatch + + +class DSConv1d(nn.Module): + + def __init__(self): + super(DSConv1d, self).__init__() + self.net = nn.Sequential( + nn.ReLU(), + nn.BatchNorm1d(512), + nn.Conv1d(512, 512, 3, stride=1, padding=1, dilation=1, groups=512, bias=False), + nn.PReLU(), + GlobalLayerNorm(512), + nn.Conv1d(512, 512, 1, bias=False), + ) + + def forward(self, x): + out = self.net(x) + return out + x + + +class visualTCN(nn.Module): + + def __init__(self): + super(visualTCN, self).__init__() + stacks = [] + for x in range(5): + stacks += [DSConv1d()] + self.net = nn.Sequential(*stacks) # Visual Temporal Network V-TCN + + def forward(self, x): + out = self.net(x) + return out + + +class visualConv1D(nn.Module): + + def __init__(self): + super(visualConv1D, self).__init__() + self.net = nn.Sequential( + nn.Conv1d(512, 256, 5, stride=1, padding=2), + nn.BatchNorm1d(256), + nn.ReLU(), + nn.Conv1d(256, 128, 1), + ) + + def forward(self, x): + out = self.net(x) + return out diff --git a/scripts/.ipynb_checkpoints/test-checkpoint.sh b/scripts/.ipynb_checkpoints/test-checkpoint.sh new file mode 100755 index 0000000000000000000000000000000000000000..764356c2b36ca9e8cac15fca010498a0aab16382 --- /dev/null +++ b/scripts/.ipynb_checkpoints/test-checkpoint.sh @@ -0,0 +1,40 @@ +# #expid: 1.a +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0,1,2]" \ +# TEST.RESUME 1.a-v1e3_n1e2_012 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 85 + +# #expid: 1.b +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# TEST.RESUME 1.b-v1e3_n1e2_0 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 63 + + +# #expid: 1.c +# python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ +# TEST.RESUME 1.c-v1e3_n1e2_1 \ +# TEST.DATASET "unseen" \ +# TEST.MODEL "seen" \ +# TEST.EPOCH 65 + +#expid: 1.d +python -W ignore::UserWarning tools/test.py --cfg configs/selfattention_noise.yaml \ + MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ + MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ + MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[2]" \ + TEST.RESUME 1.d-v1e3_n1e2_2 \ + TEST.DATASET "unseen" \ + TEST.MODEL "seen" \ + TEST.EPOCH 41 \ No 
newline at end of file diff --git a/scripts/.ipynb_checkpoints/train-checkpoint.sh b/scripts/.ipynb_checkpoints/train-checkpoint.sh new file mode 100755 index 0000000000000000000000000000000000000000..40e10d5e5272974b47f792b813c617b46e96fea1 --- /dev/null +++ b/scripts/.ipynb_checkpoints/train-checkpoint.sh @@ -0,0 +1,117 @@ +# #expid: 0.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.a + +# #expid: 0.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.b + +# #expid: 0.c # change from utils deterministic to original method in coattention +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.c + +# #expid: 0.d # use deterministic 123 as coattention +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention.yaml \ +# SESSION 0.d + + +# 1 compare adding noise at different layers +# #expid: 1.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0,1,2]" \ +# SESSION 1.a-v1e3_n1e2_012 + +# #expid: 1.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 1.b-v1e3_n1e2_0 + +# #expid: 1.c +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ +# SESSION 1.c-v1e3_n1e2_1 + +# #expid: 1.d +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[2]" \ +# SESSION 1.d-v1e3_n1e2_2 + + +# #expid: 2.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# TRAIN.SAMPLE 1 \ +# SESSION 2.a-v1e3_n1e2_0_0.5 + +# #expid: 3.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.a-v1e2_n1e2_0 + +# #expid: 3.b +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-4 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.b-v1e4_n1e2_0 + +# #expid: 3.c +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-5 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 3.c-v1e5_n1e2_0 + + +# # try different noise type +# #expid: 4.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# MODEL.SELFATTENTION.NOISE_TYPE "blurry" \ +# SESSION 4.a_v1e3_n1e2_0_blurry + + +# #expid: 4.b # to be run +# python 
-W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# MODEL.SELFATTENTION.NOISE_TYPE "adaptive" \ +# SESSION 4.b_v1e3_n1e2_0_adaptive + + +# #expid: 5.a +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 5.a-v1e3_n1e3_0 + +# #expid: 5.b # zekrom +# python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ +# MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-3 \ +# MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-4 \ +# MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[0]" \ +# SESSION 5.b-v1e3_n1e4_0 + +#expid: 6.a +python -W ignore::UserWarning tools/train.py --cfg configs/selfattention_noise.yaml \ + MODEL.SELFATTENTION.VERB_BASE_NOISE 1e-4 \ + MODEL.SELFATTENTION.NOUN_BASE_NOISE 1e-2 \ + MODEL.SELFATTENTION.ADD_NOISE_LAYERS "[1]" \ + SESSION 6.a-v1e4_n1e2_1 + \ No newline at end of file diff --git a/scripts/.ipynb_checkpoints/train_i3d_epic.sh b/scripts/.ipynb_checkpoints/train_i3d_epic.sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scripts/.ipynb_checkpoints/try-checkpoint.ipynb b/scripts/.ipynb_checkpoints/try-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2450729121f75e804d0ae2efdde5ffa7f05b88be --- /dev/null +++ b/scripts/.ipynb_checkpoints/try-checkpoint.ipynb @@ -0,0 +1,154 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import math\n", + "import numbers\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "\n", + "class GaussianSmoothing(nn.Module):\n", + " \"\"\"\n", + " Apply gaussian smoothing on a\n", + " 1d, 2d or 3d tensor. Filtering is performed seperately for each channel\n", + " in the input using a depthwise convolution.\n", + " Arguments:\n", + " channels (int, sequence): Number of channels of the input tensors. 
Output will\n", + " have this number of channels as well.\n", + " kernel_size (int, sequence): Size of the gaussian kernel.\n", + " sigma (float, sequence): Standard deviation of the gaussian kernel.\n", + " dim (int, optional): The number of dimensions of the data.\n", + " Default value is 2 (spatial).\n", + " \"\"\"\n", + " def __init__(self, channels, kernel_size, sigma, dim=2):\n", + " super(GaussianSmoothing, self).__init__()\n", + " if isinstance(kernel_size, numbers.Number):\n", + " kernel_size = [kernel_size] * dim\n", + " if isinstance(sigma, numbers.Number):\n", + " sigma = [sigma] * dim\n", + "\n", + " # The gaussian kernel is the product of the\n", + " # gaussian function of each dimension.\n", + " kernel = 1\n", + " meshgrids = torch.meshgrid(\n", + " [\n", + " torch.arange(size, dtype=torch.float32)\n", + " for size in kernel_size\n", + " ]\n", + " )\n", + " for size, std, mgrid in zip(kernel_size, sigma, meshgrids):\n", + " mean = (size - 1) / 2\n", + " kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \\\n", + " torch.exp(-((mgrid - mean) / std) ** 2 / 2)\n", + "\n", + " # Make sure sum of values in gaussian kernel equals 1.\n", + " kernel = kernel / torch.sum(kernel)\n", + "\n", + " # Reshape to depthwise convolutional weight\n", + " kernel = kernel.view(1, 1, *kernel.size())\n", + " kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))\n", + "\n", + " self.register_buffer('weight', kernel)\n", + " self.groups = channels\n", + "\n", + " if dim == 1:\n", + " self.conv = F.conv1d\n", + " elif dim == 2:\n", + " self.conv = F.conv2d\n", + " elif dim == 3:\n", + " self.conv = F.conv3d\n", + " else:\n", + " raise RuntimeError(\n", + " 'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(dim)\n", + " )\n", + "\n", + " def forward(self, input):\n", + " \"\"\"\n", + " Apply gaussian filter to input.\n", + " Arguments:\n", + " input (torch.Tensor): Input to apply gaussian filter on.\n", + " Returns:\n", + " filtered (torch.Tensor): Filtered output.\n", + " \"\"\"\n", + " return self.conv(input, weight=self.weight, groups=self.groups)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "smoothing = GaussianSmoothing(1024, 5, 1, dim=1)\n", + "input = torch.rand(4, 16, 100, 64)\n", + "b = input.shape[0]\n", + "numhead = input.shape[1]\n", + "t = input.shape[2]\n", + "c = input.shape[3]\n", + "input = input.permute(0,1,3,2)\n", + "input = input.reshape(b, numhead*c, t)\n", + "input = F.pad(input, (2, 2), mode='reflect')\n", + "output = smoothing(input)\n", + "output = output.reshape(b, numhead, c, t)\n", + "output = output.permute(0,1,3,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([4, 16, 100, 64])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input\n", + "output.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input = torch.rand(4, 16, 100, 64)\n", + "attention = torch.normal(0,1 size = input.shape)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + 
}, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/scripts/get_incorrect_samples.py b/scripts/get_incorrect_samples.py new file mode 100755 index 0000000000000000000000000000000000000000..cfe943f3b8fb06724c5693a6515ff75e8b4ad506 --- /dev/null +++ b/scripts/get_incorrect_samples.py @@ -0,0 +1,88 @@ +r"""Compute active speaker detection performance for the AVA dataset. +Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. + """ + parser = argparse.ArgumentParser() + parser.add_argument("-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument("-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument("-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def run_evaluation(groundtruth, predictions): + prediction = pd.read_csv(predictions) + groundtruth = pd.read_csv(groundtruth) + wrong_list = [] + num = 0 + audible_num = 0 + total = 0 + for i, row in prediction.iterrows(): + entity_id = row['entity_id'] + ts = row['frame_timestamp'] + if row['score'] < 0.5: + label = "NOT_SPEAKING" + else: + label = "SPEAKING_AUDIBLE" + + true_label = groundtruth.loc[(groundtruth['entity_id'] == entity_id) & + (groundtruth['frame_timestamp'] == ts)].iloc[0]["label"] + if true_label != label: + wrong_list.append([entity_id, ts, true_label, label]) + + if label == "SPEAKING_AUDIBLE": + num += 1 + if true_label == "SPEAKING_AUDIBLE": + audible_num += 1 + total += 1 + print(num, audible_num, total) + + df = pd.DataFrame(wrong_list, columns=['entity_id', 'frame_timestamp', "gt", "prediction"]) + df = df.sort_values(by=["frame_timestamp"]) + df.to_csv("wrong_list.csv") + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + + +if __name__ == "__main__": + main() diff --git a/test_multicard.py b/test_multicard.py new file mode 100755 index 0000000000000000000000000000000000000000..915594ac586b6b0f468f3aa65c4803b74cf96d56 --- /dev/null +++ b/test_multicard.py @@ -0,0 +1,99 @@ +import time, os, torch, argparse, warnings, glob, pandas, json + +from utils.tools import * +from dlhammer import bootstrap + +from dataLoader_multiperson import val_loader +from loconet import loconet + + +class DataPrep(): + + def __init__(self, cfg): + self.cfg = cfg + + def val_dataloader(self): + cfg = self.cfg + loader = val_loader(cfg, trialFileName = cfg.evalTrialAVA, \ + audioPath = os.path.join(cfg.audioPathAVA , cfg.evalDataType), \ + visualPath = os.path.join(cfg.visualPathAVA, cfg.evalDataType), \ + num_speakers=cfg.MODEL.NUM_SPEAKERS, + ) + valLoader 
= torch.utils.data.DataLoader(loader, + batch_size=cfg.VAL.BATCH_SIZE, + shuffle=False, + num_workers=16) + return valLoader + + +def prepare_context_files(cfg): + path = os.path.join(cfg.DATA.dataPathAVA, "csv") + for phase in ["val", "test"]: + csv_f = f"{phase}_loader.csv" + csv_orig = f"{phase}_orig.csv" + entity_f = os.path.join(path, phase + "_entity.json") + ts_f = os.path.join(path, phase + "_ts.json") + if os.path.exists(entity_f) and os.path.exists(ts_f): + continue + orig_df = pandas.read_csv(os.path.join(path, csv_orig)) + entity_data = {} + ts_to_entity = {} + + for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + + with open(entity_f) as f: + json.dump(entity_data, f) + + with open(ts_f) as f: + json.dump(ts_to_entity, f) + + +def main(): + cfg = bootstrap(print_cfg=False) + print(cfg) + epoch = cfg.RESUME_EPOCH + + warnings.filterwarnings("ignore") + + cfg = init_args(cfg) + + data = DataPrep(cfg) + + prepare_context_files(cfg) + + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + s = loconet(cfg) + + s.loadParameters(cfg.RESUME_PATH) + mAP = s.evaluate_network(epoch=epoch, loader=data.val_dataloader()) + print(f"evaluate ckpt: {cfg.RESUME_PATH}") + print(mAP) + + +if __name__ == '__main__': + main() diff --git a/torchvggish/__pycache__/mel_features.cpython-37.pyc b/torchvggish/__pycache__/mel_features.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20371fe601d853220baa1f2457b4b722ffdb09ff Binary files /dev/null and b/torchvggish/__pycache__/mel_features.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish.cpython-37.pyc b/torchvggish/__pycache__/vggish.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b6c6ffdbf569e1ed2323b0bf5b8f3dc2c2670e9 Binary files /dev/null and b/torchvggish/__pycache__/vggish.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish_input.cpython-37.pyc b/torchvggish/__pycache__/vggish_input.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..534c983ab69de151597a6f245c0e6003ad6a4cbe Binary files /dev/null and b/torchvggish/__pycache__/vggish_input.cpython-37.pyc differ diff --git a/torchvggish/__pycache__/vggish_params.cpython-37.pyc b/torchvggish/__pycache__/vggish_params.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe9d8921017e615b1b823c2b15c751c4d8d2423f Binary files /dev/null and b/torchvggish/__pycache__/vggish_params.cpython-37.pyc differ diff --git a/torchvggish/mel_features.py b/torchvggish/mel_features.py new file mode 100644 index 0000000000000000000000000000000000000000..ac58fb5427f772fcced9cbd3cec3373ffbe5908c --- /dev/null +++ b/torchvggish/mel_features.py @@ -0,0 +1,223 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Defines routines to compute mel spectrogram features from audio waveform.""" + +import numpy as np + + +def frame(data, window_length, hop_length): + """Convert array into a sequence of successive possibly overlapping frames. + + An n-dimensional array of shape (num_samples, ...) is converted into an + (n+1)-D array of shape (num_frames, window_length, ...), where each frame + starts hop_length points after the preceding one. + + This is accomplished using stride_tricks, so the original data is not + copied. However, there is no zero-padding, so any incomplete frames at the + end are not included. + + Args: + data: np.array of dimension N >= 1. + window_length: Number of samples in each frame. + hop_length: Advance (in samples) between each window. + + Returns: + (N+1)-D np.array with as many rows as there are complete frames that can be + extracted. + """ + num_samples = data.shape[0] + num_frames = 1 + int(np.floor((num_samples - window_length) / hop_length)) + shape = (num_frames, window_length) + data.shape[1:] + strides = (data.strides[0] * hop_length,) + data.strides + return np.lib.stride_tricks.as_strided(data, shape=shape, strides=strides) + + +def periodic_hann(window_length): + """Calculate a "periodic" Hann window. + + The classic Hann window is defined as a raised cosine that starts and + ends on zero, and where every value appears twice, except the middle + point for an odd-length window. Matlab calls this a "symmetric" window + and np.hanning() returns it. However, for Fourier analysis, this + actually represents just over one cycle of a period N-1 cosine, and + thus is not compactly expressed on a length-N Fourier basis. Instead, + it's better to use a raised cosine that ends just before the final + zero value - i.e. a complete cycle of a period-N cosine. Matlab + calls this a "periodic" window. This routine calculates it. + + Args: + window_length: The number of points in the returned window. + + Returns: + A 1D np.array containing the periodic hann window. + """ + return 0.5 - (0.5 * np.cos(2 * np.pi / window_length * + np.arange(window_length))) + + +def stft_magnitude(signal, fft_length, + hop_length=None, + window_length=None): + """Calculate the short-time Fourier transform magnitude. + + Args: + signal: 1D np.array of the input time-domain signal. + fft_length: Size of the FFT to apply. + hop_length: Advance (in samples) between each frame passed to FFT. + window_length: Length of each block of samples to pass to FFT. + + Returns: + 2D np.array where each row contains the magnitudes of the fft_length/2+1 + unique values of the FFT for the corresponding frame of input samples. + """ + frames = frame(signal, window_length, hop_length) + # Apply frame window to each frame. We use a periodic Hann (cosine of period + # window_length) instead of the symmetric Hann of np.hanning (period + # window_length-1). 
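+ # periodic Hann window: w[n] = 0.5 - 0.5 * cos(2 * pi * n / N), n = 0 .. N-1 (period N rather than N-1)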
+ window = periodic_hann(window_length) + windowed_frames = frames * window + return np.abs(np.fft.rfft(windowed_frames, int(fft_length))) + + +# Mel spectrum constants and functions. +_MEL_BREAK_FREQUENCY_HERTZ = 700.0 +_MEL_HIGH_FREQUENCY_Q = 1127.0 + + +def hertz_to_mel(frequencies_hertz): + """Convert frequencies to mel scale using HTK formula. + + Args: + frequencies_hertz: Scalar or np.array of frequencies in hertz. + + Returns: + Object of same size as frequencies_hertz containing corresponding values + on the mel scale. + """ + return _MEL_HIGH_FREQUENCY_Q * np.log( + 1.0 + (frequencies_hertz / _MEL_BREAK_FREQUENCY_HERTZ)) + + +def spectrogram_to_mel_matrix(num_mel_bins=20, + num_spectrogram_bins=129, + audio_sample_rate=8000, + lower_edge_hertz=125.0, + upper_edge_hertz=3800.0): + """Return a matrix that can post-multiply spectrogram rows to make mel. + + Returns a np.array matrix A that can be used to post-multiply a matrix S of + spectrogram values (STFT magnitudes) arranged as frames x bins to generate a + "mel spectrogram" M of frames x num_mel_bins. M = S A. + + The classic HTK algorithm exploits the complementarity of adjacent mel bands + to multiply each FFT bin by only one mel weight, then add it, with positive + and negative signs, to the two adjacent mel bands to which that bin + contributes. Here, by expressing this operation as a matrix multiply, we go + from num_fft multiplies per frame (plus around 2*num_fft adds) to around + num_fft^2 multiplies and adds. However, because these are all presumably + accomplished in a single call to np.dot(), it's not clear which approach is + faster in Python. The matrix multiplication has the attraction of being more + general and flexible, and much easier to read. + + Args: + num_mel_bins: How many bands in the resulting mel spectrum. This is + the number of columns in the output matrix. + num_spectrogram_bins: How many bins there are in the source spectrogram + data, which is understood to be fft_size/2 + 1, i.e. the spectrogram + only contains the nonredundant FFT bins. + audio_sample_rate: Samples per second of the audio at the input to the + spectrogram. We need this to figure out the actual frequencies for + each spectrogram bin, which dictates how they are mapped into mel. + lower_edge_hertz: Lower bound on the frequencies to be included in the mel + spectrum. This corresponds to the lower edge of the lowest triangular + band. + upper_edge_hertz: The desired top edge of the highest frequency band. + + Returns: + An np.array with shape (num_spectrogram_bins, num_mel_bins). + + Raises: + ValueError: if frequency edges are incorrectly ordered or out of range. + """ + nyquist_hertz = audio_sample_rate / 2. + if lower_edge_hertz < 0.0: + raise ValueError("lower_edge_hertz %.1f must be >= 0" % lower_edge_hertz) + if lower_edge_hertz >= upper_edge_hertz: + raise ValueError("lower_edge_hertz %.1f >= upper_edge_hertz %.1f" % + (lower_edge_hertz, upper_edge_hertz)) + if upper_edge_hertz > nyquist_hertz: + raise ValueError("upper_edge_hertz %.1f is greater than Nyquist %.1f" % + (upper_edge_hertz, nyquist_hertz)) + spectrogram_bins_hertz = np.linspace(0.0, nyquist_hertz, num_spectrogram_bins) + spectrogram_bins_mel = hertz_to_mel(spectrogram_bins_hertz) + # The i'th mel band (starting from i=1) has center frequency + # band_edges_mel[i], lower edge band_edges_mel[i-1], and higher edge + # band_edges_mel[i+1]. Thus, we need num_mel_bins + 2 values in + # the band_edges_mel arrays. 
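+ # HTK mel scale: mel(f) = 1127.0 * ln(1 + f / 700.0); band edges are spaced uniformly on the mel axis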
+ band_edges_mel = np.linspace(hertz_to_mel(lower_edge_hertz), + hertz_to_mel(upper_edge_hertz), num_mel_bins + 2) + # Matrix to post-multiply feature arrays whose rows are num_spectrogram_bins + # of spectrogram values. + mel_weights_matrix = np.empty((num_spectrogram_bins, num_mel_bins)) + for i in range(num_mel_bins): + lower_edge_mel, center_mel, upper_edge_mel = band_edges_mel[i:i + 3] + # Calculate lower and upper slopes for every spectrogram bin. + # Line segments are linear in the *mel* domain, not hertz. + lower_slope = ((spectrogram_bins_mel - lower_edge_mel) / + (center_mel - lower_edge_mel)) + upper_slope = ((upper_edge_mel - spectrogram_bins_mel) / + (upper_edge_mel - center_mel)) + # .. then intersect them with each other and zero. + mel_weights_matrix[:, i] = np.maximum(0.0, np.minimum(lower_slope, + upper_slope)) + # HTK excludes the spectrogram DC bin; make sure it always gets a zero + # coefficient. + mel_weights_matrix[0, :] = 0.0 + return mel_weights_matrix + + +def log_mel_spectrogram(data, + audio_sample_rate=8000, + log_offset=0.0, + window_length_secs=0.025, + hop_length_secs=0.010, + **kwargs): + """Convert waveform to a log magnitude mel-frequency spectrogram. + + Args: + data: 1D np.array of waveform data. + audio_sample_rate: The sampling rate of data. + log_offset: Add this to values when taking log to avoid -Infs. + window_length_secs: Duration of each window to analyze. + hop_length_secs: Advance between successive analysis windows. + **kwargs: Additional arguments to pass to spectrogram_to_mel_matrix. + + Returns: + 2D np.array of (num_frames, num_mel_bins) consisting of log mel filterbank + magnitudes for successive frames. + """ + window_length_samples = int(round(audio_sample_rate * window_length_secs)) + hop_length_samples = int(round(audio_sample_rate * hop_length_secs)) + fft_length = 2 ** int(np.ceil(np.log(window_length_samples) / np.log(2.0))) + spectrogram = stft_magnitude( + data, + fft_length=fft_length, + hop_length=hop_length_samples, + window_length=window_length_samples) + mel_spectrogram = np.dot(spectrogram, spectrogram_to_mel_matrix( + num_spectrogram_bins=spectrogram.shape[1], + audio_sample_rate=audio_sample_rate, **kwargs)) + return np.log(mel_spectrogram + log_offset) diff --git a/torchvggish/vggish.py b/torchvggish/vggish.py new file mode 100644 index 0000000000000000000000000000000000000000..f612fc261d8810665da934cef925bb261b69ee36 --- /dev/null +++ b/torchvggish/vggish.py @@ -0,0 +1,205 @@ +import numpy as np +import torch +import torch.nn as nn +from torch import hub + +from . 
import vggish_input, vggish_params + + +class VGG(nn.Module): + + def __init__(self, features): + super(VGG, self).__init__() + self.features = features + # self.embeddings = nn.Sequential( + # nn.Linear(512 * 4 * 6, 4096), + # nn.ReLU(True), + # nn.Linear(4096, 4096), + # nn.ReLU(True), + # nn.Linear(4096, 128), + # nn.ReLU(True)) + self.deconv = nn.ConvTranspose2d(512, 256, (2, 2), stride=(2, 2)) + self.conv1 = nn.Conv2d(512, 256, 1, stride=1) + self.conv2 = nn.Conv2d(256, 128, 1, stride=1) + # self.pool = nn.AdaptiveAvgPool2d((1, 1)) + + def forward(self, x): + # x = self.features(x) + for i, layer in enumerate(self.features): + x = layer(x) + if i == 9: + output4 = x + elif i == 14: + output8 = x + output8 = self.deconv(output8) + cat48 = torch.cat((output4, output8), 1) + output4 = self.conv1(cat48) + output4 = self.conv2(output4) + # res = self.pool(output4) + + # Transpose the output from features to + # remain compatible with vggish embeddings + # x = torch.transpose(x, 1, 3) + # x = torch.transpose(x, 1, 2) + # x = x.contiguous() + # x = x.view(x.size(0), -1) + + # return self.embeddings(x) + return output4 + + +class Postprocessor(nn.Module): + """Post-processes VGGish embeddings. Returns a torch.Tensor instead of a + numpy array in order to preserve the gradient. + + "The initial release of AudioSet included 128-D VGGish embeddings for each + segment of AudioSet. These released embeddings were produced by applying + a PCA transformation (technically, a whitening transform is included as well) + and 8-bit quantization to the raw embedding output from VGGish, in order to + stay compatible with the YouTube-8M project which provides visual embeddings + in the same format for a large set of YouTube videos. This class implements + the same PCA (with whitening) and quantization transformations." + """ + + def __init__(self): + """Constructs a postprocessor.""" + super(Postprocessor, self).__init__() + # Create empty matrix, for user's state_dict to load + self.pca_eigen_vectors = torch.empty( + ( + vggish_params.EMBEDDING_SIZE, + vggish_params.EMBEDDING_SIZE, + ), + dtype=torch.float, + ) + self.pca_means = torch.empty((vggish_params.EMBEDDING_SIZE, 1), dtype=torch.float) + + self.pca_eigen_vectors = nn.Parameter(self.pca_eigen_vectors, requires_grad=False) + self.pca_means = nn.Parameter(self.pca_means, requires_grad=False) + + def postprocess(self, embeddings_batch): + """Applies tensor postprocessing to a batch of embeddings. + + Args: + embeddings_batch: An tensor of shape [batch_size, embedding_size] + containing output from the embedding layer of VGGish. + + Returns: + A tensor of the same shape as the input, containing the PCA-transformed, + quantized, and clipped version of the input. + """ + assert len( + embeddings_batch.shape) == 2, "Expected 2-d batch, got %r" % (embeddings_batch.shape,) + assert (embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE + ), "Bad batch shape: %r" % (embeddings_batch.shape,) + + # Apply PCA. + # - Embeddings come in as [batch_size, embedding_size]. + # - Transpose to [embedding_size, batch_size]. + # - Subtract pca_means column vector from each column. + # - Premultiply by PCA matrix of shape [output_dims, input_dims] + # where both are are equal to embedding_size in our case. + # - Transpose result back to [batch_size, embedding_size]. 
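+        # Concretely: pca_eigen_vectors is (EMBEDDING_SIZE, EMBEDDING_SIZE) =
+        # (128, 128), pca_means is (128, 1) and embeddings_batch.t() is
+        # (128, batch_size), so the subtraction broadcasts the means over the
+        # batch and the trailing .t() restores the (batch_size, 128) layout.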
+ pca_applied = torch.mm(self.pca_eigen_vectors, (embeddings_batch.t() - self.pca_means)).t() + + # Quantize by: + # - clipping to [min, max] range + clipped_embeddings = torch.clamp(pca_applied, vggish_params.QUANTIZE_MIN_VAL, + vggish_params.QUANTIZE_MAX_VAL) + # - convert to 8-bit in range [0.0, 255.0] + quantized_embeddings = torch.round( + (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) * + (255.0 / (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL))) + return torch.squeeze(quantized_embeddings) + + def forward(self, x): + return self.postprocess(x) + + +def make_layers(): + layers = [] + in_channels = 1 + for v in [64, "M", 128, "M", 256, 256, "M", 512, 512]: + if v == "M": + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + return nn.Sequential(*layers) + + +def _vgg(): + return VGG(make_layers()) + + +# def _spectrogram(): +# config = dict( +# sr=16000, +# n_fft=400, +# n_mels=64, +# hop_length=160, +# window="hann", +# center=False, +# pad_mode="reflect", +# htk=True, +# fmin=125, +# fmax=7500, +# output_format='Magnitude', +# # device=device, +# ) +# return Spectrogram.MelSpectrogram(**config) + + +class VGGish(VGG): + + def __init__(self, + urls, + device=None, + pretrained=True, + preprocess=True, + postprocess=True, + progress=True): + super().__init__(make_layers()) + if pretrained: + state_dict = hub.load_state_dict_from_url(urls['vggish'], progress=progress) + info = super().load_state_dict(state_dict, strict=False) + + if device is None: + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.device = device + self.preprocess = preprocess + self.postprocess = postprocess + if self.postprocess: + self.pproc = Postprocessor() + if pretrained: + state_dict = hub.load_state_dict_from_url(urls['pca'], progress=progress) + # TODO: Convert the state_dict to torch + state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME] = torch.as_tensor( + state_dict[vggish_params.PCA_EIGEN_VECTORS_NAME], dtype=torch.float) + state_dict[vggish_params.PCA_MEANS_NAME] = torch.as_tensor( + state_dict[vggish_params.PCA_MEANS_NAME].reshape(-1, 1), dtype=torch.float) + + self.pproc.load_state_dict(state_dict) + self.to(self.device) + + def forward(self, x, fs=None): + if self.preprocess: + x = self._preprocess(x, fs) + x = x.to(self.device) + x = VGG.forward(self, x) + if self.postprocess: + x = self._postprocess(x) + return x + + def _preprocess(self, x, fs): + if isinstance(x, np.ndarray): + x = vggish_input.waveform_to_examples(x, fs) + elif isinstance(x, str): + x = vggish_input.wavfile_to_examples(x) + else: + raise AttributeError + return x + + def _postprocess(self, x): + return self.pproc(x) diff --git a/torchvggish/vggish_input.py b/torchvggish/vggish_input.py new file mode 100644 index 0000000000000000000000000000000000000000..351be446f588f0e03d12dc454dcad9a05b8eef69 --- /dev/null +++ b/torchvggish/vggish_input.py @@ -0,0 +1,101 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Compute input examples for VGGish from audio waveform.""" + +# Modification: Return torch tensors rather than numpy arrays +import torch + +import numpy as np +import resampy + +from . import mel_features +from . import vggish_params + +import soundfile as sf + + +def waveform_to_examples(data, sample_rate, numFrames, fps, return_tensor=True): + """Converts audio waveform into an array of examples for VGGish. + + Args: + data: np.array of either one dimension (mono) or two dimensions + (multi-channel, with the outer dimension representing channels). + Each sample is generally expected to lie in the range [-1.0, +1.0], + although this is not required. + sample_rate: Sample rate of data. + return_tensor: Return data as a Pytorch tensor ready for VGGish + + Returns: + 3-D np.array of shape [num_examples, num_frames, num_bands] which represents + a sequence of examples, each of which contains a patch of log mel + spectrogram, covering num_frames frames of audio and num_bands mel frequency + bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS. + + """ + # Convert to mono. + if len(data.shape) > 1: + data = np.mean(data, axis=1) + # Resample to the rate assumed by VGGish. + if sample_rate != vggish_params.SAMPLE_RATE: + data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE) + window_length_seconds = vggish_params.STFT_WINDOW_LENGTH_SECONDS * 25. / fps + hop_length_seconds = vggish_params.STFT_HOP_LENGTH_SECONDS * 25. / fps + + # Compute log mel spectrogram features. + log_mel = mel_features.log_mel_spectrogram(data, + audio_sample_rate=vggish_params.SAMPLE_RATE, + log_offset=vggish_params.LOG_OFFSET, + window_length_secs=window_length_seconds, + hop_length_secs=hop_length_seconds, + num_mel_bins=vggish_params.NUM_MEL_BINS, + lower_edge_hertz=vggish_params.MEL_MIN_HZ, + upper_edge_hertz=vggish_params.MEL_MAX_HZ) + + maxAudio = int(numFrames * 4) + if log_mel.shape[0] < maxAudio: + shortage = maxAudio - log_mel.shape[0] + log_mel = np.pad(log_mel, ((0, shortage), (0, 0)), 'wrap') + log_mel = log_mel[:int(round(numFrames * 4)), :] + + # Frame features into examples. + # features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS + # example_window_length = int(round(vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate)) + # example_hop_length = int(round(vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate)) + # log_mel_examples = mel_features.frame(log_mel, + # window_length=example_window_length, + # hop_length=example_hop_length) + + if return_tensor: + log_mel_examples = torch.tensor(log_mel_examples, requires_grad=True)[:, None, :, :].float() + + # return log_mel_examples + return log_mel + + +def wavfile_to_examples(wav_file, return_tensor=True): + """Convenience wrapper around waveform_to_examples() for a common WAV format. + + Args: + wav_file: String path to a file, or a file-like object. The file + is assumed to contain WAV audio data with signed 16-bit PCM samples. 
+ torch: Return data as a Pytorch tensor ready for VGGish + + Returns: + See waveform_to_examples. + """ + wav_data, sr = sf.read(wav_file, dtype='int16') + assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype + samples = wav_data / 32768.0 # Convert to [-1.0, +1.0] + return waveform_to_examples(samples, sr, return_tensor) diff --git a/torchvggish/vggish_params.py b/torchvggish/vggish_params.py new file mode 100644 index 0000000000000000000000000000000000000000..526784bceaa4c9c8b8dc2b8f82e0f3d395d4bec2 --- /dev/null +++ b/torchvggish/vggish_params.py @@ -0,0 +1,53 @@ +# Copyright 2017 The TensorFlow Authors All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Global parameters for the VGGish model. + +See vggish_slim.py for more information. +""" + +# Architectural constants. +NUM_FRAMES = 96 # Frames in input mel-spectrogram patch. +NUM_BANDS = 64 # Frequency bands in input mel-spectrogram patch. +EMBEDDING_SIZE = 128 # Size of embedding layer. + +# Hyperparameters used in feature and example generation. +SAMPLE_RATE = 16000 +STFT_WINDOW_LENGTH_SECONDS = 0.025 +STFT_HOP_LENGTH_SECONDS = 0.010 +NUM_MEL_BINS = NUM_BANDS +MEL_MIN_HZ = 125 +MEL_MAX_HZ = 7500 +LOG_OFFSET = 0.01 # Offset used for stabilized log of input mel-spectrogram. +EXAMPLE_WINDOW_SECONDS = 0.96 # Each example contains 96 10ms frames +EXAMPLE_HOP_SECONDS = 0.96 # with zero overlap. + +# Parameters used for embedding postprocessing. +PCA_EIGEN_VECTORS_NAME = 'pca_eigen_vectors' +PCA_MEANS_NAME = 'pca_means' +QUANTIZE_MIN_VAL = -2.0 +QUANTIZE_MAX_VAL = +2.0 + +# Hyperparameters used in training. +INIT_STDDEV = 0.01 # Standard deviation used to initialize weights. +LEARNING_RATE = 1e-4 # Learning rate for the Adam optimizer. +ADAM_EPSILON = 1e-8 # Epsilon for the Adam optimizer. + +# Names of ops, tensors, and features. 
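+# The graph node names below come from the original TensorFlow VGGish
+# release; the PyTorch port in this folder does not appear to reference them,
+# and they seem to be kept only for reference/compatibility.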
+INPUT_OP_NAME = 'vggish/input_features' +INPUT_TENSOR_NAME = INPUT_OP_NAME + ':0' +OUTPUT_OP_NAME = 'vggish/embedding' +OUTPUT_TENSOR_NAME = OUTPUT_OP_NAME + ':0' +AUDIO_EMBEDDING_FEATURE_NAME = 'audio_embedding' diff --git a/train.py b/train.py new file mode 100755 index 0000000000000000000000000000000000000000..969d9cfb02f63d889f65030159f0edb8ffd424b1 --- /dev/null +++ b/train.py @@ -0,0 +1,197 @@ +import time, os, torch, argparse, warnings, glob, pandas, json + +from utils.tools import * +from dlhammer import bootstrap +os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" +import torch.multiprocessing as mp +import torch.distributed as dist + +from xxlib.utils.distributed import all_gather, all_reduce +from torch import nn +from dataLoader_multiperson import train_loader, val_loader + +from loconet import loconet + + +class MyCollator(object): + + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, data): + audiofeatures = [item[0] for item in data] + visualfeatures = [item[1] for item in data] + labels = [item[2] for item in data] + masks = [item[3] for item in data] + cut_limit = self.cfg.MODEL.CLIP_LENGTH + # pad audio + lengths = torch.tensor([t.shape[1] for t in audiofeatures]) + max_len = max(lengths) + padded_audio = torch.stack([ + torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1) + for i in audiofeatures + ], 0) + + if max_len > cut_limit * 4: + padded_audio = padded_audio[:, :, :cut_limit * 4, ...] + + # pad video + lengths = torch.tensor([t.shape[1] for t in visualfeatures]) + max_len = max(lengths) + padded_video = torch.stack([ + torch.cat( + [i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1) + for i in visualfeatures + ], 0) + padded_labels = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0) + padded_masks = torch.stack( + [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0) + + if max_len > cut_limit: + padded_video = padded_video[:, :, :cut_limit, ...] + padded_labels = padded_labels[:, :, :cut_limit, ...] + padded_masks = padded_masks[:, :, :cut_limit, ...] 
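+        # Resulting shapes (assuming the loader's leading per-item dimension
+        # indexes the candidate speakers, per MODEL.NUM_SPEAKERS):
+        #   padded_audio:  (batch, speakers, T_a, mel bins) with T_a <= 4 * cut_limit
+        #   padded_video:  (batch, speakers, T_v, H, W)     with T_v <= cut_limit
+        #   padded_labels / padded_masks: (batch, speakers, T_v)
+        # Audio keeps four mel frames per video frame (hence the factor 4),
+        # matching the 10 ms hop of the audio front end against 25 fps video.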
+ + return padded_audio, padded_video, padded_labels, padded_masks + + +class DataPrep(): + + def __init__(self, cfg, world_size, rank): + self.cfg = cfg + self.world_size = world_size + self.rank = rank + + def train_dataloader(self): + + loader = train_loader(self.cfg, trialFileName = self.cfg.trainTrialAVA, \ + audioPath = os.path.join(self.cfg.audioPathAVA , 'train'), \ + visualPath = os.path.join(self.cfg.visualPathAVA, 'train'), \ + num_speakers=self.cfg.MODEL.NUM_SPEAKERS, + ) + train_sampler = torch.utils.data.distributed.DistributedSampler( + loader, num_replicas=self.world_size, rank=self.rank) + collator = MyCollator(self.cfg) + trainLoader = torch.utils.data.DataLoader(loader, + batch_size=self.cfg.TRAIN.BATCH_SIZE, + pin_memory=False, + num_workers=self.cfg.NUM_WORKERS, + collate_fn=collator, + sampler=train_sampler) + return trainLoader + + def val_dataloader(self): + loader = val_loader(self.cfg, trialFileName = self.cfg.evalTrialAVA, \ + audioPath = os.path.join(self.cfg + .audioPathAVA , self.cfg + .evalDataType), \ + visualPath = os.path.join(self.cfg + .visualPathAVA, self.cfg + .evalDataType), \ + num_speakers = self.cfg.MODEL.NUM_SPEAKERS + ) + valLoader = torch.utils.data.DataLoader(loader, + batch_size=self.cfg.VAL.BATCH_SIZE, + shuffle=False, + pin_memory=True, + num_workers=16) + + return valLoader + + +def prepare_context_files(cfg): + path = os.path.join(cfg.DATA.dataPathAVA, "csv") + for phase in ["train", "val", "test"]: + csv_f = f"{phase}_loader.csv" + csv_orig = f"{phase}_orig.csv" + entity_f = os.path.join(path, phase + "_entity.json") + ts_f = os.path.join(path, phase + "_ts.json") + if os.path.exists(entity_f) and os.path.exists(ts_f): + continue + orig_df = pandas.read_csv(os.path.join(path, csv_orig)) + entity_data = {} + ts_to_entity = {} + + for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + + with open(entity_f) as f: + json.dump(entity_data, f) + + with open(ts_f) as f: + json.dump(ts_to_entity, f) + + +def main(gpu, world_size): + # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer + cfg = bootstrap(print_cfg=False) + rank = gpu + dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank) + + make_deterministic(seed=int(cfg.SEED)) + torch.cuda.set_device(gpu) + device = torch.device("cuda:{}".format(gpu)) + + warnings.filterwarnings("ignore") + + cfg = init_args(cfg) + + data = DataPrep(cfg, world_size, rank) + + if cfg.downloadAVA == True: + preprocess_AVA(cfg) + quit() + + prepare_context_files(cfg) + + modelfiles = glob.glob('%s/model_0*.model' % cfg.modelSavePath) + modelfiles.sort() + if len(modelfiles) >= 1: + print("Model %s loaded from previous state!" 
% modelfiles[-1]) + epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1 + s = loconet(cfg, rank, device) + s.loadParameters(modelfiles[-1]) + else: + epoch = 1 + s = loconet(cfg, rank, device) + + while (1): + loss, lr = s.train_network(epoch=epoch, loader=data.train_dataloader()) + + s.saveParameters(cfg.modelSavePath + "/model_%04d.model" % epoch) + + if epoch >= cfg.TRAIN.MAX_EPOCH: + quit() + + epoch += 1 + + +if __name__ == '__main__': + + cfg = bootstrap() + world_size = cfg.NUM_GPUS # + os.environ['MASTER_ADDR'] = '127.0.0.1' # + os.environ['MASTER_PORT'] = str(random.randint(4000, 8888)) # + mp.spawn(main, nprocs=cfg.NUM_GPUS, args=(world_size,)) diff --git a/utils/.DS_Store b/utils/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..926c61bce65c727c84bd22125b080e66142dcbdc Binary files /dev/null and b/utils/.DS_Store differ diff --git a/utils/.ipynb_checkpoints/utils-checkpoint.py b/utils/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..24503319d88ebaa2d8eac5460a5549801ff11ce6 --- /dev/null +++ b/utils/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,135 @@ +import time +import os +import sys +import json +import random +import numpy as np +import torch + +def setup_device(gpu_id): + #set up GPUS + os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" + if int(gpu_id)==-2 and os.getenv('CUDA_VISIBLE_DEVICES') is not None: + gpu_id = os.getenv('CUDA_VISIBLE_DEVICES') + elif int(gpu_id) >= 0: + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + print("set CUDA_VISIBLE_DEVICES=",gpu_id) + else: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print("using device %s"%device) + return device + +def setup_seed(seed): + if seed < 0: + if os.getenv('SATOSHI_SEED') is not None and seed == -2: + seed = int(os.getenv('SATOSHI_SEED')) + print("env seed used") + else: + import math + seed = int(10**4*math.modf(time.time())[0]) + seed = seed + print("random seed",seed) + return seed + +def setup_savedir(prefix="",basedir="./experiments",args=None,append_args=[]): + savedir = prefix + if len(append_args) > 0 and args is not None: + for arg_opt in append_args: + arg_value = getattr(args, arg_opt) + savedir +="_"+arg_opt+"-"+str(arg_value) + else: + savedir += "exp" + + savedir = savedir.replace(" ","").replace("'","").replace('"','') + savedir = os.path.join(basedir,savedir) + + #if exists, append _num-[num] + i = 1 + savedir_ori = savedir + while True: + try: + os.makedirs(savedir) + break + except FileExistsError as e: + savedir = savedir_ori+"_num-%d"%i + i+=1 + + print("made the log directory",savedir) + return savedir + +def save_args(savedir,args,name="args.json"): + #save args as "args.json" in the savedir + path = os.path.join(savedir,name) + with open(path, 'w') as f: + json.dump( vars(args), f, sort_keys=True, indent=4) + print("args saved as %s"%path) + +def save_json(dict,path): + with open(path, 'w') as f: + json.dump( dict, f, sort_keys=True, indent=4) + print("log saved at %s"%path) + +def resume_model(model,resume,state_dict_key = "model"): + ''' + model:pytorch model + resume: path to the resume file + state_dict_key: dict key + ''' + print("resuming trained weights from %s"%resume) + + checkpoint = torch.load(resume,map_location='cpu') + if state_dict_key is not None: + pretrained_dict = checkpoint[state_dict_key] + else: + pretrained_dict = checkpoint + + try: + model.load_state_dict(pretrained_dict) + except 
RuntimeError as e: + print(e) + print("can't load the all weights due to error above, trying to load part of them!") + model_dict = model.state_dict() + # 1. filter out unnecessary keys + pretrained_dict_use = {} + pretrained_dict_ignored = {} + for k, v in pretrained_dict.items(): + if k in model_dict: + pretrained_dict_use[k] = v + else: + pretrained_dict_ignored[k] = v + pretrained_dict =pretrained_dict_use + # 2. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + # 3. load the new state dict + model.load_state_dict(model_dict) + print("resumed only",pretrained_dict.keys()) + print("ignored:",pretrained_dict_ignored.keys()) + + return model + +def save_checkpoint(path,model,key="model"): + #save model state dict + checkpoint = {} + checkpoint[key] = model.state_dict() + torch.save(checkpoint, path) + print("checkpoint saved at",path) + + +def make_deterministic(seed,strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print("strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") + + + diff --git a/utils/__pycache__/distributed.cpython-37.pyc b/utils/__pycache__/distributed.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9eac8ce3fb9f5976a24888ecd8ae0d8b0d65fb3 Binary files /dev/null and b/utils/__pycache__/distributed.cpython-37.pyc differ diff --git a/utils/__pycache__/distributed.cpython-38.pyc b/utils/__pycache__/distributed.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08cd33261bf05cbc0bdb2b83c4a55b2dfc6fb3fb Binary files /dev/null and b/utils/__pycache__/distributed.cpython-38.pyc differ diff --git a/utils/__pycache__/model_utils.cpython-38.pyc b/utils/__pycache__/model_utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96695a8c7479efef2ca5dc665590b64c98c15e9d Binary files /dev/null and b/utils/__pycache__/model_utils.cpython-38.pyc differ diff --git a/utils/__pycache__/tools.cpython-37.pyc b/utils/__pycache__/tools.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf5aad58d4435ffde1967a30d5eb0c185284b109 Binary files /dev/null and b/utils/__pycache__/tools.cpython-37.pyc differ diff --git a/utils/__pycache__/utils.cpython-38.pyc b/utils/__pycache__/utils.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59ab27101441a0cf7cbf40d353a9fcfae0a5547e Binary files /dev/null and b/utils/__pycache__/utils.cpython-38.pyc differ diff --git a/utils/distributed.py b/utils/distributed.py new file mode 100755 index 0000000000000000000000000000000000000000..b70e16c4c153ffddb78b70d728f39a8a62786641 --- /dev/null +++ b/utils/distributed.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +"""Distributed helpers.""" + +import functools +import logging +import pickle +import torch +import torch.distributed as dist + +_LOCAL_PROCESS_GROUP = None + + +def all_gather(tensors): + """ + All gathers the provided tensors from all processes across machines. 
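+    Each returned tensor is the concatenation, along dim 0, of that tensor's
+    copies from every process, so its leading dimension grows by a factor of
+    the world size.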
+ Args: + tensors (list): tensors to perform all gather across all processes in + all machines. + """ + + gather_list = [] + output_tensor = [] + world_size = dist.get_world_size() + for tensor in tensors: + tensor_placeholder = [ + torch.ones_like(tensor) for _ in range(world_size) + ] + dist.all_gather(tensor_placeholder, tensor, async_op=False) + gather_list.append(tensor_placeholder) + for gathered_tensor in gather_list: + output_tensor.append(torch.cat(gathered_tensor, dim=0)) + return output_tensor + + +def all_reduce(tensors, average=True): + """ + All reduce the provided tensors from all processes across machines. + Args: + tensors (list): tensors to perform all reduce across all processes in + all machines. + average (bool): scales the reduced tensor by the number of overall + processes across all machines. + """ + + for tensor in tensors: + dist.all_reduce(tensor, async_op=False) + if average: + world_size = dist.get_world_size() + for tensor in tensors: + tensor.mul_(1.0 / world_size) + return tensors + + +def init_process_group( + local_rank, + local_world_size, + shard_id, + num_shards, + init_method, + dist_backend="nccl", +): + """ + Initializes the default process group. + Args: + local_rank (int): the rank on the current local machine. + local_world_size (int): the world size (number of processes running) on + the current local machine. + shard_id (int): the shard index (machine rank) of the current machine. + num_shards (int): number of shards for distributed training. + init_method (string): supporting three different methods for + initializing process groups: + "file": use shared file system to initialize the groups across + different processes. + "tcp": use tcp address to initialize the groups across different + dist_backend (string): backend to use for distributed training. Options + includes gloo, mpi and nccl, the details can be found here: + https://pytorch.org/docs/stable/distributed.html + """ + # Sets the GPU to use. + torch.cuda.set_device(local_rank) + # Initialize the process group. + proc_rank = local_rank + shard_id * local_world_size + world_size = local_world_size * num_shards + dist.init_process_group( + backend=dist_backend, + init_method=init_method, + world_size=world_size, + rank=proc_rank, + ) + + +def is_master_proc(num_gpus=8): + """ + Determines if the current process is the master process. + """ + if torch.distributed.is_initialized(): + return dist.get_rank() % num_gpus == 0 + else: + return True + + +def is_root_proc(): + """ + Determines if the current process is the root process. + """ + if torch.distributed.is_initialized(): + return dist.get_rank() == 0 + else: + return True + + +def get_world_size(): + """ + Get the size of the world. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + """ + Get the rank of the current process. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + Returns: + (group): pytorch dist group. 
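+    Note:
+        A gloo group is used so that all_gather_unaligned() can exchange the
+        pickled byte tensors on the CPU rather than moving them to the GPU
+        for the default NCCL backend.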
+ """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + """ + Seriialize the tensor to ByteTensor. Note that only `gloo` and `nccl` + backend is supported. + Args: + data (data): data to be serialized. + group (group): pytorch dist group. + Returns: + tensor (ByteTensor): tensor that serialized. + """ + + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024**3: + logger = logging.getLogger(__name__) + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}". + format(get_rank(), + len(buffer) / (1024**3), device)) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Padding all the tensors from different GPUs to the largest ones. + Args: + tensor (tensor): tensor to pad. + group (group): pytorch dist group. + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], + dtype=torch.int64, + device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) + for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros((max_size - local_size,), + dtype=torch.uint8, + device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather_unaligned(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) + for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def init_distributed_training(cfg): + """ + Initialize variables needed for distributed training. 
+ """ + if cfg.NUM_GPUS <= 1: + return + num_gpus_per_machine = cfg.NUM_GPUS + num_machines = dist.get_world_size() // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list( + range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) + pg = dist.new_group(ranks_on_i) + if i == cfg.SHARD_ID: + global _LOCAL_PROCESS_GROUP + _LOCAL_PROCESS_GROUP = pg + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + assert _LOCAL_PROCESS_GROUP is not None + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) diff --git a/utils/get_ava_active_speaker_performance.py b/utils/get_ava_active_speaker_performance.py new file mode 100755 index 0000000000000000000000000000000000000000..2e66d1da9b2a06234b2f7afc6f1cecc81b0cf931 --- /dev/null +++ b/utils/get_ava_active_speaker_performance.py @@ -0,0 +1,236 @@ +r"""Compute active speaker detection performance for the AVA dataset. +Please send any questions about this code to the Google Group ava-dataset-users: +https://groups.google.com/forum/#!forum/ava-dataset-users +Example usage: +python -O get_ava_active_speaker_performance.py \ +-g testdata/eval.csv \ +-p testdata/predictions.csv \ +-v +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import time, warnings +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +warnings.filterwarnings("ignore") + +def compute_average_precision(precision, recall): + """Compute Average Precision according to the definition in VOCdevkit. + Precision is modified to ensure that it does not decrease as recall + decrease. + Args: + precision: A float [N, 1] numpy array of precisions + recall: A float [N, 1] numpy array of recalls + Raises: + ValueError: if the input is not of the correct format + Returns: + average_precison: The area under the precision recall curve. NaN if + precision and recall are None. + """ + if precision is None: + if recall is not None: + raise ValueError("If precision is None, recall must also be None") + return np.NAN + + if not isinstance(precision, np.ndarray) or not isinstance( + recall, np.ndarray): + raise ValueError("precision and recall must be numpy array") + if precision.dtype != np.float or recall.dtype != np.float: + raise ValueError("input must be float numpy array.") + if len(precision) != len(recall): + raise ValueError("precision and recall must be of the same size.") + if not precision.size: + return 0.0 + if np.amin(precision) < 0 or np.amax(precision) > 1: + raise ValueError("Precision must be in the range of [0, 1].") + if np.amin(recall) < 0 or np.amax(recall) > 1: + raise ValueError("recall must be in the range of [0, 1].") + if not all(recall[i] <= recall[i + 1] for i in range(len(recall) - 1)): + raise ValueError("recall must be a non-decreasing array") + + recall = np.concatenate([[0], recall, [1]]) + precision = np.concatenate([[0], precision, [0]]) + + # Smooth precision to be monotonically decreasing. 
+ for i in range(len(precision) - 2, -1, -1): + precision[i] = np.maximum(precision[i], precision[i + 1]) + + indices = np.where(recall[1:] != recall[:-1])[0] + 1 + average_precision = np.sum( + (recall[indices] - recall[indices - 1]) * precision[indices]) + return average_precision + + +def load_csv(filename, column_names): + """Loads CSV from the filename using given column names. + Adds uid column. + Args: + filename: Path to the CSV file to load. + column_names: A list of column names for the data. + Returns: + df: A Pandas DataFrame containing the data. + """ + # Here and elsewhere, df indicates a DataFrame variable. + + df = pd.read_csv(filename, usecols=column_names) + #df = pd.read_csv(filename, header=None, names=column_names) + + # Creates a unique id from frame timestamp and entity id. + df["uid"] = (df["frame_timestamp"].map(str) + ":" + df["entity_id"]) + return df + + +def eq(a, b, tolerance=1e-09): + """Returns true if values are approximately equal.""" + return abs(a - b) <= tolerance + + +def merge_groundtruth_and_predictions(df_groundtruth, df_predictions): + """Merges groundtruth and prediction DataFrames. + The returned DataFrame is merged on uid field and sorted in descending order + by score field. Bounding boxes are checked to make sure they match between + groundtruth and predictions. + Args: + df_groundtruth: A DataFrame with groundtruth data. + df_predictions: A DataFrame with predictions data. + Returns: + df_merged: A merged DataFrame, with rows matched on uid column. + """ + if df_groundtruth["uid"].count() != df_predictions["uid"].count(): + raise ValueError( + "Groundtruth and predictions CSV must have the same number of " + "unique rows.") + # print(df_predictions["label"].unique()) + if df_predictions["label"].unique() != ["SPEAKING_AUDIBLE"]: + raise ValueError( + "Predictions CSV must contain only SPEAKING_AUDIBLE label.") + + if df_predictions["score"].count() < df_predictions["uid"].count(): + raise ValueError("Predictions CSV must contain score value for every row.") + + # Merges groundtruth and predictions on uid, validates that uid is unique + # in both frames, and sorts the resulting frame by the predictions score. + df_merged = df_groundtruth.merge( + df_predictions, + on="uid", + suffixes=("_groundtruth", "_prediction"), + validate="1:1").sort_values( + by=["score"], ascending=False).reset_index() + # Validates that bounding boxes in ground truth and predictions match for the + # same uids. 
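+  # eq() compares each of the four box coordinates element-wise with a 1e-9
+  # tolerance; any row whose groundtruth and predicted boxes disagree beyond
+  # that makes the check below raise a ValueError listing the offending uids.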
+ df_merged["bounding_box_correct"] = np.where( + eq(df_merged["entity_box_x1_groundtruth"], + df_merged["entity_box_x1_prediction"]) + & eq(df_merged["entity_box_x2_groundtruth"], + df_merged["entity_box_x2_prediction"]) + & eq(df_merged["entity_box_y1_groundtruth"], + df_merged["entity_box_y1_prediction"]) + & eq(df_merged["entity_box_y2_groundtruth"], + df_merged["entity_box_y2_prediction"]), True, False) + + if (~df_merged["bounding_box_correct"]).sum() > 0: + raise ValueError( + "Mismatch between groundtruth and predictions bounding boxes found at " + + str(list(df_merged[~df_merged["bounding_box_correct"]]["uid"]))) + + return df_merged + + +def get_all_positives(df_merged): + """Counts all positive examples in the groundtruth dataset.""" + return df_merged[df_merged["label_groundtruth"] == + "SPEAKING_AUDIBLE"]["uid"].count() + + +def calculate_precision_recall(df_merged): + """Calculates precision and recall arrays going through df_merged row-wise.""" + all_positives = get_all_positives(df_merged) + # Populates each row with 1 if this row is a true positive + # (at its score level). + df_merged["is_tp"] = np.where( + (df_merged["label_groundtruth"] == "SPEAKING_AUDIBLE") & + (df_merged["label_prediction"] == "SPEAKING_AUDIBLE"), 1, 0) + + # Counts true positives up to and including that row. + df_merged["tp"] = df_merged["is_tp"].cumsum() + + # Calculates precision for every row counting true positives up to + # and including that row over the index (1-based) of that row. + df_merged["precision"] = df_merged["tp"] / (df_merged.index + 1) + # Calculates recall for every row counting true positives up to + # and including that row over all positives in the groundtruth dataset. + + df_merged["recall"] = df_merged["tp"] / all_positives + logging.info( + "\n%s\n", + df_merged.head(10)[[ + "uid", "score", "label_groundtruth", "is_tp", "tp", "precision", + "recall" + ]]) + + return np.array(df_merged["precision"]), np.array(df_merged["recall"]) + + +def run_evaluation(groundtruth, predictions): + """Runs AVA Active Speaker evaluation, printing average precision result.""" + df_groundtruth = load_csv( + groundtruth, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id" + ]) + df_predictions = load_csv( + predictions, + column_names=[ + "video_id", "frame_timestamp", "entity_box_x1", "entity_box_y1", + "entity_box_x2", "entity_box_y2", "label", "entity_id", "score" + ]) + df_merged = merge_groundtruth_and_predictions(df_groundtruth, df_predictions) + precision, recall = calculate_precision_recall(df_merged) + mAP = 100 * compute_average_precision(precision, recall) + print("average precision: %2.2f%%"%(mAP)) + return mAP + + +def parse_arguments(): + """Parses command-line flags. + Returns: + args: a named tuple containing three file objects args.labelmap, + args.groundtruth, and args.detections. 
+ """ + parser = argparse.ArgumentParser() + parser.add_argument( + "-g", + "--groundtruth", + help="CSV file containing ground truth.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-p", + "--predictions", + help="CSV file containing active speaker predictions.", + type=argparse.FileType("r"), + required=True) + parser.add_argument( + "-v", "--verbose", help="Increase output verbosity.", action="store_true") + return parser.parse_args() + + +def main(): + start = time.time() + args = parse_arguments() + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + del args.verbose + mAP = run_evaluation(**vars(args)) + logging.info("Computed in %s seconds", time.time() - start) + return mAP + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/utils/get_multiperson_csv.py b/utils/get_multiperson_csv.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6140b0b47576b47ed1a9273e6366275e9bfb30 --- /dev/null +++ b/utils/get_multiperson_csv.py @@ -0,0 +1,49 @@ +import os, pandas +import json + +phase = "test" +path = "/nfs/jolteon/data/ssd/xiziwang/AVA_dataset/csv" + +if phase == "train": + csv_f = "train_loader.csv" + csv_orig = "train_orig.csv" +elif phase == "val": + csv_f = "val_loader.csv" + csv_orig = "val_orig.csv" +else: + csv_f = "test_loader.csv" + csv_orig = "test_orig.csv" + +orig_df = pandas.read_csv(os.path.join(path, csv_orig)) +entity_data = {} +ts_to_entity = {} + +for index, row in orig_df.iterrows(): + + entity_id = row['entity_id'] + video_id = row['video_id'] + if row['label'] == "SPEAKING_AUDIBLE": + label = 1 + else: + label = 0 + ts = float(row['frame_timestamp']) + if video_id not in entity_data.keys(): + entity_data[video_id] = {} + if entity_id not in entity_data[video_id].keys(): + entity_data[video_id][entity_id] = {} + if ts not in entity_data[video_id][entity_id].keys(): + entity_data[video_id][entity_id][ts] = [] + + entity_data[video_id][entity_id][ts] = label + + if video_id not in ts_to_entity.keys(): + ts_to_entity[video_id] = {} + if ts not in ts_to_entity[video_id].keys(): + ts_to_entity[video_id][ts] = [] + ts_to_entity[video_id][ts].append(entity_id) + +with open(os.path.join(path, phase + "_entity.json"), 'w') as f: + json.dump(entity_data, f) + +with open(os.path.join(path, phase + "_ts.json"), 'w') as f: + json.dump(ts_to_entity, f) diff --git a/utils/model_utils.py b/utils/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1df761634de87bbe5f8d929d8a9768d0331a00d1 --- /dev/null +++ b/utils/model_utils.py @@ -0,0 +1,31 @@ +def set_bn_eval(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.eval() + + +def set_bn_non_trainable(m): + classname = m.__class__.__name__ + if classname.find('BatchNorm') != -1: + m.weight.requires_grad = False + m.bias.requires_grad = False + + +def freeze_bn_statistics(model): + """freeze the statistic mean and variance in BN + Args: + model (nn.Module): The model to be freezed statistics. + """ + model.apply(set_bn_eval) + + +def freeze_bn_parameters(model): + """ + + Args: + model (nn.Module): The model to be freezed statistics. 
+ + Returns: TODO + + """ + model.apply(set_bn_non_trainable) diff --git a/utils/overall.png b/utils/overall.png new file mode 100755 index 0000000000000000000000000000000000000000..9a1dd5cca29bc80835ace3114fff7b5814a4b0f5 Binary files /dev/null and b/utils/overall.png differ diff --git a/utils/tools.py b/utils/tools.py new file mode 100755 index 0000000000000000000000000000000000000000..5fdfd51cfdc81c961bcfe873051069cb252570f1 --- /dev/null +++ b/utils/tools.py @@ -0,0 +1,217 @@ +import os, subprocess, glob, pandas, tqdm, cv2, numpy +from scipy.io import wavfile +import random +import torch +import numpy as np + + +def init_args(args): + # The details for the following folders/files can be found in the annotation of the function 'preprocess_AVA' below + args.modelSavePath = os.path.join(args.WORKSPACE, 'model') + args.scoreSavePath = os.path.join(args.WORKSPACE, 'score.txt') + args.trialPathAVA = os.path.join(args.DATA.dataPathAVA, 'csv') + args.audioOrigPathAVA = os.path.join(args.DATA.dataPathAVA, 'orig_audios') + args.visualOrigPathAVA = os.path.join(args.DATA.dataPathAVA, 'orig_videos') + args.audioPathAVA = os.path.join(args.DATA.dataPathAVA, 'clips_audios') + args.visualPathAVA = os.path.join(args.DATA.dataPathAVA, 'clips_videos') + args.trainTrialAVA = os.path.join(args.trialPathAVA, 'train_loader.csv') + + if args.evalDataType == 'val': + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'val_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'val_orig.csv') + args.evalCsvSave = os.path.join(args.WORKSPACE, 'val_res.csv') + else: + args.evalTrialAVA = os.path.join(args.trialPathAVA, 'test_loader.csv') + args.evalOrig = os.path.join(args.trialPathAVA, 'test_orig.csv') + args.evalCsvSave = os.path.join(args.WORKSPACE, 'test_res.csv') + + os.makedirs(args.modelSavePath, exist_ok=True) + os.makedirs(args.DATA.dataPathAVA, exist_ok=True) + return args + + +def make_deterministic(seed, strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. + # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + # torch.set_deterministic(True) + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print( + "strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") + + +def download_pretrain_model_AVA(): + if os.path.isfile('pretrain_AVA.model') == False: + Link = "1NVIkksrD3zbxbDuDbPc_846bLfPSZcZm" + cmd = "gdown --id %s -O %s" % (Link, 'pretrain_AVA.model') + subprocess.call(cmd, shell=True, stdout=None) + + +def preprocess_AVA(args): + # This preprocesstion is modified based on this [repository](https://github.com/fuankarion/active-speakers-context). + # The required space is 302 G. + # If you do not have enough space, you can delate `orig_videos`(167G) when you get `clips_videos(85G)`. + # also you can delate `orig_audios`(44G) when you get `clips_audios`(6.4G). + # So the final space is less than 100G. 
+ # The AVA dataset will be saved in 'AVApath' folder like the following format: + # ``` + # ├── clips_audios (The audio clips cut from the original movies) + # │   ├── test + # │   ├── train + # │   └── val + # ├── clips_videos (The face clips cut from the original movies, be save in the image format, frame-by-frame) + # │   ├── test + # │   ├── train + # │   └── val + # ├── csv + # │   ├── test_file_list.txt (name of the test videos) + # │   ├── test_loader.csv (The csv file we generated to load data for testing) + # │   ├── test_orig.csv (The combination of the given test csv files) + # │   ├── train_loader.csv (The csv file we generated to load data for training) + # │   ├── train_orig.csv (The combination of the given training csv files) + # │   ├── trainval_file_list.txt (name of the train/val videos) + # │   ├── val_loader.csv (The csv file we generated to load data for validation) + # │   └── val_orig.csv (The combination of the given validation csv files) + # ├── orig_audios (The original audios from the movies) + # │   ├── test + # │   └── trainval + # └── orig_videos (The original movies) + # ├── test + # └── trainval + # ``` + + download_csv(args) # Take 1 minute + download_videos(args) # Take 6 hours + extract_audio(args) # Take 1 hour + extract_audio_clips(args) # Take 3 minutes + extract_video_clips(args) # Take about 2 days + + +def download_csv(args): + # Take 1 minute to download the required csv files + Link = "1C1cGxPHaJAl1NQ2i7IhRgWmdvsPhBCUy" + cmd = "gdown --id %s -O %s" % (Link, args.dataPathAVA + '/csv.tar.gz') + subprocess.call(cmd, shell=True, stdout=None) + cmd = "tar -xzvf %s -C %s" % (args.dataPathAVA + '/csv.tar.gz', args.dataPathAVA) + subprocess.call(cmd, shell=True, stdout=None) + os.remove(args.dataPathAVA + '/csv.tar.gz') + + +def download_videos(args): + # Take 6 hours to download the original movies, follow this repository: https://github.com/cvdfoundation/ava-dataset + for dataType in ['trainval', 'test']: + fileList = open('%s/%s_file_list.txt' % (args.trialPathAVA, dataType)).read().splitlines() + outFolder = '%s/%s' % (args.visualOrigPathAVA, dataType) + for fileName in fileList: + cmd = "wget -P %s https://s3.amazonaws.com/ava-dataset/%s/%s" % (outFolder, dataType, + fileName) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio(args): + # Take 1 hour to extract the audio from movies + for dataType in ['trainval', 'test']: + inpFolder = '%s/%s' % (args.visualOrigPathAVA, dataType) + outFolder = '%s/%s' % (args.audioOrigPathAVA, dataType) + os.makedirs(outFolder, exist_ok=True) + videos = glob.glob("%s/*" % (inpFolder)) + for videoPath in tqdm.tqdm(videos): + audioPath = '%s/%s' % (outFolder, videoPath.split('/')[-1].split('.')[0] + '.wav') + cmd = ( + "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads 8 %s -loglevel panic" + % (videoPath, audioPath)) + subprocess.call(cmd, shell=True, stdout=None) + + +def extract_audio_clips(args): + # Take 3 minutes to extract the audio clips + dic = {'train': 'trainval', 'val': 'trainval', 'test': 'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv' % (dataType)), + engine='python') + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 
'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + audioFeatures = {} + outDir = os.path.join(args.audioPathAVA, dataType) + audioDir = os.path.join(args.audioOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total=len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + start = insData.iloc[0]['frame_timestamp'] + end = insData.iloc[-1]['frame_timestamp'] + entityID = insData.iloc[0]['entity_id'] + insPath = os.path.join(outDir, videoKey, entityID + '.wav') + if videoKey not in audioFeatures.keys(): + audioFile = os.path.join(audioDir, videoKey + '.wav') + sr, audio = wavfile.read(audioFile) + audioFeatures[videoKey] = audio + audioStart = int(float(start) * sr) + audioEnd = int(float(end) * sr) + audioData = audioFeatures[videoKey][audioStart:audioEnd] + wavfile.write(insPath, sr, audioData) + + +def extract_video_clips(args): + # Take about 2 days to crop the face clips. + # You can optimize this code to save time, while this process is one-time. + # If you do not need the data for the test set, you can only deal with the train and val part. That will take 1 day. + # This procession may have many warning info, you can just ignore it. + dic = {'train': 'trainval', 'val': 'trainval', 'test': 'test'} + for dataType in ['train', 'val', 'test']: + df = pandas.read_csv(os.path.join(args.trialPathAVA, '%s_orig.csv' % (dataType))) + dfNeg = pandas.concat([df[df['label_id'] == 0], df[df['label_id'] == 2]]) + dfPos = df[df['label_id'] == 1] + insNeg = dfNeg['instance_id'].unique().tolist() + insPos = dfPos['instance_id'].unique().tolist() + df = pandas.concat([dfPos, dfNeg]).reset_index(drop=True) + df = df.sort_values(['entity_id', 'frame_timestamp']).reset_index(drop=True) + entityList = df['entity_id'].unique().tolist() + df = df.groupby('entity_id') + outDir = os.path.join(args.visualPathAVA, dataType) + audioDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + for l in df['video_id'].unique().tolist(): + d = os.path.join(outDir, l[0]) + if not os.path.isdir(d): + os.makedirs(d) + for entity in tqdm.tqdm(entityList, total=len(entityList)): + insData = df.get_group(entity) + videoKey = insData.iloc[0]['video_id'] + entityID = insData.iloc[0]['entity_id'] + videoDir = os.path.join(args.visualOrigPathAVA, dic[dataType]) + videoFile = glob.glob(os.path.join(videoDir, '{}.*'.format(videoKey)))[0] + V = cv2.VideoCapture(videoFile) + insDir = os.path.join(os.path.join(outDir, videoKey, entityID)) + if not os.path.isdir(insDir): + os.makedirs(insDir) + j = 0 + for _, row in insData.iterrows(): + imageFilename = os.path.join(insDir, str("%.2f" % row['frame_timestamp']) + '.jpg') + V.set(cv2.CAP_PROP_POS_MSEC, row['frame_timestamp'] * 1e3) + _, frame = V.read() + h = numpy.size(frame, 0) + w = numpy.size(frame, 1) + x1 = int(row['entity_box_x1'] * w) + y1 = int(row['entity_box_y1'] * h) + x2 = int(row['entity_box_x2'] * w) + y2 = int(row['entity_box_y2'] * h) + face = frame[y1:y2, x1:x2, :] + j = j + 1 + cv2.imwrite(imageFilename, face) diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b5241777f01a9a4a716c84970b0044dd6fb94922 --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,141 @@ +import time +import os +import sys +import json +import random +import numpy as np +import torch + + +def 
setup_device(gpu_id): + #set up GPUS + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" + if int(gpu_id) == -2 and os.getenv('CUDA_VISIBLE_DEVICES') is not None: + gpu_id = os.getenv('CUDA_VISIBLE_DEVICES') + elif int(gpu_id) >= 0: + os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + print("set CUDA_VISIBLE_DEVICES=", gpu_id) + else: + os.environ['CUDA_VISIBLE_DEVICES'] = "" + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + print("using device %s" % device) + return device + + +def setup_seed(seed): + if seed < 0: + if os.getenv('SATOSHI_SEED') is not None and seed == -2: + seed = int(os.getenv('SATOSHI_SEED')) + print("env seed used") + else: + import math + seed = int(10**4 * math.modf(time.time())[0]) + seed = seed + print("random seed", seed) + return seed + + +def setup_savedir(prefix="", basedir="./experiments", args=None, append_args=[]): + savedir = prefix + if len(append_args) > 0 and args is not None: + for arg_opt in append_args: + arg_value = getattr(args, arg_opt) + savedir += "_" + arg_opt + "-" + str(arg_value) + else: + savedir += "exp" + + savedir = savedir.replace(" ", "").replace("'", "").replace('"', '') + savedir = os.path.join(basedir, savedir) + + #if exists, append _num-[num] + i = 1 + savedir_ori = savedir + while True: + try: + os.makedirs(savedir) + break + except FileExistsError as e: + savedir = savedir_ori + "_num-%d" % i + i += 1 + + print("made the log directory", savedir) + return savedir + + +def save_args(savedir, args, name="args.json"): + #save args as "args.json" in the savedir + path = os.path.join(savedir, name) + with open(path, 'w') as f: + json.dump(vars(args), f, sort_keys=True, indent=4) + print("args saved as %s" % path) + + +def save_json(dict, path): + with open(path, 'w') as f: + json.dump(dict, f, sort_keys=True, indent=4) + print("log saved at %s" % path) + + +def resume_model(model, resume, state_dict_key="model"): + ''' + model:pytorch model + resume: path to the resume file + state_dict_key: dict key + ''' + print("resuming trained weights from %s" % resume) + + checkpoint = torch.load(resume, map_location='cpu') + if state_dict_key is not None: + pretrained_dict = checkpoint[state_dict_key] + else: + pretrained_dict = checkpoint + + try: + model.load_state_dict(pretrained_dict) + except RuntimeError as e: + print(e) + print("can't load the all weights due to error above, trying to load part of them!") + model_dict = model.state_dict() + # 1. filter out unnecessary keys + pretrained_dict_use = {} + pretrained_dict_ignored = {} + for k, v in pretrained_dict.items(): + if k in model_dict: + pretrained_dict_use[k] = v + else: + pretrained_dict_ignored[k] = v + pretrained_dict = pretrained_dict_use + # 2. overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + # 3. load the new state dict + model.load_state_dict(model_dict) + print("resumed only", pretrained_dict.keys()) + print("ignored:", pretrained_dict_ignored.keys()) + + return model + + +def save_checkpoint(path, model, key="model"): + #save model state dict + checkpoint = {} + checkpoint[key] = model.state_dict() + torch.save(checkpoint, path) + print("checkpoint saved at", path) + + +def make_deterministic(seed, strict=False): + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-487907668 + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) # if you are using multi-GPU. 
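+    # cudnn.deterministic (set just below) restricts cuDNN to deterministic
+    # kernels. cudnn.benchmark is not forced off here (note the commented-out
+    # line), so if autotuning is enabled elsewhere, bit-exact reproducibility
+    # is no longer guaranteed.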
+ # torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + # torch.set_deterministic(True) + if strict: + #https://github.com/pytorch/pytorch/issues/7068#issuecomment-515728600 + torch.backends.cudnn.enabled = False + print( + "strict reproducability required! cudnn disabled. make sure to set num_workers=0 too!") diff --git a/videoloaders/.DS_Store b/videoloaders/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..f06db239bb6f916cf7f04d01a5bc394ab22bcea7 Binary files /dev/null and b/videoloaders/.DS_Store differ diff --git a/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py b/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..861504ec04cf1998403bea8ee067620216c0ca05 --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/functional_video-checkpoint.py @@ -0,0 +1,117 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +import torch + + +def _is_tensor_video_clip(clip): + if not torch.is_tensor(clip): + raise TypeError("clip should be Tesnor. Got %s" % type(clip)) + + if not clip.ndimension() == 4: + raise ValueError("clip should be 4D. Got %dD" % clip.dim()) + + return True + + +def crop(clip, i, j, h, w): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + """ + assert len(clip.size()) == 4, "clip should be a 4D tensor" + return clip[..., i:i + h, j:j + w] + + +def resize(clip, target_size, interpolation_mode): + assert len(target_size) == 2, "target size should be tuple (height, width)" + return torch.nn.functional.interpolate( + clip, size=target_size, mode=interpolation_mode + ) + + +def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): + """ + Do spatial cropping and resizing to the video clip + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + i (int): i in (i,j) i.e coordinates of the upper left corner. + j (int): j in (i,j) i.e coordinates of the upper left corner. + h (int): Height of the cropped region. + w (int): Width of the cropped region. + size (tuple(int, int)): height and width of resized clip + Returns: + clip (torch.tensor): Resized and cropped clip. 
Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + clip = crop(clip, i, j, h, w) + clip = resize(clip, size, interpolation_mode) + return clip + + +def center_crop(clip, crop_size): + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h >= th and w >= tw, "height and width must be no smaller than crop_size" + + i = int(round((h - th) / 2.0)) + j = int(round((w - tw) / 2.0)) + return crop(clip, i, j, th, tw) + +def corner_crop(clip, crop_size, i, j): + assert _is_tensor_video_clip(clip),"clip should be a 4d torch tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h>=th and w>=tw, "height and width must be no smaller than crop_size" + return crop(clip, i, j, th, tw) + + +def to_tensor(clip): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + """ + _is_tensor_video_clip(clip) + if not clip.dtype == torch.uint8: + raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) + return clip.float().permute(3, 0, 1, 2) / 255.0 + + +def normalize(clip, mean, std, inplace=False): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + mean (tuple): pixel RGB mean. Size is (3) + std (tuple): pixel standard deviation. Size is (3) + Returns: + normalized clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + if not inplace: + clip = clip.clone() + mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device) + std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device) + if clip.size(0) == 3: + clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) + elif clip.size(0) == 1: + #make it compatibale with depth image + mean = mean.mean() + std = std.mean() + clip.sub_(mean).div_(std) + else: + raise NotImplementedError() + return clip + + +def hflip(clip): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + Returns: + flipped clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + return clip.flip((-1)) diff --git a/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py b/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..14948b753f81ab5e6fd31b6de3d65b9c59e82348 --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/transform_temporal-checkpoint.py @@ -0,0 +1,162 @@ +import os +import random +import math + + +def temporal_batching_index(fr,length=16): + ''' + Do padding or half-overlapping clips for video. + + Input: + fr: number of frames + Output: + batch_indices: array for batch where each element is frame index + ''' + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + return [[0]*left + list(range(fr)) + [fr-1]*right] + + batch_indices = [] + last_idx = fr - 1 + assert length%2 == 0 + half = int(length/2) + for i in range(0,fr-half,half): + frame_indices = [0,]*length + for j in range(length): + current_idx = i + j + if current_idx < last_idx: + frame_indices[j] = current_idx + else: + frame_indices[j] = last_idx + batch_indices.append(frame_indices) + + return batch_indices + +def temporal_sliding_window(clip,window = 16): + ''' + Make a batched tensor with 16 frame sliding window with the overlap of 8. + If a clip is not the multiply of 8, it's padded with the last frames. (1,2...,13,14,14,14) for (1,..,14) + If a clip is less than 16 frames, padding is applied like (1,1,....,1,2,3,4,5,5,...,5,5) for (1,2,3,4,5) + This can be used for sliding window evaluation. + + Input: list of image paths + Output: torch tensor of shape of (batch,ch,16,h,w). + ''' + + batch_indices = temporal_batching_index(len(clip),length = window) + + return [[clip[idx] for idx in indices] for indices in batch_indices] + +def temporal_center_crop(clip,length = 16): + ''' + Input: list of image paths + Output: torch tensor of shape of (1,ch,16,h,w). + ''' + fr = len(clip) + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + middle = int(fr/2) + assert length%2 == 0 + half = int(length/2) + start = middle - half + output = clip[start : start+length] + + return output[::2] + + + +def random_temporal_crop(clip,length = 16): + ''' + Just randomly sample 16 consecutive frames + if less than 16 frames, just add padding. + ''' + fr = len(clip) + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + start=random.randint(0,fr-length) + output = clip[start : start+length] + return output[::2] + + +def use_all_frames(clip): + ''' + Just use it as it is :) + ''' + return clip + +def looppadding(clip, length=16): + + + out = clip + + for index in out: + if len(out) >= length: + break + out.append(index) + + return out[::2] + +def temporal_even_crop(clip, length=16, n_samples=1): + + clip = list(clip) + n_frames = len(clip) + indices = list(range(len(clip))) + stride = max( + 1, math.ceil((n_frames - 1 - length) / (n_samples - 1))) + + out = [] + for begin_index in indices[::stride]: + if len(out) >= n_samples: + break + end_index = min(indices[-1] + 1, begin_index + length) + sample = list(range(begin_index, end_index)) + + if len(sample) < length: + out.append([clip[i] for i in looppadding(sample, length=length)]) + # out.append(clip[looppadding(sample, length=length)]) + break + else: + out.append([clip[i] for i in sample[::2]]) + # out.append(clip[sample[::2]]) + + return out + + +class TemporalTransform(object): + def __init__(self,length,mode="center"): + self.mode = mode + self.length = length + #pass dummpy in order to catch incoored mode + self.__call__(range(128)) + + def __call__(self, clip): + if self.mode == "random": + return random_temporal_crop(clip,self.length) + elif self.mode == "center": + return temporal_center_crop(clip,self.length) + elif self.mode == "all" or self.mode == "nocrop": + #note that length cannot be satisfied! + return use_all_frames(clip) + elif self.mode == "slide": + #note that output has one more dimention + return temporal_sliding_window(clip,self.length) + elif self.mode == "even": + return temporal_even_crop(clip, self.length, n_samples=5) + else: + raise NotImplementedError("this option is not defined:",self.mode) \ No newline at end of file diff --git a/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py b/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py new file mode 100755 index 0000000000000000000000000000000000000000..156cf463abf94932421cb2dd7d500fdcdb998bbe --- /dev/null +++ b/videoloaders/.ipynb_checkpoints/transforms_video-checkpoint.py @@ -0,0 +1,312 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +#!/usr/bin/env python3 + +import numbers +import random +import torch + +try: + import accimage +except: + pass + +from torchvision.transforms import ( + RandomResizedCrop, +) + +from . import functional_video as F + +def _get_image_size(img): + if isinstance(img, torch.Tensor) and img.dim() > 2: + return img.shape[-2:][::-1] + else: + raise TypeError("Unexpected type {}".format(type(img))) + +class RandomCrop(object): + """Crop the given PIL Image at a random location. + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None, i.e no padding. 
If a sequence of length + 4 is provided, it is used to pad left, top, right, bottom borders + respectively. If a sequence of length 2 is provided, it is used to + pad left/right, top/bottom borders, respectively. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill: Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. + - constant: pads with a constant value, this value is specified with fill + - edge: pads with the last value on the edge of the image + - reflect: pads with reflection of image (without repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + - symmetric: pads with reflection of image (repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img (PIL Image): Image to be cropped. + output_size (tuple): Expected output size of the crop. + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + w, h = _get_image_size(img) + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. + """ + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + # pad the width if needed + if self.pad_if_needed and img.size[0] < self.size[1]: + img = F.pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode) + # pad the height if needed + if self.pad_if_needed and img.size[1] < self.size[0]: + img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) + + i, j, h, w = self.get_params(img, self.size) + + return F.crop(img, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding) + + + + + +class RandomCropVideo(RandomCrop): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. 
+ size is (C, T, OH, OW) + """ + i, j, h, w = self.get_params(clip, self.size) + return F.crop(clip, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0})'.format(self.size) + + +class RandomResizedCropVideo(RandomResizedCrop): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): + if isinstance(size, tuple): + assert len(size) == 2, "size should be tuple (height, width)" + self.size = size + else: + self.size = (size, size) + + self.interpolation_mode = interpolation_mode + self.scale = scale + self.ratio = ratio + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, H, W) + """ + i, j, h, w = self.get_params(clip, self.scale, self.ratio) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) + + def __repr__(self): + return self.__class__.__name__ + \ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio + ) + + +class CenterCropVideo(object): + def __init__(self, crop_size): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + + return F.center_crop(clip, self.crop_size) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + +class CornerCropVideo(object): + def __init__(self, crop_size, loc="tr"): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + def __call__(self, clip, loc="tr"): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + if loc == "tr": + i = 0 + j = 0 + elif loc == "center": + return F.corner_crop(clip, self.crop_size) + else: + i = clip.size(-2) - self.crop_size + j = clip.size(-1) - self.crop_size + return F.corner_crop(clip, self.crop_size, i, j) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + + +class NormalizeVideo(object): + """ + Normalize the video clip by mean subtraction and division by standard deviation + Args: + mean (3-tuple): pixel RGB mean + std (3-tuple): pixel RGB standard deviation + inplace (boolean): whether do in-place normalization + """ + + def __init__(self, mean, std, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): video clip to be normalized. 
Size is (C, T, H, W) + """ + return F.normalize(clip, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1}, inplace={2})'.format( + self.mean, self.std, self.inplace) + + +class ToTensorVideo(object): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + """ + + def __init__(self): + pass + + def __call__(self, clip): + """ + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + return F.to_tensor(clip) + + def __repr__(self): + return self.__class__.__name__ + + +class RandomHorizontalFlipVideo(object): + """ + Flip the video clip along the horizonal direction with a given probability + Args: + p (float): probability of the clip being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + self.p = p + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, H, W) + """ + if random.random() < self.p: + clip = F.hflip(clip) + return clip + + def __repr__(self): + return self.__class__.__name__ + "(p={0})".format(self.p) + + + +class ResizeVideo(object): + """ + Resize the video clip + """ + def __init__(self, w,h): + self.w = w + self.h = h + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, h, w) + """ + #interpolare needs (T,C, H, W) order while clip is (C, T, H, W) + return torch.nn.functional.interpolate( + clip.permute(1,0,2,3),(self.h,self.w),mode="bilinear",align_corners=False).permute(1,0,2,3) + + def __repr__(self): + return self.__class__.__name__ + "(w=%d,h=%d)"%(self.w,self.h) \ No newline at end of file diff --git a/videoloaders/README.md b/videoloaders/README.md new file mode 100755 index 0000000000000000000000000000000000000000..d54e8f32beb7313158a7e79f45dc3b2c76799d63 --- /dev/null +++ b/videoloaders/README.md @@ -0,0 +1,15 @@ +# How to process video as data loader + +We assume that video is preprocessed in to image files in advance. Usually, we do not use all frames in a clip but sample a certain duration (e.g. 16 frames). The pipline we assume for each chunk is the following. + +- Get a list of images paths of clips e.g. ["./video/clip1/frame0.jpg",...,"./video/clip1/frame101.jpg"] +- Sample a certain duration we want to use e.g. ["./video/clip1/frame11.jpg",...,"./video/clip1/frame26.jpg"] +- Load each frames into a tensor shaped as (T, H, W, C). HW can be changed later. +- Use torchvision builtin utilities to crop, flip, etc. For example, + - ToTensorVideo() from (T, H, W, C) to (C, T, H, W)), from 0-255 to 0-1 (devide by 225), and from uint8 to float. + - CenterCropVideo + - RandomHorizontalFlipVideo + - NormalizeVideo with kinetics mean and std + -See more https://github.com/pytorch/vision/blob/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py + +Note that the first part is different from what official pytorch repository ( https://github.com/pytorch/vision/tree/master/references/video_classification ) does. We don't use VideoClip class. 
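
As a concrete illustration of the pipeline described above, here is a minimal sketch that samples 16 frames with `TemporalTransform` and then applies the spatial transforms from `transforms_video.py`. It assumes the repository root is on `PYTHONPATH` (so `videoloaders` imports as a package), replaces real frame decoding with a random uint8 tensor, and uses torchvision's published Kinetics mean/std; it is illustrative and not part of the original code.

```
import torch
from torchvision.transforms import Compose

from videoloaders import transforms_video as T
from videoloaders.transform_temporal import TemporalTransform

# Frame paths for one clip (dummy names; real code would decode these images).
paths = [f"./video/clip1/frame{i}.jpg" for i in range(101)]
paths = TemporalTransform(16, mode="center")(paths)   # 16-frame centre window, strided by 2 -> 8 frames

# Stand-in for the decoded frames: (T, H, W, C) uint8, as assumed by ToTensorVideo.
clip = torch.randint(0, 256, (len(paths), 128, 171, 3), dtype=torch.uint8)

spatial = Compose([
    T.ToTensorVideo(),                        # (T, H, W, C) uint8 -> (C, T, H, W) float in [0, 1]
    T.RandomHorizontalFlipVideo(p=0.5),
    T.CenterCropVideo(112),
    T.NormalizeVideo(mean=(0.43216, 0.394666, 0.37645),
                     std=(0.22803, 0.22145, 0.216989)),   # Kinetics statistics
])
print(spatial(clip).shape)                    # torch.Size([3, 8, 112, 112])
```

The "slide" mode of `TemporalTransform` returns several half-overlapping windows per clip instead of one, which is why the collate function further down concatenates along the batch dimension rather than stacking.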
\ No newline at end of file diff --git a/videoloaders/__pycache__/functional_video.cpython-36.pyc b/videoloaders/__pycache__/functional_video.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..f256a41ce9f8584dd65e0f86e7c5154e56bce323 Binary files /dev/null and b/videoloaders/__pycache__/functional_video.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/functional_video.cpython-38.pyc b/videoloaders/__pycache__/functional_video.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5f3499fe8f1cd7e25cf41147062fd973f1d59dfe Binary files /dev/null and b/videoloaders/__pycache__/functional_video.cpython-38.pyc differ diff --git a/videoloaders/__pycache__/transform_temporal.cpython-36.pyc b/videoloaders/__pycache__/transform_temporal.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..8518f65c3f3446994991e5bb1d3b43d41df266d3 Binary files /dev/null and b/videoloaders/__pycache__/transform_temporal.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/transform_temporal.cpython-38.pyc b/videoloaders/__pycache__/transform_temporal.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..93747c49628499d828f57f0572d1e6260e5c9e08 Binary files /dev/null and b/videoloaders/__pycache__/transform_temporal.cpython-38.pyc differ diff --git a/videoloaders/__pycache__/transforms_video.cpython-36.pyc b/videoloaders/__pycache__/transforms_video.cpython-36.pyc new file mode 100755 index 0000000000000000000000000000000000000000..bab5b115399399841ee2d17579ef61feb7e8773b Binary files /dev/null and b/videoloaders/__pycache__/transforms_video.cpython-36.pyc differ diff --git a/videoloaders/__pycache__/transforms_video.cpython-38.pyc b/videoloaders/__pycache__/transforms_video.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31d0eaae05905c76c44e5d98198f6d79f501abef Binary files /dev/null and b/videoloaders/__pycache__/transforms_video.cpython-38.pyc differ diff --git a/videoloaders/collate_functions.py b/videoloaders/collate_functions.py new file mode 100755 index 0000000000000000000000000000000000000000..837239cf6ad0208c77ba518a1632966f3233b03e --- /dev/null +++ b/videoloaders/collate_functions.py @@ -0,0 +1,15 @@ +import torch +from torch.utils.data.dataloader import default_collate +def collate_video(batch): + ''' + Our video is (temporal_crops, C, T, H, W) where temporal_crops differes from clip to clip + We can't use standard collate function. + Instead of stacking, let's do cat + Keep in mind that this will also need list of frame length in order to restore each videos later. 
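+    Example (illustrative shapes):
+        batch = [{'input': torch.rand(2, 3, 16, 112, 112), 'label': 0},
+                 {'input': torch.rand(3, 3, 16, 112, 112), 'label': 1}]
+        out = collate_video(batch)
+        out['input'].shape  -> torch.Size([5, 3, 16, 112, 112])   (temporal crops concatenated)
+        out['label']        -> tensor([0, 1])                     (default_collate on the other keys)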
+ ''' + elem = batch[0] + assert isinstance(elem,dict) + output = {key: default_collate([d[key] for d in batch]) for key in elem if key!='input'} + output["input"] = torch.cat([d["input"] for d in batch]) + return output + \ No newline at end of file diff --git a/videoloaders/functional_video.py b/videoloaders/functional_video.py new file mode 100755 index 0000000000000000000000000000000000000000..861504ec04cf1998403bea8ee067620216c0ca05 --- /dev/null +++ b/videoloaders/functional_video.py @@ -0,0 +1,117 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +import torch + + +def _is_tensor_video_clip(clip): + if not torch.is_tensor(clip): + raise TypeError("clip should be Tesnor. Got %s" % type(clip)) + + if not clip.ndimension() == 4: + raise ValueError("clip should be 4D. Got %dD" % clip.dim()) + + return True + + +def crop(clip, i, j, h, w): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + """ + assert len(clip.size()) == 4, "clip should be a 4D tensor" + return clip[..., i:i + h, j:j + w] + + +def resize(clip, target_size, interpolation_mode): + assert len(target_size) == 2, "target size should be tuple (height, width)" + return torch.nn.functional.interpolate( + clip, size=target_size, mode=interpolation_mode + ) + + +def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"): + """ + Do spatial cropping and resizing to the video clip + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + i (int): i in (i,j) i.e coordinates of the upper left corner. + j (int): j in (i,j) i.e coordinates of the upper left corner. + h (int): Height of the cropped region. + w (int): Width of the cropped region. + size (tuple(int, int)): height and width of resized clip + Returns: + clip (torch.tensor): Resized and cropped clip. Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + clip = crop(clip, i, j, h, w) + clip = resize(clip, size, interpolation_mode) + return clip + + +def center_crop(clip, crop_size): + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h >= th and w >= tw, "height and width must be no smaller than crop_size" + + i = int(round((h - th) / 2.0)) + j = int(round((w - tw) / 2.0)) + return crop(clip, i, j, th, tw) + +def corner_crop(clip, crop_size, i, j): + assert _is_tensor_video_clip(clip),"clip should be a 4d torch tensor" + h, w = clip.size(-2), clip.size(-1) + th, tw = crop_size + assert h>=th and w>=tw, "height and width must be no smaller than crop_size" + return crop(clip, i, j, th, tw) + + +def to_tensor(clip): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + """ + _is_tensor_video_clip(clip) + if not clip.dtype == torch.uint8: + raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype)) + return clip.float().permute(3, 0, 1, 2) / 255.0 + + +def normalize(clip, mean, std, inplace=False): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + mean (tuple): pixel RGB mean. 
Size is (3) + std (tuple): pixel standard deviation. Size is (3) + Returns: + normalized clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + if not inplace: + clip = clip.clone() + mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device) + std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device) + if clip.size(0) == 3: + clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None]) + elif clip.size(0) == 1: + #make it compatibale with depth image + mean = mean.mean() + std = std.mean() + clip.sub_(mean).div_(std) + else: + raise NotImplementedError() + return clip + + +def hflip(clip): + """ + Args: + clip (torch.tensor): Video clip to be normalized. Size is (C, T, H, W) + Returns: + flipped clip (torch.tensor): Size is (C, T, H, W) + """ + assert _is_tensor_video_clip(clip), "clip should be a 4D torch.tensor" + return clip.flip((-1)) diff --git a/videoloaders/transform_temporal.py b/videoloaders/transform_temporal.py new file mode 100755 index 0000000000000000000000000000000000000000..14948b753f81ab5e6fd31b6de3d65b9c59e82348 --- /dev/null +++ b/videoloaders/transform_temporal.py @@ -0,0 +1,162 @@ +import os +import random +import math + + +def temporal_batching_index(fr,length=16): + ''' + Do padding or half-overlapping clips for video. + + Input: + fr: number of frames + Output: + batch_indices: array for batch where each element is frame index + ''' + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + return [[0]*left + list(range(fr)) + [fr-1]*right] + + batch_indices = [] + last_idx = fr - 1 + assert length%2 == 0 + half = int(length/2) + for i in range(0,fr-half,half): + frame_indices = [0,]*length + for j in range(length): + current_idx = i + j + if current_idx < last_idx: + frame_indices[j] = current_idx + else: + frame_indices[j] = last_idx + batch_indices.append(frame_indices) + + return batch_indices + +def temporal_sliding_window(clip,window = 16): + ''' + Make a batched tensor with 16 frame sliding window with the overlap of 8. + If a clip is not the multiply of 8, it's padded with the last frames. (1,2...,13,14,14,14) for (1,..,14) + If a clip is less than 16 frames, padding is applied like (1,1,....,1,2,3,4,5,5,...,5,5) for (1,2,3,4,5) + This can be used for sliding window evaluation. + + Input: list of image paths + Output: torch tensor of shape of (batch,ch,16,h,w). + ''' + + batch_indices = temporal_batching_index(len(clip),length = window) + + return [[clip[idx] for idx in indices] for indices in batch_indices] + +def temporal_center_crop(clip,length = 16): + ''' + Input: list of image paths + Output: torch tensor of shape of (1,ch,16,h,w). + ''' + fr = len(clip) + if fr < length: + #e.g. (1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + middle = int(fr/2) + assert length%2 == 0 + half = int(length/2) + start = middle - half + output = clip[start : start+length] + + return output[::2] + + + +def random_temporal_crop(clip,length = 16): + ''' + Just randomly sample 16 consecutive frames + if less than 16 frames, just add padding. + ''' + fr = len(clip) + if fr < length: + #e.g. 
(1,2,3,4,5) to (1,1,....,1,2,3,4,5,5,...,5,5) + right = int((length-fr)/2) + left = length - right - fr + indicies = [0]*left + list(range(fr)) + [fr-1]*right + output = [clip[i] for i in indicies] + elif fr==length: + output = clip + else: + start=random.randint(0,fr-length) + output = clip[start : start+length] + return output[::2] + + +def use_all_frames(clip): + ''' + Just use it as it is :) + ''' + return clip + +def looppadding(clip, length=16): + + + out = clip + + for index in out: + if len(out) >= length: + break + out.append(index) + + return out[::2] + +def temporal_even_crop(clip, length=16, n_samples=1): + + clip = list(clip) + n_frames = len(clip) + indices = list(range(len(clip))) + stride = max( + 1, math.ceil((n_frames - 1 - length) / (n_samples - 1))) + + out = [] + for begin_index in indices[::stride]: + if len(out) >= n_samples: + break + end_index = min(indices[-1] + 1, begin_index + length) + sample = list(range(begin_index, end_index)) + + if len(sample) < length: + out.append([clip[i] for i in looppadding(sample, length=length)]) + # out.append(clip[looppadding(sample, length=length)]) + break + else: + out.append([clip[i] for i in sample[::2]]) + # out.append(clip[sample[::2]]) + + return out + + +class TemporalTransform(object): + def __init__(self,length,mode="center"): + self.mode = mode + self.length = length + #pass dummpy in order to catch incoored mode + self.__call__(range(128)) + + def __call__(self, clip): + if self.mode == "random": + return random_temporal_crop(clip,self.length) + elif self.mode == "center": + return temporal_center_crop(clip,self.length) + elif self.mode == "all" or self.mode == "nocrop": + #note that length cannot be satisfied! + return use_all_frames(clip) + elif self.mode == "slide": + #note that output has one more dimention + return temporal_sliding_window(clip,self.length) + elif self.mode == "even": + return temporal_even_crop(clip, self.length, n_samples=5) + else: + raise NotImplementedError("this option is not defined:",self.mode) \ No newline at end of file diff --git a/videoloaders/transforms_video.py b/videoloaders/transforms_video.py new file mode 100755 index 0000000000000000000000000000000000000000..156cf463abf94932421cb2dd7d500fdcdb998bbe --- /dev/null +++ b/videoloaders/transforms_video.py @@ -0,0 +1,312 @@ +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/functional_video.py +#copied from https://raw.githubusercontent.com/pytorch/vision/f0d3daa7f65bcde560e242d9bccc284721368f02/torchvision/transforms/transforms_video.py +#!/usr/bin/env python3 + +import numbers +import random +import torch + +try: + import accimage +except: + pass + +from torchvision.transforms import ( + RandomResizedCrop, +) + +from . import functional_video as F + +def _get_image_size(img): + if isinstance(img, torch.Tensor) and img.dim() > 2: + return img.shape[-2:][::-1] + else: + raise TypeError("Unexpected type {}".format(type(img))) + +class RandomCrop(object): + """Crop the given PIL Image at a random location. + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + padding (int or sequence, optional): Optional padding on each border + of the image. Default is None, i.e no padding. If a sequence of length + 4 is provided, it is used to pad left, top, right, bottom borders + respectively. 
If a sequence of length 2 is provided, it is used to + pad left/right, top/bottom borders, respectively. + pad_if_needed (boolean): It will pad the image if smaller than the + desired size to avoid raising an exception. Since cropping is done + after padding, the padding seems to be done at a random offset. + fill: Pixel fill value for constant fill. Default is 0. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. + - constant: pads with a constant value, this value is specified with fill + - edge: pads with the last value on the edge of the image + - reflect: pads with reflection of image (without repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + - symmetric: pads with reflection of image (repeating the last value on the edge) + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + """ + + def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + self.padding = padding + self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for a random crop. + Args: + img (PIL Image): Image to be cropped. + output_size (tuple): Expected output size of the crop. + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. + """ + w, h = _get_image_size(img) + th, tw = output_size + if w == tw and h == th: + return 0, 0, h, w + + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. + """ + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + # pad the width if needed + if self.pad_if_needed and img.size[0] < self.size[1]: + img = F.pad(img, (self.size[1] - img.size[0], 0), self.fill, self.padding_mode) + # pad the height if needed + if self.pad_if_needed and img.size[1] < self.size[0]: + img = F.pad(img, (0, self.size[0] - img.size[1]), self.fill, self.padding_mode) + + i, j, h, w = self.get_params(img, self.size) + + return F.crop(img, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding) + + + + + +class RandomCropVideo(RandomCrop): + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. 
+ size is (C, T, OH, OW) + """ + i, j, h, w = self.get_params(clip, self.size) + return F.crop(clip, i, j, h, w) + + def __repr__(self): + return self.__class__.__name__ + '(size={0})'.format(self.size) + + +class RandomResizedCropVideo(RandomResizedCrop): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation_mode="bilinear", + ): + if isinstance(size, tuple): + assert len(size) == 2, "size should be tuple (height, width)" + self.size = size + else: + self.size = (size, size) + + self.interpolation_mode = interpolation_mode + self.scale = scale + self.ratio = ratio + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: randomly cropped/resized video clip. + size is (C, T, H, W) + """ + i, j, h, w = self.get_params(clip, self.scale, self.ratio) + return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode) + + def __repr__(self): + return self.__class__.__name__ + \ + '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format( + self.size, self.interpolation_mode, self.scale, self.ratio + ) + + +class CenterCropVideo(object): + def __init__(self, crop_size): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + + return F.center_crop(clip, self.crop_size) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + +class CornerCropVideo(object): + def __init__(self, crop_size, loc="tr"): + if isinstance(crop_size, numbers.Number): + self.crop_size = (int(crop_size), int(crop_size)) + else: + self.crop_size = crop_size + + def __call__(self, clip, loc="tr"): + """ + Args: + clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W) + Returns: + torch.tensor: central cropping of video clip. Size is + (C, T, crop_size, crop_size) + """ + if loc == "tr": + i = 0 + j = 0 + elif loc == "center": + return F.corner_crop(clip, self.crop_size) + else: + i = clip.size(-2) - self.crop_size + j = clip.size(-1) - self.crop_size + return F.corner_crop(clip, self.crop_size, i, j) + + def __repr__(self): + return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size) + + +class NormalizeVideo(object): + """ + Normalize the video clip by mean subtraction and division by standard deviation + Args: + mean (3-tuple): pixel RGB mean + std (3-tuple): pixel RGB standard deviation + inplace (boolean): whether do in-place normalization + """ + + def __init__(self, mean, std, inplace=False): + self.mean = mean + self.std = std + self.inplace = inplace + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): video clip to be normalized. 
Size is (C, T, H, W) + """ + return F.normalize(clip, self.mean, self.std, self.inplace) + + def __repr__(self): + return self.__class__.__name__ + '(mean={0}, std={1}, inplace={2})'.format( + self.mean, self.std, self.inplace) + + +class ToTensorVideo(object): + """ + Convert tensor data type from uint8 to float, divide value by 255.0 and + permute the dimenions of clip tensor + """ + + def __init__(self): + pass + + def __call__(self, clip): + """ + Args: + clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C) + Return: + clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W) + """ + return F.to_tensor(clip) + + def __repr__(self): + return self.__class__.__name__ + + +class RandomHorizontalFlipVideo(object): + """ + Flip the video clip along the horizonal direction with a given probability + Args: + p (float): probability of the clip being flipped. Default value is 0.5 + """ + + def __init__(self, p=0.5): + self.p = p + + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, H, W) + """ + if random.random() < self.p: + clip = F.hflip(clip) + return clip + + def __repr__(self): + return self.__class__.__name__ + "(p={0})".format(self.p) + + + +class ResizeVideo(object): + """ + Resize the video clip + """ + def __init__(self, w,h): + self.w = w + self.h = h + def __call__(self, clip): + """ + Args: + clip (torch.tensor): Size is (C, T, H, W) + Return: + clip (torch.tensor): Size is (C, T, h, w) + """ + #interpolare needs (T,C, H, W) order while clip is (C, T, H, W) + return torch.nn.functional.interpolate( + clip.permute(1,0,2,3),(self.h,self.w),mode="bilinear",align_corners=False).permute(1,0,2,3) + + def __repr__(self): + return self.__class__.__name__ + "(w=%d,h=%d)"%(self.w,self.h) \ No newline at end of file
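
To round off the files above, here is a minimal usage sketch for the experiment helpers in `utils/utils.py` (seeding, run-directory creation, and checkpointing). It assumes the repository root is on `PYTHONPATH` so that `utils.utils` is importable; the `--lr` argument and the `torch.nn.Linear` model are placeholders for illustration only.

```
import argparse
import torch
from utils.utils import (setup_seed, make_deterministic, setup_savedir,
                         save_args, save_checkpoint)

parser = argparse.ArgumentParser()
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--seed", type=int, default=-1)
args = parser.parse_args([])            # empty list -> just use the defaults for this sketch

args.seed = setup_seed(args.seed)       # negative seed -> derive one from the clock (or SATOSHI_SEED)
make_deterministic(args.seed)           # seed python/numpy/torch and force deterministic cuDNN

savedir = setup_savedir(prefix="loconet", basedir="./experiments",
                        args=args, append_args=["lr", "seed"])
save_args(savedir, args)                # writes args.json into the run directory

model = torch.nn.Linear(10, 2)          # stand-in for the real network
save_checkpoint(f"{savedir}/checkpoint.pth", model)   # stores {"model": model.state_dict()}
```

If a later run resumes from this checkpoint with a slightly different architecture, `resume_model` falls back to loading only the overlapping parameter names and prints the keys it ignored.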