xiziwang committed on
Commit
2e36228
1 Parent(s): a9c14c0

push files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. README.md +61 -0
  2. __pycache__/dataLoader_multiperson.cpython-37.pyc +0 -0
  3. __pycache__/loconet.cpython-37.pyc +0 -0
  4. __pycache__/loss_multi.cpython-37.pyc +0 -0
  5. __pycache__/talkNet_config_multi.cpython-37.pyc +0 -0
  6. builder.py +95 -0
  7. configs/multi.yaml +51 -0
  8. dataLoaderTalkSet.py +182 -0
  9. dataLoader_multiperson.py +402 -0
  10. dlhammer/.gitignore +3 -0
  11. dlhammer/LICENSE +201 -0
  12. dlhammer/README.md +2 -0
  13. dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py +110 -0
  14. dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py +33 -0
  15. dlhammer/dlhammer/__init__.py +1 -0
  16. dlhammer/dlhammer/argparser.py +109 -0
  17. dlhammer/dlhammer/bootstrap.py +33 -0
  18. dlhammer/dlhammer/logger.py +66 -0
  19. dlhammer/dlhammer/test/config.yml +32 -0
  20. dlhammer/dlhammer/test/test_args.py +20 -0
  21. dlhammer/dlhammer/test/test_logger.py +22 -0
  22. dlhammer/dlhammer/utils/__init__.py +0 -0
  23. dlhammer/dlhammer/utils/misc.py +125 -0
  24. dlhammer/dlhammer/utils/system.py +25 -0
  25. environment.yml +298 -0
  26. legacy/talkNet_multi_multicard.py +124 -0
  27. legacy/talkNet_multicard.py +146 -0
  28. legacy/talkNet_orig.py +102 -0
  29. legacy/trainTalkNet_multicard.py +171 -0
  30. legacy/train_multi.py +156 -0
  31. loconet.py +182 -0
  32. loss_multi.py +72 -0
  33. metrics/AverageMeter.py +18 -0
  34. metrics/__pycache__/.nfs000000035f4a8257000000eb +0 -0
  35. metrics/__pycache__/AverageMeter.cpython-36.pyc +0 -0
  36. metrics/__pycache__/AverageMeter.cpython-38.pyc +0 -0
  37. metrics/__pycache__/accuracy.cpython-36.pyc +0 -0
  38. metrics/__pycache__/accuracy.cpython-38.pyc +0 -0
  39. metrics/accuracy.py +20 -0
  40. model/.DS_Store +0 -0
  41. model/__init__.py +5 -0
  42. model/__pycache__/__init__.cpython-36.pyc +0 -0
  43. model/__pycache__/__init__.cpython-37.pyc +0 -0
  44. model/__pycache__/attentionLayer.cpython-37.pyc +0 -0
  45. model/__pycache__/convLayer.cpython-37.pyc +0 -0
  46. model/__pycache__/loconet_encoder.cpython-37.pyc +0 -0
  47. model/__pycache__/position_encoding.cpython-36.pyc +0 -0
  48. model/__pycache__/talkNetModel.cpython-37.pyc +0 -0
  49. model/__pycache__/transformer.cpython-36.pyc +0 -0
  50. model/__pycache__/utils.cpython-36.pyc +0 -0
README.md ADDED
@@ -0,0 +1,61 @@
1
+ ## LoCoNet: Long-Short Context Network for Active Speaker Detection
2
+
3
+
4
+
5
+ ### Dependencies
6
+
7
+ Start by building the environment:
8
+ ```
9
+ conda env create -f requirements.yml
10
+ conda activate loconet
11
+ ```
12
+ export PYTHONPATH=**project_dir**/dlhammer:$PYTHONPATH
13
+ Replace **project_dir** with the location of your code base.
14
+
15
+
16
+
17
+ ### Data preparation
18
+
19
+ We follow TalkNet's data preparation script to download and prepare the AVA dataset.
20
+
21
+ ```
22
+ python train.py --dataPathAVA AVADataPath --download
23
+ ```
24
+
25
+ `AVADataPath` is the folder where the AVA dataset and its preprocessing outputs will be saved; the details can be found [here](https://github.com/TaoRuijie/TalkNet_ASD/blob/main/utils/tools.py#L34). Please read them carefully.
26
+
27
+ After the AVA dataset is downloaded, please update the `DATA.dataPathAVA` entry in the config file (`configs/multi.yaml`), as shown below.
28
+
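+ For reference, the relevant entry in `configs/multi.yaml` looks like this (the path below is only an example and should point to your own copy of the dataset):
+ ```
+ DATA:
+   dataPathAVA: /path/to/AVA_dataset/
+ ```
+ 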
29
+ #### Training script
30
+ ```
31
+ python -W ignore::UserWarning train.py --cfg configs/multi.yaml OUTPUT_DIR <output directory>
32
+ ```
33
+
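+ For example, to train with the default config and write checkpoints and logs to a local directory (the path below is only an illustration):
+ ```
+ python -W ignore::UserWarning train.py --cfg configs/multi.yaml OUTPUT_DIR ./exps/loconet_ava
+ ```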
34
+
35
+
36
+ #### Pretrained model
37
+
38
+ Please download the LoCoNet trained weights on AVA dataset [here](https://drive.google.com/file/d/1EX-V464jCD6S-wg68yGuAa-UcsMrw8mK/view?usp=sharing).
39
+
40
+ ```
41
+ python -W ignore::UserWarning test_multicard.py --cfg configs/multi.yaml RESUME_PATH {model download path}
42
+ ```
43
+
44
+ ### Citation
45
+
46
+ Please cite the following if our paper or code is helpful to your research.
47
+ ```
48
+ @article{wang2023loconet,
49
+ title={LoCoNet: Long-Short Context Network for Active Speaker Detection},
50
+ author={Wang, Xizi and Cheng, Feng and Bertasius, Gedas and Crandall, David},
51
+ journal={arXiv preprint arXiv:2301.08237},
52
+ year={2023}
53
+ }
54
+ ```
55
+
56
+
57
+ ### Acknowledgement
58
+
59
+ The code base of this project is adapted from [TalkNet](https://github.com/TaoRuijie/TalkNet-ASD), a very easy-to-use ASD pipeline.
60
+
61
+
__pycache__/dataLoader_multiperson.cpython-37.pyc ADDED
Binary file (10.8 kB).
 
__pycache__/loconet.cpython-37.pyc ADDED
Binary file (6.26 kB).
 
__pycache__/loss_multi.cpython-37.pyc ADDED
Binary file (2.61 kB).
 
__pycache__/talkNet_config_multi.cpython-37.pyc ADDED
Binary file (6.59 kB).
 
builder.py ADDED
@@ -0,0 +1,95 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import warnings
11
+
12
+ from mmcv.cnn import MODELS as MMCV_MODELS
13
+ from mmcv.utils import Registry
14
+
15
+ from mmaction.utils import import_module_error_func
16
+
17
+ MODELS = Registry('models', parent=MMCV_MODELS)
18
+ BACKBONES = MODELS
19
+ NECKS = MODELS
20
+ HEADS = MODELS
21
+ RECOGNIZERS = MODELS
22
+ LOSSES = MODELS
23
+ LOCALIZERS = MODELS
24
+
25
+ try:
26
+ from mmdet.models.builder import DETECTORS, build_detector
27
+ except (ImportError, ModuleNotFoundError):
28
+ # Define a fallback registry and build function so that imports still work when mmdet is not installed
29
+ DETECTORS = MODELS
30
+
31
+ @import_module_error_func('mmdet')
32
+ def build_detector(cfg, train_cfg, test_cfg):
33
+ pass
34
+
35
+
36
+ def build_backbone(cfg):
37
+ """Build backbone."""
38
+ return BACKBONES.build(cfg)
39
+
40
+
41
+ def build_head(cfg):
42
+ """Build head."""
43
+ return HEADS.build(cfg)
44
+
45
+
46
+ def build_recognizer(cfg, train_cfg=None, test_cfg=None):
47
+ """Build recognizer."""
48
+ if train_cfg is not None or test_cfg is not None:
49
+ warnings.warn(
50
+ 'train_cfg and test_cfg is deprecated, '
51
+ 'please specify them in model. Details see this '
52
+ 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning)
53
+ assert cfg.get(
54
+ 'train_cfg'
55
+ ) is None or train_cfg is None, 'train_cfg specified in both outer field and model field' # noqa: E501
56
+ assert cfg.get(
57
+ 'test_cfg'
58
+ ) is None or test_cfg is None, 'test_cfg specified in both outer field and model field ' # noqa: E501
59
+ return RECOGNIZERS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg))
60
+
61
+
62
+ def build_loss(cfg):
63
+ """Build loss."""
64
+ return LOSSES.build(cfg)
65
+
66
+
67
+ def build_localizer(cfg):
68
+ """Build localizer."""
69
+ return LOCALIZERS.build(cfg)
70
+
71
+
72
+ def build_model(cfg, train_cfg=None, test_cfg=None):
73
+ """Build model."""
74
+ args = cfg.copy()
75
+ obj_type = args.pop('type')
76
+ if obj_type in LOCALIZERS:
77
+ return build_localizer(cfg)
78
+ if obj_type in RECOGNIZERS:
79
+ return build_recognizer(cfg, train_cfg, test_cfg)
80
+ if obj_type in DETECTORS:
81
+ if train_cfg is not None or test_cfg is not None:
82
+ warnings.warn(
83
+ 'train_cfg and test_cfg is deprecated, '
84
+ 'please specify them in model. Details see this '
85
+ 'PR: https://github.com/open-mmlab/mmaction2/pull/629', UserWarning)
86
+ return build_detector(cfg, train_cfg, test_cfg)
87
+ model_in_mmdet = ['FastRCNN']
88
+ if obj_type in model_in_mmdet:
89
+ raise ImportError('Please install mmdet for spatial temporal detection tasks.')
90
+ raise ValueError(f'{obj_type} is not registered in ' 'LOCALIZERS, RECOGNIZERS or DETECTORS')
91
+
92
+
93
+ def build_neck(cfg):
94
+ """Build neck."""
95
+ return NECKS.build(cfg)
configs/multi.yaml ADDED
@@ -0,0 +1,51 @@
1
+ SEED: "20210617"
2
+ NUM_GPUS: 4
3
+ NUM_WORKERS: 6
4
+ LOG_NAME: 'config.txt'
5
+ OUTPUT_DIR: '/nfs/joltik/data/ssd/xiziwang/TalkNet_models/' # savePath
6
+ evalDataType: "val"
7
+ downloadAVA: False
8
+ evaluation: False
9
+ RESUME: False
10
+ RESUME_PATH: ""
11
+ RESUME_EPOCH: 0
12
+
13
+ DATA:
14
+ dataPathAVA: '/nfs/jolteon/data/ssd/xiziwang/AVA_dataset/'
15
+
16
+ DATALOADER:
17
+ nDataLoaderThread: 4
18
+
19
+
20
+ SOLVER:
21
+ OPTIMIZER: "adam"
22
+ BASE_LR: 5e-5
23
+ SCHEDULER:
24
+ NAME: "multistep"
25
+ GAMMA: 0.95
26
+
27
+ MODEL:
28
+ NUM_SPEAKERS: 3
29
+ CLIP_LENGTH: 200
30
+ AV: "speaker_temporal"
31
+ AV_layers: 3
32
+ ADJUST_ATTENTION: 0
33
+
34
+ TRAIN:
35
+ BATCH_SIZE: 1
36
+ MAX_EPOCH: 25
37
+ AUDIO_AUG: 1
38
+ TEST_INTERVAL: 1
39
+ TRAINER_GPU: 4
40
+
41
+
42
+ VAL:
43
+ BATCH_SIZE: 1
44
+
45
+ TEST:
46
+ BATCH_SIZE: 1
47
+ DATASET: 'seen'
48
+ MODEL: 'unseen'
49
+
50
+
51
+
dataLoaderTalkSet.py ADDED
@@ -0,0 +1,182 @@
1
+ import os, torch, numpy, cv2, imageio, random, python_speech_features
2
+ import matplotlib.pyplot as plt
3
+ from scipy.io import wavfile
4
+ from glob import glob
5
+ from torchvision.transforms import RandomCrop
6
+ from scipy import signal
7
+
8
+ def get_noise_list(musanPath, rirPath):
9
+ augment_files = glob(os.path.join(musanPath, '*/*/*/*.wav'))
10
+ noiselist = {}
11
+ rir = numpy.load(rirPath)
12
+ for file in augment_files:
13
+ if not file.split('/')[-4] in noiselist:
14
+ noiselist[file.split('/')[-4]] = []
15
+ noiselist[file.split('/')[-4]].append(file)
16
+ return rir, noiselist
17
+
18
+ def augment_wav(audio, aug_type, rir, noiselist):
19
+ if aug_type == 'rir':
20
+ rir_gains = numpy.random.uniform(-7,3,1)
21
+ rir_filts = random.choice(rir)
22
+ rir = numpy.multiply(rir_filts, pow(10, 0.1 * rir_gains))
23
+ audio = signal.convolve(audio, rir, mode='full')[:len(audio)]
24
+ else:
25
+ noisecat = aug_type
26
+ noisefile = random.choice(noiselist[noisecat].copy())
27
+ snr = [random.uniform({'noise':[0,15],'music':[5,15]}[noisecat][0], {'noise':[0,15],'music':[5,15]}[noisecat][1])]
28
+ _, noiseaudio = wavfile.read(noisefile)
29
+ if len(noiseaudio) < len(audio):
30
+ shortage = len(audio) - len(noiseaudio)
31
+ noiseaudio = numpy.pad(noiseaudio, (0, shortage), 'wrap')
32
+ else:
33
+ noiseaudio = noiseaudio[:len(audio)]
34
+
35
+ noise_db = 10 * numpy.log10(numpy.mean(abs(noiseaudio ** 2)) + 1e-4)
36
+ clean_db = 10 * numpy.log10(numpy.mean(abs(audio ** 2)) + 1e-4)
37
+ noise = numpy.sqrt(10 ** ((clean_db - noise_db - snr) / 10)) * noiseaudio
38
+ audio = audio + noise
39
+ return audio.astype(numpy.int16)
40
+
41
+ def load_audio(data, data_path, length, start, end, audio_aug, rirlist = None, noiselist = None):
42
+ # Find the path of the audio data
43
+ data_type = data[0]
44
+ id_name = data[1][:8]
45
+ file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \
46
+ '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + data[2].split('/')[2] + '.wav'
47
+ audio_file_path = os.path.join(data_path, data_type, id_name, file_name)
48
+ # Load audio, compute MFCC, cut it to the required length
49
+ _, audio = wavfile.read(audio_file_path)
50
+
51
+ if audio_aug == True:
52
+ augtype = random.randint(0,3)
53
+ if augtype == 1: # rir
54
+ audio = augment_wav(audio, 'rir', rirlist, noiselist)
55
+ elif augtype == 2:
56
+ audio = augment_wav(audio, 'noise', rirlist, noiselist)
57
+ elif augtype == 3:
58
+ audio = augment_wav(audio, 'music', rirlist, noiselist)
59
+ else:
60
+ audio = audio
61
+
62
+ feature = python_speech_features.mfcc(audio, 16000, numcep = 13, winlen = 0.025, winstep = 0.010)
63
+ length_audio = int(round(length * 100))
64
+ if feature.shape[0] < length_audio:
65
+ shortage = length_audio - feature.shape[0]
66
+ feature = numpy.pad(feature, ((0, shortage), (0,0)), 'wrap')
67
+ feature = feature[int(round(start * 100)):int(round(end * 100)),:]
68
+ return feature
69
+
70
+ def load_video(data, data_path, length, start, end, visual_aug):
71
+ # Find the path of the visual data
72
+ data_type = data[0]
73
+ id_name = data[1][:8]
74
+ file_name = data[1].split('/')[0] + '_' + data[1].split('/')[1] + '_' + data[1].split('/')[2] + \
75
+ '_' + data[2].split('/')[0] + '_' + data[2].split('/')[1] + '_' + data[2].split('/')[2] + '.mp4'
76
+ video_file_path = os.path.join(data_path, data_type, id_name, file_name)
77
+ # Load visual frame-by-frame, cut it to the required length
78
+ length_video = int(round((end - start) * 25))
79
+ video = cv2.VideoCapture(video_file_path)
80
+ faces = []
81
+ augtype = 'orig'
82
+
83
+ if visual_aug == True:
84
+ new = int(112*random.uniform(0.7, 1))
85
+ x, y = numpy.random.randint(0, 112 - new), numpy.random.randint(0, 112 - new)
86
+ M = cv2.getRotationMatrix2D((112/2,112/2), random.uniform(-15, 15), 1)
87
+ augtype = random.choice(['orig', 'flip', 'crop', 'rotate'])
88
+
89
+ num_frame = 0
90
+ while video.isOpened():
91
+ ret, frames = video.read()
92
+ if ret == True:
93
+ num_frame += 1
94
+ if num_frame >= int(round(start * 25)) and num_frame < int(round(end * 25)):
95
+ face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
96
+ face = cv2.resize(face, (224,224))
97
+ face = face[int(112-(112/2)):int(112+(112/2)), int(112-(112/2)):int(112+(112/2))]
98
+ if augtype == 'orig':
99
+ faces.append(face)
100
+ elif augtype == 'flip':
101
+ faces.append(cv2.flip(face, 1))
102
+ elif augtype == 'crop':
103
+ faces.append(cv2.resize(face[y:y+new, x:x+new] , (112,112)))
104
+ elif augtype == 'rotate':
105
+ faces.append(cv2.warpAffine(face, M, (112,112)))
106
+ else:
107
+ break
108
+ video.release()
109
+ faces = numpy.array(faces)
110
+ if faces.shape[0] < length_video:
111
+ shortage = length_video - faces.shape[0]
112
+ faces = numpy.pad(faces, ((0,shortage), (0,0),(0,0)), 'wrap')
113
+ # faces = numpy.array(faces)[int(round(start * 25)):int(round(end * 25)),:,:]
114
+ return faces
115
+
116
+ def load_label(data, length, start, end):
117
+ labels_all = []
118
+ labels = []
119
+ data_type = data[0]
120
+ start_T, end_T, start_F, end_F = float(data[4]), float(data[5]), float(data[6]), float(data[7])
121
+ for i in range(int(round(length * 100))):
122
+ if data_type == 'TAudio':
123
+ labels_all.append(1)
124
+ elif data_type == 'FAudio' or data_type == 'FSilence':
125
+ labels_all.append(0)
126
+ else:
127
+ if i >= int(round(start_T * 100)) and i <= int(round(end_T * 100)):
128
+ labels_all.append(1)
129
+ else:
130
+ labels_all.append(0)
131
+ for i in range(int(round(length * 25))):
132
+ labels.append(int(round(sum(labels_all[i*4: (i+1)*4]) / 4)))
133
+ return labels[round(start*25): round(end*25)]
134
+
135
+ class loader_TalkSet(object):
136
+ def __init__(self, trial_file_name, data_path, audio_aug, visual_aug, musanPath, rirPath,**kwargs):
137
+ self.data_path = data_path
138
+ self.audio_aug = audio_aug
139
+ self.visual_aug = visual_aug
140
+ self.minibatch = []
141
+ self.rir, self.noiselist = get_noise_list(musanPath, rirPath)
142
+ mix_lst = open(trial_file_name).read().splitlines()
143
+ mix_lst = list(filter(lambda x: float(x.split()[3]) >= 1, mix_lst)) # filter the video less than 1s
144
+ # mix_lst = list(filter(lambda x: x.split()[0] == 'TSilence', mix_lst))
145
+ sorted_mix_lst = sorted(mix_lst, key=lambda data: (float(data.split()[3]), int(data.split()[-1])), reverse=True)
146
+ start = 0
147
+ while True:
148
+ length_total = float(sorted_mix_lst[start].split()[3])
149
+ batch_size = int(250 / length_total)
150
+ end = min(len(sorted_mix_lst), start + batch_size)
151
+ self.minibatch.append(sorted_mix_lst[start:end])
152
+ if end == len(sorted_mix_lst):
153
+ break
154
+ start = end
155
+ # self.minibatch = self.minibatch[0:5]
156
+
157
+ def __getitem__(self, index):
158
+ batch_lst = self.minibatch[index]
159
+ length_total = float(batch_lst[-1].split()[3])
160
+ length_total = (int(round(length_total * 100)) - int(round(length_total * 100)) % 4) / 100
161
+ audio_feature, video_feature, labels = [], [], []
162
+ duration = random.choice([1,2,4,6])
163
+ #duration = 6
164
+ length = min(length_total, duration)
165
+ if length == duration:
166
+ start = int(round(random.randint(0, round(length_total * 25) - round(length * 25)) * 0.04 * 100)) / 100
167
+ end = int(round((start + length) * 100)) / 100
168
+ else:
169
+ start, end = 0, length
170
+
171
+ for line in batch_lst:
172
+ data = line.split()
173
+ audio_feature.append(load_audio(data, self.data_path, length_total, start, end, audio_aug = self.audio_aug, rirlist = self.rir, noiselist = self.noiselist))
174
+ video_feature.append(load_video(data, self.data_path, length_total, start, end, visual_aug = self.visual_aug))
175
+ labels.append(load_label(data, length_total, start, end))
176
+
177
+ return torch.FloatTensor(numpy.array(audio_feature)), \
178
+ torch.FloatTensor(numpy.array(video_feature)), \
179
+ torch.LongTensor(numpy.array(labels))
180
+
181
+ def __len__(self):
182
+ return len(self.minibatch)
dataLoader_multiperson.py ADDED
@@ -0,0 +1,402 @@
1
+ import os, torch, numpy, cv2, random, glob, python_speech_features, json, math
2
+ from scipy.io import wavfile
3
+ from torchvision.transforms import RandomCrop
4
+ from operator import itemgetter
5
+ from torchvggish import vggish_input, vggish_params, mel_features
6
+
7
+
8
+ def overlap(audio, noiseAudio):
9
+ snr = [random.uniform(-5, 5)]
10
+ if len(noiseAudio) < len(audio):
11
+ shortage = len(audio) - len(noiseAudio)
12
+ noiseAudio = numpy.pad(noiseAudio, (0, shortage), 'wrap')
13
+ else:
14
+ noiseAudio = noiseAudio[:len(audio)]
15
+ noiseDB = 10 * numpy.log10(numpy.mean(abs(noiseAudio**2)) + 1e-4)
16
+ cleanDB = 10 * numpy.log10(numpy.mean(abs(audio**2)) + 1e-4)
17
+ noiseAudio = numpy.sqrt(10**((cleanDB - noiseDB - snr) / 10)) * noiseAudio
18
+ audio = audio + noiseAudio
19
+ return audio.astype(numpy.int16)
20
+
21
+
22
+ def load_audio(data, dataPath, numFrames, audioAug, audioSet=None):
23
+ dataName = data[0]
24
+ fps = float(data[2])
25
+ audio = audioSet[dataName]
26
+ if audioAug == True:
27
+ augType = random.randint(0, 1)
28
+ if augType == 1:
29
+ # mix in another randomly chosen clip from audioSet (overlap takes two waveforms)
+ audio = overlap(audio, audioSet[random.choice(list(audioSet.keys()))])
30
+ else:
31
+ audio = audio
32
+ # fps is not always 25, in order to align the visual, we modify the window and step in MFCC extraction process based on fps
33
+ audio = python_speech_features.mfcc(audio,
34
+ 16000,
35
+ numcep=13,
36
+ winlen=0.025 * 25 / fps,
37
+ winstep=0.010 * 25 / fps)
38
+ maxAudio = int(numFrames * 4)
39
+ if audio.shape[0] < maxAudio:
40
+ shortage = maxAudio - audio.shape[0]
41
+ audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap')
42
+ audio = audio[:int(round(numFrames * 4)), :]
43
+ return audio
44
+
45
+
46
+ def load_single_audio(audio, fps, numFrames, audioAug=False):
47
+ audio = python_speech_features.mfcc(audio,
48
+ 16000,
49
+ numcep=13,
50
+ winlen=0.025 * 25 / fps,
51
+ winstep=0.010 * 25 / fps)
52
+ maxAudio = int(numFrames * 4)
53
+ if audio.shape[0] < maxAudio:
54
+ shortage = maxAudio - audio.shape[0]
55
+ audio = numpy.pad(audio, ((0, shortage), (0, 0)), 'wrap')
56
+ audio = audio[:int(round(numFrames * 4)), :]
57
+ return audio
58
+
59
+
60
+ def load_visual(data, dataPath, numFrames, visualAug):
61
+ dataName = data[0]
62
+ videoName = data[0][:11]
63
+ faceFolderPath = os.path.join(dataPath, videoName, dataName)
64
+ faceFiles = glob.glob("%s/*.jpg" % faceFolderPath)
65
+ sortedFaceFiles = sorted(faceFiles,
66
+ key=lambda data: (float(data.split('/')[-1][:-4])),
67
+ reverse=False)
68
+ faces = []
69
+ H = 112
70
+ if visualAug == True:
71
+ new = int(H * random.uniform(0.7, 1))
72
+ x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new)
73
+ M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1)
74
+ augType = random.choice(['orig', 'flip', 'crop', 'rotate'])
75
+ else:
76
+ augType = 'orig'
77
+ for faceFile in sortedFaceFiles[:numFrames]:
78
+ face = cv2.imread(faceFile)
79
+
80
+ face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
81
+ face = cv2.resize(face, (H, H))
82
+ if augType == 'orig':
83
+ faces.append(face)
84
+ elif augType == 'flip':
85
+ faces.append(cv2.flip(face, 1))
86
+ elif augType == 'crop':
87
+ faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H)))
88
+ elif augType == 'rotate':
89
+ faces.append(cv2.warpAffine(face, M, (H, H)))
90
+ faces = numpy.array(faces)
91
+ return faces
92
+
93
+
94
+ def load_label(data, numFrames):
95
+ res = []
96
+ labels = data[3].replace('[', '').replace(']', '')
97
+ labels = labels.split(',')
98
+ for label in labels:
99
+ res.append(int(label))
100
+ res = numpy.array(res[:numFrames])
101
+ return res
102
+
103
+
104
+ class train_loader(object):
105
+
106
+ def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers):
107
+ self.cfg = cfg
108
+ self.audioPath = audioPath
109
+ self.visualPath = visualPath
110
+ self.candidate_speakers = num_speakers
111
+ self.path = os.path.join(cfg.DATA.dataPathAVA, "csv")
112
+ self.entity_data = json.load(open(os.path.join(self.path, 'train_entity.json')))
113
+ self.ts_to_entity = json.load(open(os.path.join(self.path, 'train_ts.json')))
114
+ self.mixLst = open(trialFileName).read().splitlines()
115
+ self.list_length = len(self.mixLst)
116
+ random.shuffle(self.mixLst)
117
+
118
+ def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None):
119
+ if audioAug:
120
+ augType = random.randint(0, 1)
121
+ if augType == 1:
122
+ audio = overlap(audio, aug_audio)
123
+ else:
124
+ audio = audio
125
+
126
+ res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False)
127
+ return res
128
+
129
+ def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts, visualAug=True):
130
+
131
+ faceFolderPath = os.path.join(self.visualPath, videoName, entityName)
132
+
133
+ faces = []
134
+ H = 112
135
+ if visualAug == True:
136
+ new = int(H * random.uniform(0.7, 1))
137
+ x, y = numpy.random.randint(0, H - new), numpy.random.randint(0, H - new)
138
+ M = cv2.getRotationMatrix2D((H / 2, H / 2), random.uniform(-15, 15), 1)
139
+ augType = random.choice(['orig', 'flip', 'crop', 'rotate'])
140
+ else:
141
+ augType = 'orig'
142
+ labels_dict = self.entity_data[videoName][entityName]
143
+ labels = numpy.zeros(len(target_ts))
144
+ mask = numpy.zeros(len(target_ts))
145
+
146
+ for i, time in enumerate(target_ts):
147
+ if time not in context_ts:
148
+ faces.append(numpy.zeros((H, H)))
149
+ else:
150
+ labels[i] = labels_dict[time]
151
+ mask[i] = 1
152
+ time = "%.2f" % float(time)
153
+ faceFile = os.path.join(faceFolderPath, str(time) + '.jpg')
154
+
155
+ face = cv2.imread(faceFile)
156
+
157
+ face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
158
+ face = cv2.resize(face, (H, H))
159
+ if augType == 'orig':
160
+ faces.append(face)
161
+ elif augType == 'flip':
162
+ faces.append(cv2.flip(face, 1))
163
+ elif augType == 'crop':
164
+ faces.append(cv2.resize(face[y:y + new, x:x + new], (H, H)))
165
+ elif augType == 'rotate':
166
+ faces.append(cv2.warpAffine(face, M, (H, H)))
167
+ faces = numpy.array(faces)
168
+ return faces, labels, mask
169
+
170
+ def get_speaker_context(self, videoName, target_entity, all_ts, center_ts):
171
+
172
+ context_speakers = list(self.ts_to_entity[videoName][center_ts])
173
+ context = {}
174
+ chosen_speakers = []
175
+ context[target_entity] = all_ts
176
+ context_speakers.remove(target_entity)
177
+ num_frames = len(all_ts)
178
+ for candidate in context_speakers:
179
+ candidate_ts = self.entity_data[videoName][candidate]
180
+ shared_ts = set(all_ts).intersection(set(candidate_ts))
181
+ if (len(shared_ts) > (num_frames / 2)):
182
+ context[candidate] = shared_ts
183
+ chosen_speakers.append(candidate)
184
+ context_speakers = chosen_speakers
185
+ random.shuffle(context_speakers)
186
+ if not context_speakers:
187
+ context_speakers.insert(0, target_entity) # make sure is at 0
188
+ while len(context_speakers) < self.candidate_speakers:
189
+ context_speakers.append(random.choice(context_speakers))
190
+ elif len(context_speakers) < self.candidate_speakers:
191
+ context_speakers.insert(0, target_entity) # make sure is at 0
192
+ while len(context_speakers) < self.candidate_speakers:
193
+ context_speakers.append(random.choice(context_speakers[1:]))
194
+ else:
195
+ context_speakers.insert(0, target_entity) # make sure is at 0
196
+ context_speakers = context_speakers[:self.candidate_speakers]
197
+
198
+ assert set(context_speakers).issubset(set(list(context.keys()))), target_entity
199
+ assert target_entity in context_speakers, target_entity
200
+
201
+ return context_speakers, context
202
+
203
+ def __getitem__(self, index):
204
+
205
+ target_video = self.mixLst[index]
206
+ data = target_video.split('\t')
207
+ fps = float(data[2])
208
+ videoName = data[0][:11]
209
+ target_entity = data[0]
210
+ all_ts = list(self.entity_data[videoName][target_entity].keys())
211
+ numFrames = int(data[1])
212
+ assert numFrames == len(all_ts)
213
+
214
+ center_ts = all_ts[math.floor(numFrames / 2)]
215
+
216
+ # get context speakers which have more than half time overlapped with target speaker
217
+ context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts,
218
+ center_ts)
219
+
220
+ if self.cfg.TRAIN.AUDIO_AUG:
221
+ other_indices = list(range(0, index)) + list(range(index + 1, self.list_length))
222
+ augment_entity = self.mixLst[random.choice(other_indices)]
223
+ augment_data = augment_entity.split('\t')
224
+ augment_entity = augment_data[0]
225
+ augment_videoname = augment_data[0][:11]
226
+ aug_sr, aug_audio = wavfile.read(
227
+ os.path.join(self.audioPath, augment_videoname, augment_entity + '.wav'))
228
+ else:
229
+ aug_audio = None
230
+
231
+ audio_path = os.path.join(self.audioPath, videoName, target_entity + '.wav')
232
+ sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav'))
233
+ audio = self.load_single_audio(audio,
234
+ fps,
235
+ numFrames,
236
+ audioAug=self.cfg.TRAIN.AUDIO_AUG,
237
+ aug_audio=aug_audio)
238
+
239
+ visualFeatures, labels, masks = [], [], []
240
+
241
+ # target_label = list(self.entity_data[videoName][target_entity].values())
242
+ visual, target_labels, target_masks = self.load_visual_label_mask(
243
+ videoName, target_entity, all_ts, all_ts)
244
+
245
+ for idx, context_entity in enumerate(context_speakers):
246
+ if context_entity == target_entity:
247
+ label = target_labels
248
+ visualfeat = visual
249
+ mask = target_masks
250
+ else:
251
+ visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity,
252
+ all_ts,
253
+ context[context_entity])
254
+ visualFeatures.append(visualfeat)
255
+ labels.append(label)
256
+ masks.append(mask)
257
+
258
+ audio = torch.FloatTensor(audio)[None, :, :]
259
+ visualFeatures = torch.FloatTensor(numpy.array(visualFeatures))
260
+ audio_t = audio.shape[1]
261
+ video_t = visualFeatures.shape[1]
262
+ if audio_t != video_t * 4:
263
+ print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames)
264
+ labels = torch.LongTensor(numpy.array(labels))
265
+ masks = torch.LongTensor(numpy.array(masks))
266
267
+ return audio, visualFeatures, labels, masks
268
+
269
+ def __len__(self):
270
+ return len(self.mixLst)
271
+
272
+
273
+ class val_loader(object):
274
+
275
+ def __init__(self, cfg, trialFileName, audioPath, visualPath, num_speakers):
276
+ self.cfg = cfg
277
+ self.audioPath = audioPath
278
+ self.visualPath = visualPath
279
+ self.candidate_speakers = num_speakers
280
+ self.path = os.path.join(cfg.DATA.dataPathAVA, "csv")
281
+ self.entity_data = json.load(open(os.path.join(self.path, 'val_entity.json')))
282
+ self.ts_to_entity = json.load(open(os.path.join(self.path, 'val_ts.json')))
283
+ self.mixLst = open(trialFileName).read().splitlines()
284
+
285
+ def load_single_audio(self, audio, fps, numFrames, audioAug=False, aug_audio=None):
286
+
287
+ res = vggish_input.waveform_to_examples(audio, 16000, numFrames, fps, return_tensor=False)
288
+ return res
289
+
290
+ def load_visual_label_mask(self, videoName, entityName, target_ts, context_ts):
291
+
292
+ faceFolderPath = os.path.join(self.visualPath, videoName, entityName)
293
+
294
+ faces = []
295
+ H = 112
296
+ labels_dict = self.entity_data[videoName][entityName]
297
+ labels = numpy.zeros(len(target_ts))
298
+ mask = numpy.zeros(len(target_ts))
299
+
300
+ for i, time in enumerate(target_ts):
301
+ if time not in context_ts:
302
+ faces.append(numpy.zeros((H, H)))
303
+ else:
304
+ labels[i] = labels_dict[time]
305
+ mask[i] = 1
306
+ time = "%.2f" % float(time)
307
+ faceFile = os.path.join(faceFolderPath, str(time) + '.jpg')
308
+
309
+ face = cv2.imread(faceFile)
310
+ face = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
311
+ face = cv2.resize(face, (H, H))
312
+ faces.append(face)
313
+ faces = numpy.array(faces)
314
+ return faces, labels, mask
315
+
316
+ def get_speaker_context(self, videoName, target_entity, all_ts, center_ts):
317
+
318
+ context_speakers = list(self.ts_to_entity[videoName][center_ts])
319
+ context = {}
320
+ chosen_speakers = []
321
+ context[target_entity] = all_ts
322
+ context_speakers.remove(target_entity)
323
+ num_frames = len(all_ts)
324
+ for candidate in context_speakers:
325
+ candidate_ts = self.entity_data[videoName][candidate]
326
+ shared_ts = set(all_ts).intersection(set(candidate_ts))
327
+ context[candidate] = shared_ts
328
+ chosen_speakers.append(candidate)
329
+ # if (len(shared_ts) > (num_frames / 2)):
330
+ # context[candidate] = shared_ts
331
+ # chosen_speakers.append(candidate)
332
+ context_speakers = chosen_speakers
333
+ random.shuffle(context_speakers)
334
+ if not context_speakers:
335
+ context_speakers.insert(0, target_entity) # make sure is at 0
336
+ while len(context_speakers) < self.candidate_speakers:
337
+ context_speakers.append(random.choice(context_speakers))
338
+ elif len(context_speakers) < self.candidate_speakers:
339
+ context_speakers.insert(0, target_entity) # make sure is at 0
340
+ while len(context_speakers) < self.candidate_speakers:
341
+ context_speakers.append(random.choice(context_speakers[1:]))
342
+ else:
343
+ context_speakers.insert(0, target_entity) # make sure is at 0
344
+ context_speakers = context_speakers[:self.candidate_speakers]
345
+
346
+ assert set(context_speakers).issubset(set(list(context.keys()))), target_entity
347
+
348
+ return context_speakers, context
349
+
350
+ def __getitem__(self, index):
351
+
352
+ target_video = self.mixLst[index]
353
+ data = target_video.split('\t')
354
+ fps = float(data[2])
355
+ videoName = data[0][:11]
356
+ target_entity = data[0]
357
+ all_ts = list(self.entity_data[videoName][target_entity].keys())
358
+ numFrames = int(data[1])
359
+ # print(numFrames, len(all_ts))
360
+ assert numFrames == len(all_ts)
361
+
362
+ center_ts = all_ts[math.floor(numFrames / 2)]
363
+
364
+ # get context speakers which have more than half time overlapped with target speaker
365
+ context_speakers, context = self.get_speaker_context(videoName, target_entity, all_ts,
366
+ center_ts)
367
+
368
+ sr, audio = wavfile.read(os.path.join(self.audioPath, videoName, target_entity + '.wav'))
369
+ audio = self.load_single_audio(audio, fps, numFrames, audioAug=False)
370
+
371
+ visualFeatures, labels, masks = [], [], []
372
+
373
+ # target_label = list(self.entity_data[videoName][target_entity].values())
374
+ target_visual, target_labels, target_masks = self.load_visual_label_mask(
375
+ videoName, target_entity, all_ts, all_ts)
376
+
377
+ for idx, context_entity in enumerate(context_speakers):
378
+ if context_entity == target_entity:
379
+ label = target_labels
380
+ visualfeat = target_visual
381
+ mask = target_masks
382
+ else:
383
+ visualfeat, label, mask = self.load_visual_label_mask(videoName, context_entity,
384
+ all_ts,
385
+ context[context_entity])
386
+ visualFeatures.append(visualfeat)
387
+ labels.append(label)
388
+ masks.append(mask)
389
+
390
+ audio = torch.FloatTensor(audio)[None, :, :]
391
+ visualFeatures = torch.FloatTensor(numpy.array(visualFeatures))
392
+ audio_t = audio.shape[1]
393
+ video_t = visualFeatures.shape[1]
394
+ if audio_t != video_t * 4:
395
+ print(visualFeatures.shape, audio.shape, videoName, target_entity, numFrames)
396
+ labels = torch.LongTensor(numpy.array(labels))
397
+ masks = torch.LongTensor(numpy.array(masks))
398
+
399
+ return audio, visualFeatures, labels, masks
400
+
401
+ def __len__(self):
402
+ return len(self.mixLst)
dlhammer/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ *.log
2
+ .vim-arsync
3
+ __pycache__/
dlhammer/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
dlhammer/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # dl-hammer
2
+ tools for deep learning coding.
dlhammer/dlhammer/.ipynb_checkpoints/argparser-checkpoint.py ADDED
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import os
11
+ import argparse
12
+ import datetime
13
+ from functools import partial
14
+ import yaml
15
+ from easydict import EasyDict
16
+
17
+ # from .utils import get_vacant_gpu
18
+ from .logger import bootstrap_logger, logger
19
+ from .utils.system import get_available_gpuids
20
+ from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf
21
+
22
+ CONFIG = EasyDict()
23
+
24
+ BASE_CONFIG = {
25
+ 'OUTPUT_DIR': './workspace',
26
+ 'SESSION': 'base',
27
+ 'NUM_GPUS': 1,
28
+ 'LOG_NAME': 'log.txt'
29
+ }
30
+
31
+
32
+ def bootstrap_args(default_params=None):
33
+ """get the params from yaml file and args. The args will override arguemnts in the yaml file.
34
+ Returns: EasyDict instance.
35
+
36
+ """
37
+ parser = define_default_arg_parser()
38
+ cfg = update_config(parser, default_params)
39
+ create_workspace(cfg) #create workspace
40
+
41
+ CONFIG.update(cfg)
42
+ bootstrap_logger(get_logfile(CONFIG)) # setup logger
43
+ setup_gpu(CONFIG.NUM_GPUS) #setup gpu
44
+
45
+ return cfg
46
+
47
+
48
+ def setup_gpu(ngpu):
49
+ gpuids = get_available_gpuids()
50
+ # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]])
51
+
52
+
53
+ def get_logfile(config):
54
+ return os.path.join(config.WORKSPACE, config.LOG_NAME)
55
+
56
+
57
+ def define_default_arg_parser():
58
+ """Define a default arg_parser.
59
+
60
+ Returns:
61
+ A argparse.ArgumentParser. More arguments can be added.
62
+
63
+ """
64
+ parser = argparse.ArgumentParser()
65
+ parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str)
66
+ parser.add_argument('opts',
67
+ default=None,
68
+ nargs='*',
69
+ help='modify config options using the command-line')
70
+
71
+ return parser
72
+
73
+
74
+ def update_config(arg_parser, default_config=None):
75
+ """ update argparser to args.
76
+
77
+ Args:
78
+ arg_parser: argparse.ArgumentParser.
79
+ """
80
+
81
+ parsed, unknown = arg_parser.parse_known_args()
82
+ if default_config and parsed.cfg == "" and "cfg" in default_config:
83
+ parsed.cfg = default_config["cfg"]
84
+
85
+ config = EasyDict(BASE_CONFIG.copy())
86
+ config['cfg'] = parsed.cfg
87
+ # update default config
88
+ if default_config is not None:
89
+ config.update(default_config)
90
+
91
+ # merge config from yaml
92
+ if os.path.isfile(config.cfg):
93
+ with open(config.cfg, 'r') as f:
94
+ yml_config = yaml.full_load(f)
95
+ config = merge_dict(config, yml_config)
96
+
97
+ # merge opts
98
+ config = merge_opts(config, parsed.opts)
99
+
100
+ # eval values
101
+ config = eval_dict_leaf(config)
102
+
103
+ return config
104
+
105
+
106
+ def create_workspace(cfg):
107
+ cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg))
108
+ workspace = os.path.join(cfg.OUTPUT_DIR, cfg_name, cfg.SESSION)
109
+ os.makedirs(workspace, exist_ok=True)
110
+ cfg.WORKSPACE = workspace
dlhammer/dlhammer/.ipynb_checkpoints/bootstrap-checkpoint.py ADDED
@@ -0,0 +1,33 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import sys
11
+ import logging
12
+
13
+ from .logger import bootstrap_logger, logger
14
+ from .argparser import bootstrap_args, CONFIG
15
+ from .utils.misc import to_string
16
+
17
+ __all__ = ['bootstrap', 'logger', 'CONFIG']
18
+
19
+
20
+ def bootstrap(default_cfg=None, print_cfg=True):
21
+ """TODO: Docstring for bootstrap.
22
+
23
+ Kwargs:
24
+ use_argparser (TODO): TODO
25
+ use_logger (TODO): TODO
26
+
27
+ Returns: TODO
28
+
29
+ """
30
+ config = bootstrap_args(default_cfg)
31
+ if print_cfg:
32
+ logger.info(to_string(config))
33
+ return config
dlhammer/dlhammer/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .bootstrap import *
dlhammer/dlhammer/argparser.py ADDED
@@ -0,0 +1,109 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import os
11
+ import argparse
12
+ import datetime
13
+ from functools import partial
14
+ import yaml
15
+ from easydict import EasyDict
16
+
17
+ # from .utils import get_vacant_gpu
18
+ from .logger import bootstrap_logger, logger
19
+ from .utils.system import get_available_gpuids
20
+ from .utils.misc import merge_dict, merge_opts, to_string, eval_dict_leaf
21
+
22
+ CONFIG = EasyDict()
23
+
24
+ BASE_CONFIG = {
25
+ 'OUTPUT_DIR': './workspace',
26
+ 'NUM_GPUS': 1,
27
+ 'LOG_NAME': 'log.txt'
28
+ }
29
+
30
+
31
+ def bootstrap_args(default_params=None):
32
+ """get the params from yaml file and args. The args will override arguemnts in the yaml file.
33
+ Returns: EasyDict instance.
34
+
35
+ """
36
+ parser = define_default_arg_parser()
37
+ cfg = update_config(parser, default_params)
38
+ create_workspace(cfg) #create workspace
39
+
40
+ CONFIG.update(cfg)
41
+ bootstrap_logger(get_logfile(CONFIG)) # setup logger
42
+ setup_gpu(CONFIG.NUM_GPUS) #setup gpu
43
+
44
+ return cfg
45
+
46
+
47
+ def setup_gpu(ngpu):
48
+ gpuids = get_available_gpuids()
49
+ # os.environ['CUDA_VISIBLE_DEVICES'] = ','.join([str(i) for i in gpuids[:ngpu]])
50
+
51
+
52
+ def get_logfile(config):
53
+ return os.path.join(config.WORKSPACE, config.LOG_NAME)
54
+
55
+
56
+ def define_default_arg_parser():
57
+ """Define a default arg_parser.
58
+
59
+ Returns:
60
+ A argparse.ArgumentParser. More arguments can be added.
61
+
62
+ """
63
+ parser = argparse.ArgumentParser()
64
+ parser.add_argument('--cfg', help='load configs from yaml file', default='', type=str)
65
+ parser.add_argument('opts',
66
+ default=None,
67
+ nargs='*',
68
+ help='modify config options using the command-line')
69
+
70
+ return parser
71
+
72
+
73
+ def update_config(arg_parser, default_config=None):
74
+ """ update argparser to args.
75
+
76
+ Args:
77
+ arg_parser: argparse.ArgumentParser.
78
+ """
79
+
80
+ parsed, unknown = arg_parser.parse_known_args()
81
+ if default_config and parsed.cfg == "" and "cfg" in default_config:
82
+ parsed.cfg = default_config["cfg"]
83
+
84
+ config = EasyDict(BASE_CONFIG.copy())
85
+ config['cfg'] = parsed.cfg
86
+ # update default config
87
+ if default_config is not None:
88
+ config.update(default_config)
89
+
90
+ # merge config from yaml
91
+ if os.path.isfile(config.cfg):
92
+ with open(config.cfg, 'r') as f:
93
+ yml_config = yaml.full_load(f)
94
+ config = merge_dict(config, yml_config)
95
+
96
+ # merge opts
97
+ config = merge_opts(config, parsed.opts)
98
+
99
+ # eval values
100
+ config = eval_dict_leaf(config)
101
+
102
+ return config
103
+
104
+
105
+ def create_workspace(cfg):
106
+ cfg_name, ext = os.path.splitext(os.path.basename(cfg.cfg))
107
+ workspace = os.path.join(cfg.OUTPUT_DIR)
108
+ os.makedirs(workspace, exist_ok=True)
109
+ cfg.WORKSPACE = workspace
dlhammer/dlhammer/bootstrap.py ADDED
@@ -0,0 +1,33 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import sys
11
+ import logging
12
+
13
+ from .logger import bootstrap_logger, logger
14
+ from .argparser import bootstrap_args, CONFIG
15
+ from .utils.misc import to_string
16
+
17
+ __all__ = ['bootstrap', 'logger', 'CONFIG']
18
+
19
+
20
+ def bootstrap(default_cfg=None, print_cfg=True):
21
+ """TODO: Docstring for bootstrap.
22
+
23
+ Kwargs:
24
+ use_argparser (TODO): TODO
25
+ use_logger (TODO): TODO
26
+
27
+ Returns: TODO
28
+
29
+ """
30
+ config = bootstrap_args(default_cfg)
31
+ if print_cfg:
32
+ logger.info(to_string(config))
33
+ return config
dlhammer/dlhammer/logger.py ADDED
@@ -0,0 +1,66 @@
1
+ # -*- coding: utf-8 -*-
2
+ #================================================================
3
+ # Don't go gently into that good night.
4
+ #
5
+ # author: klaus
6
+ # description:
7
+ #
8
+ #================================================================
9
+
10
+ import os
11
+ import sys
12
+ import logging
13
+
14
+ logger = logging.getLogger('DLHammer')
15
+
16
+
17
+ def bootstrap_logger(logfile=None, fmt=None):
18
+ """TODO: Docstring for bootstrap_logger.
19
+
20
+ Args:
21
+ logfile (str): file path logging to.
22
+
23
+ Kwargs:
24
+ fmt (TODO): TODO
25
+
26
+ Returns: TODO
27
+
28
+ """
29
+ if fmt is None:
30
+ # fmt = '%(asctime)s - %(levelname)-5s - [%(filename)s:%(lineno)d] %(message)s'
31
+ fmt = '%(message)s'
32
+ logging.basicConfig(level=logging.DEBUG, format=fmt)
33
+
34
+ #log to file
35
+ if logfile is not None:
36
+ formatter = logging.Formatter(fmt)
37
+ fh = logging.FileHandler(logfile)
38
+ fh.setLevel(logging.DEBUG)
39
+ fh.setFormatter(formatter)
40
+ logger.addHandler(fh)
41
+
42
+ # sys.stdout = LoggerWriter(sys.stdout, logger.info)
43
+ # sys.stderr = LoggerWriter(sys.stderr, logger.error)
44
+ return
45
+
46
+
47
+ class LoggerWriter(object):
48
+
49
+ def __init__(self, stream, logfct):
50
+ self.terminal = stream
51
+ self.logfct = logfct
52
+ self.buf = []
53
+
54
+ def write(self, msg):
55
+ if msg.endswith('\n'):
56
+ self.buf.append(msg.rstrip('\n'))
57
+
58
+ message = ''.join(self.buf)
59
+ self.logfct(message)
60
+
61
+ self.buf = []
62
+ else:
63
+ self.buf.append(msg)
64
+
65
+ def flush(self):
66
+ pass
dlhammer/dlhammer/test/config.yml ADDED
@@ -0,0 +1,32 @@
1
+ a_int: 12
2
+ a_float: 1e-2
3
+ a_list: [0,1,2]
4
+ eval_list: eval(list(range(10)))
5
+ DATA:
6
+ PATH_TO_DATA_DIR: /home/ubuntu/data/kinetics/Mini-Kinetics-200
7
+ PATH_PREFIX: /home/ubuntu/data/kinetics/k400_ver3
8
+ NUM_FRAMES: 16
9
+ SAMPLING_RATE: 8
10
+ TARGET_FPS: 25
11
+ TRAIN_JITTER_SCALES: [256, 320]
12
+ TRAIN_CROP_SIZE: 224
13
+ TEST_CROP_SIZE: 224
14
+ INPUT_CHANNEL_NUM: [3]
15
+ SOLVER:
16
+ BACKBONE:
17
+ OPTIMIZER: sgd
18
+ MOMENTUM: 0.9
19
+ BASE_LR: 1e-3
20
+ SCHEDULER:
21
+ NAME: warmup_multistep
22
+ MILESTONES: [13, 24]
23
+ WARMUP_EPOCHS: 0.5
24
+ GAMMA: 0.1
25
+ TEMPORAL_MODEL:
26
+ OPTIMIZER: sgd
27
+ MOMENTUM: 0.9
28
+ BASE_LR: 1e-3
29
+ SCHEDULER:
30
+ NAME: multistep
31
+ MILESTONES: [13, 24]
32
+ GAMMA: 0.1
dlhammer/dlhammer/test/test_args.py ADDED
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+#================================================================
+#   Don't go gently into that good night.
+#
+#   author: klaus
+#   description:
+#
+#================================================================
+
+import os
+import sys
+
+CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..'))
+sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.'))
+
+from dlhammer import bootstrap, CONFIG
+from dlhammer import logger
+
+config = bootstrap(print_cfg=True)
dlhammer/dlhammer/test/test_logger.py ADDED
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+#================================================================
+#   Don't go gently into that good night.
+#
+#   author: klaus
+#   description:
+#
+#================================================================
+
+import os
+import sys
+
+CURRENT_FILE_DIRECTORY = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '../..'))
+sys.path.append(os.path.join(CURRENT_FILE_DIRECTORY, '.'))
+
+from dlhammer import bootstrap, logger
+bootstrap()
+
+logger.info('dummy output')
+
+raise Exception('dummy error')
dlhammer/dlhammer/utils/__init__.py ADDED
File without changes
dlhammer/dlhammer/utils/misc.py ADDED
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#================================================================
+#   Don't go gently into that good night.
+#
+#   author: klaus
+#   description:
+#
+#================================================================
+
+import ast
+
+
+def merge_dict(a, b, path=None):
+    """Merge b into a. The values in b override the values in a.
+
+    Args:
+        a (dict): dict to merge to.
+        b (dict): dict to merge from.
+
+    Returns: `a` with values merged from `b`.
+
+    """
+    if path is None: path = []
+    for key in b:
+        if key in a:
+            if isinstance(a[key], dict) and isinstance(b[key], dict):
+                merge_dict(a[key], b[key], path + [str(key)])
+            else:
+                a[key] = b[key]
+        else:
+            a[key] = b[key]
+    return a
+
+
+def merge_opts(d, opts):
+    """merge opts
+    Args:
+        d (dict): The dict.
+        opts (list): The opts to merge. format: [key1, value1, key2, value2,...]
+    Returns: d. the input dict `d` with merged opts.
+
+    """
+    assert len(opts) % 2 == 0, f'length of opts must be even. Got: {opts}'
+    for i in range(0, len(opts), 2):
+        full_k, v = opts[i], opts[i + 1]
+        keys = full_k.split('.')
+        sub_d = d
+        for i, k in enumerate(keys):
+            if not hasattr(sub_d, k):
+                raise ValueError(f'The key {k} not exist in the dict. Full key:{full_k}')
+            if i != len(keys) - 1:
+                sub_d = sub_d[k]
+            else:
+                sub_d[k] = v
+    return d
+
+
+def to_string(params, indent=2):
+    """format params to a string
+
+    Args:
+        params (EasyDict): the params.
+
+    Returns: The string to display.
+
+    """
+    msg = '{\n'
+    for i, (k, v) in enumerate(params.items()):
+        if isinstance(v, dict):
+            v = to_string(v, indent + 4)
+        spaces = ' ' * indent
+        msg += spaces + '{}: {}'.format(k, v)
+        if i == len(params) - 1:
+            msg += ' }'
+        else:
+            msg += '\n'
+    return msg
+
+
+def eval_dict_leaf(d):
+    """eval values of dict leaf.
+
+    Args:
+        d (dict): The dict to eval.
+
+    Returns: dict.
+
+    """
+    for k, v in d.items():
+        if not isinstance(v, dict):
+            d[k] = eval_string(v)
+        else:
+            eval_dict_leaf(v)
+    return d
+
+
+def eval_string(string):
+    """automatically evaluate string to corresponding types.
+
+    For example:
+        not a string     -> return the original input
+        '0'              -> 0
+        '0.2'            -> 0.2
+        '[0, 1, 2]'      -> [0,1,2]
+        'eval(1+2)'      -> 3
+        'eval(range(5))' -> [0,1,2,3,4]
+
+
+    Args:
+        string : the string to evaluate.
+
+    Returns: the corresponding type
+
+    """
+    if not isinstance(string, str):
+        return string
+    if len(string) > 1 and string[0] == '[' and string[-1] == ']':
+        return eval(string)
+    if string[0:5] == 'eval(':
+        return eval(string[5:-1])
+    try:
+        v = ast.literal_eval(string)
+    except:
+        v = string
+    return v
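
A small sketch exercising the helpers above, under the assumption that configs are `EasyDict` objects (as elsewhere in `dlhammer`): `merge_opts` checks keys with `hasattr`, which plain dicts would not satisfy.

```
from easydict import EasyDict
from dlhammer.utils.misc import merge_dict, merge_opts, eval_dict_leaf, eval_string

base = EasyDict({'SOLVER': {'BASE_LR': '1e-3', 'MOMENTUM': 0.9}})
cfg = merge_dict(base, {'SOLVER': {'BASE_LR': '5e-4'}})     # values from the second dict win
cfg = merge_opts(cfg, ['SOLVER.MOMENTUM', 0.95])            # dotted CLI-style override
cfg = eval_dict_leaf(cfg)                                   # '5e-4' (str) -> 0.0005 (float)

assert cfg.SOLVER.BASE_LR == 5e-4 and cfg.SOLVER.MOMENTUM == 0.95
assert eval_string('[0, 1, 2]') == [0, 1, 2]
assert eval_string('eval(1+2)') == 3
```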
dlhammer/dlhammer/utils/system.py ADDED
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+#================================================================
+#   Don't go gently into that good night.
+#
+#   author: klaus
+#   description:
+#
+#================================================================
+
+import os
+import sys
+import subprocess
+import numpy as np
+
+
+def get_available_gpuids():
+    """
+    Returns: the gpu ids sorted in descending order w.r.t occupied memory.
+    """
+    com = "nvidia-smi|sed -n '/%/p'|sed 's/|/\\n/g'|sed -n '/MiB/p'|sed 's/ //g'|sed 's/MiB/\\n/'|sed '/\\//d'"
+    gpum = subprocess.check_output(com, shell=True)
+    gpum = gpum.decode('utf-8').split('\n')
+    gpum = gpum[:-1]
+    sorted_gpuid = np.argsort(gpum)
+    return sorted_gpuid
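
A usage sketch for the GPU helper, assuming an NVIDIA machine with `nvidia-smi` on `PATH` (the helper shells out to it and sorts the parsed memory column as text).

```
import os
from dlhammer.utils.system import get_available_gpuids

gpu_ids = get_available_gpuids()
print('GPU ids ordered by parsed memory usage:', gpu_ids)
# e.g. pin a device before importing torch; verify the ordering on your machine,
# since the memory values are compared as strings.
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_ids[0])
```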
environment.yml ADDED
@@ -0,0 +1,298 @@
1
+ name: loconet
2
+ channels:
3
+ - conda-forge
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=conda_forge
7
+ - _openmp_mutex=4.5=1_gnu
8
+ - alsa-lib=1.2.3=h516909a_0
9
+ - anyio=3.5.0=py37h89c1867_0
10
+ - argon2-cffi=21.3.0=pyhd8ed1ab_0
11
+ - argon2-cffi-bindings=21.2.0=py37h5e8e339_1
12
+ - aria2=1.36.0=h319415d_2
13
+ - attrs=21.4.0=pyhd8ed1ab_0
14
+ - babel=2.9.1=pyh44b312d_0
15
+ - backcall=0.2.0=pyh9f0ad1d_0
16
+ - backports=1.0=py_2
17
+ - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
18
+ - bleach=4.1.0=pyhd8ed1ab_0
19
+ - bottleneck=1.3.4=py37h6c7ee08_0
20
+ - brotli=1.0.9=h7f98852_6
21
+ - brotli-bin=1.0.9=h7f98852_6
22
+ - brotlipy=0.7.0=py37h5e8e339_1003
23
+ - c-ares=1.18.1=h7f98852_0
24
+ - ca-certificates=2022.5.18.1=ha878542_0
25
+ - cffi=1.14.6=py37hc58025e_0
26
+ - configparser=5.2.0=pyhd8ed1ab_0
27
+ - cryptography=36.0.1=py37hf1a17b8_0
28
+ - cycler=0.11.0=pyhd8ed1ab_0
29
+ - cython=0.29.27=py37hcd2ae1e_0
30
+ - dbus=1.13.6=h48d8840_2
31
+ - debugpy=1.5.1=py37hcd2ae1e_0
32
+ - defusedxml=0.7.1=pyhd8ed1ab_0
33
+ - easydict=1.9=py_0
34
+ - entrypoints=0.4=pyhd8ed1ab_0
35
+ - expat=2.4.6=h27087fc_0
36
+ - flit-core=3.7.0=pyhd8ed1ab_0
37
+ - fontconfig=2.13.96=ha180cfb_0
38
+ - fonttools=4.29.1=py37h5e8e339_0
39
+ - freetype=2.10.4=h0708190_1
40
+ - gettext=0.19.8.1=h0b5b191_1005
41
+ - giflib=5.2.1=h36c2ea0_2
42
+ - glib=2.68.4=h9c3ff4c_0
43
+ - glib-tools=2.68.4=h9c3ff4c_0
44
+ - gst-plugins-base=1.18.5=hf529b03_0
45
+ - gstreamer=1.18.5=h76c114f_0
46
+ - icu=68.2=h9c3ff4c_0
47
+ - idna=3.3=pyhd8ed1ab_0
48
+ - importlib_resources=5.4.0=pyhd8ed1ab_0
49
+ - ipykernel=6.9.1=py37h6531663_0
50
+ - ipython=7.31.1=py37h89c1867_0
51
+ - ipython_genutils=0.2.0=py_1
52
+ - jbig=2.1=h7f98852_2003
53
+ - jedi=0.18.1=py37h89c1867_0
54
+ - jinja2=3.0.3=pyhd8ed1ab_0
55
+ - jpeg=9e=h7f98852_0
56
+ - json5=0.9.5=pyh9f0ad1d_0
57
+ - jsonschema=4.4.0=pyhd8ed1ab_0
58
+ - jupyter_client=7.1.2=pyhd8ed1ab_0
59
+ - jupyter_core=4.9.2=py37h89c1867_0
60
+ - jupyter_server=1.13.5=pyhd8ed1ab_1
61
+ - jupyterlab=3.2.9=pyhd8ed1ab_0
62
+ - jupyterlab_pygments=0.1.2=pyh9f0ad1d_0
63
+ - jupyterlab_server=2.10.3=pyhd8ed1ab_0
64
+ - kiwisolver=1.3.2=py37h2527ec5_1
65
+ - krb5=1.19.2=hcc1bbae_3
66
+ - lcms2=2.12=hddcbb42_0
67
+ - ld_impl_linux-64=2.36.1=hea4e1c9_2
68
+ - lerc=3.0=h9c3ff4c_0
69
+ - libblas=3.9.0=13_linux64_openblas
70
+ - libbrotlicommon=1.0.9=h7f98852_6
71
+ - libbrotlidec=1.0.9=h7f98852_6
72
+ - libbrotlienc=1.0.9=h7f98852_6
73
+ - libcblas=3.9.0=13_linux64_openblas
74
+ - libclang=11.1.0=default_ha53f305_1
75
+ - libdeflate=1.10=h7f98852_0
76
+ - libedit=3.1.20191231=he28a2e2_2
77
+ - libevent=2.1.10=h9b69904_4
78
+ - libffi=3.3=h58526e2_2
79
+ - libgcc-ng=11.2.0=h1d223b6_12
80
+ - libgfortran-ng=11.2.0=h69a702a_12
81
+ - libgfortran5=11.2.0=h5c6108e_12
82
+ - libglib=2.68.4=h3e27bee_0
83
+ - libgomp=11.2.0=h1d223b6_12
84
+ - libiconv=1.16=h516909a_0
85
+ - liblapack=3.9.0=13_linux64_openblas
86
+ - libllvm11=11.1.0=hf817b99_3
87
+ - libogg=1.3.4=h7f98852_1
88
+ - libopenblas=0.3.18=pthreads_h8fe5266_0
89
+ - libopus=1.3.1=h7f98852_1
90
+ - libpng=1.6.37=h21135ba_2
91
+ - libpq=13.5=hd57d9b9_1
92
+ - libsodium=1.0.18=h36c2ea0_1
93
+ - libssh2=1.10.0=ha56f1ee_2
94
+ - libstdcxx-ng=11.2.0=he4da1e4_12
95
+ - libtiff=4.3.0=h542a066_3
96
+ - libuuid=2.32.1=h7f98852_1000
97
+ - libvorbis=1.3.7=h9c3ff4c_0
98
+ - libwebp=1.2.2=h3452ae3_0
99
+ - libwebp-base=1.2.2=h7f98852_1
100
+ - libxcb=1.13=h7f98852_1004
101
+ - libxkbcommon=1.0.3=he3ba5ed_0
102
+ - libxml2=2.9.12=h72842e0_0
103
+ - libzlib=1.2.11=h36c2ea0_1013
104
+ - llvmlite=0.38.0=py37h0761922_1
105
+ - lz4-c=1.9.3=h9c3ff4c_1
106
+ - markupsafe=2.1.0=py37h540881e_0
107
+ - matplotlib=3.5.1=py37h89c1867_0
108
+ - matplotlib-base=3.5.1=py37h1058ff1_0
109
+ - matplotlib-inline=0.1.3=pyhd8ed1ab_0
110
+ - mistune=0.8.4=py37h5e8e339_1005
111
+ - munkres=1.1.4=pyh9f0ad1d_0
112
+ - mysql-common=8.0.28=ha770c72_0
113
+ - mysql-libs=8.0.28=hfa10184_0
114
+ - nbclassic=0.3.5=pyhd8ed1ab_0
115
+ - nbclient=0.5.11=pyhd8ed1ab_0
116
+ - nbconvert=6.4.2=py37h89c1867_0
117
+ - nbformat=5.1.3=pyhd8ed1ab_0
118
+ - ncurses=6.2=h58526e2_4
119
+ - nest-asyncio=1.5.4=pyhd8ed1ab_0
120
+ - nomkl=1.0=h5ca1d4c_0
121
+ - notebook=6.4.8=pyha770c72_0
122
+ - nspr=4.32=h9c3ff4c_1
123
+ - nss=3.74=hb5efdd6_0
124
+ - numba=0.55.1=py37h2d894fd_0
125
+ - numexpr=2.8.0=py37hfe5f03c_101
126
+ - numpy=1.21.5=py37hf2998dd_0
127
+ - openjpeg=2.4.0=hb52868f_1
128
+ - openssl=1.1.1o=h166bdaf_0
129
+ - packaging=21.3=pyhd8ed1ab_0
130
+ - pandas=1.3.5=py37h8c16a72_0
131
+ - pandoc=2.17.1.1=ha770c72_0
132
+ - pandocfilters=1.5.0=pyhd8ed1ab_0
133
+ - parso=0.8.3=pyhd8ed1ab_0
134
+ - patsy=0.5.2=pyhd8ed1ab_0
135
+ - pcre=8.45=h9c3ff4c_0
136
+ - pexpect=4.8.0=pyh9f0ad1d_2
137
+ - pickleshare=0.7.5=py_1003
138
+ - pip=22.0.3=pyhd8ed1ab_0
139
+ - prometheus_client=0.13.1=pyhd8ed1ab_0
140
+ - prompt-toolkit=3.0.27=pyha770c72_0
141
+ - pthread-stubs=0.4=h36c2ea0_1001
142
+ - ptyprocess=0.7.0=pyhd3deb0d_0
143
+ - pycparser=2.21=pyhd8ed1ab_0
144
+ - pygments=2.11.2=pyhd8ed1ab_0
145
+ - pyopenssl=22.0.0=pyhd8ed1ab_0
146
+ - pyparsing=3.0.7=pyhd8ed1ab_0
147
+ - pyqt=5.12.3=py37h89c1867_8
148
+ - pyqt-impl=5.12.3=py37hac37412_8
149
+ - pyqt5-sip=4.19.18=py37hcd2ae1e_8
150
+ - pyqtchart=5.12=py37he336c9b_8
151
+ - pyqtwebengine=5.12.1=py37he336c9b_8
152
+ - pyrsistent=0.18.1=py37h5e8e339_0
153
+ - pysocks=1.7.1=py37h89c1867_4
154
+ - python=3.7.9=hffdb5ce_100_cpython
155
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
156
+ - python_abi=3.7=2_cp37m
157
+ - pytz=2021.3=pyhd8ed1ab_0
158
+ - pyzmq=22.3.0=py37h336d617_1
159
+ - qt=5.12.9=hda022c4_4
160
+ - readline=8.1=h46c0cb4_0
161
+ - resampy=0.2.2=py_0
162
+ - scipy=1.7.3=py37hf2a6cf1_0
163
+ - seaborn=0.11.2=hd8ed1ab_0
164
+ - seaborn-base=0.11.2=pyhd8ed1ab_0
165
+ - send2trash=1.8.0=pyhd8ed1ab_0
166
+ - six=1.16.0=pyh6c4a22f_0
167
+ - sniffio=1.2.0=py37h89c1867_2
168
+ - sqlite=3.37.0=h9cd32fc_0
169
+ - statsmodels=0.13.2=py37hb1e94ed_0
170
+ - terminado=0.13.1=py37h89c1867_0
171
+ - testpath=0.5.0=pyhd8ed1ab_0
172
+ - tk=8.6.12=h27826a3_0
173
+ - tornado=6.1=py37h5e8e339_2
174
+ - traitlets=5.1.1=pyhd8ed1ab_0
175
+ - typing_extensions=4.1.1=pyha770c72_0
176
+ - unicodedata2=14.0.0=py37h5e8e339_0
177
+ - wcwidth=0.2.5=pyh9f0ad1d_2
178
+ - webencodings=0.5.1=py_1
179
+ - websocket-client=1.2.3=pyhd8ed1ab_0
180
+ - wheel=0.37.1=pyhd8ed1ab_0
181
+ - xorg-libxau=1.0.9=h7f98852_0
182
+ - xorg-libxdmcp=1.1.3=h7f98852_0
183
+ - xz=5.2.5=h516909a_1
184
+ - zeromq=4.3.4=h9c3ff4c_1
185
+ - zlib=1.2.11=h36c2ea0_1013
186
+ - zstd=1.5.2=ha95c52a_0
187
+ - pip:
188
+ - absl-py==1.0.0
189
+ - addict==2.4.0
190
+ - aiohttp==3.8.1
191
+ - aiosignal==1.2.0
192
+ - analytics-python==1.4.0
193
+ - appdirs==1.4.4
194
+ - asgiref==3.5.2
195
+ - async-timeout==4.0.2
196
+ - asynctest==0.13.0
197
+ - audioread==2.1.9
198
+ - backoff==1.10.0
199
+ - bcrypt==3.2.2
200
+ - beautifulsoup4==4.10.0
201
+ - cachetools==4.2.4
202
+ - certifi==2021.10.8
203
+ - charset-normalizer==2.0.9
204
+ - click==8.0.3
205
+ - decorator==4.4.2
206
+ - decord==0.6.0
207
+ - einops==0.4.0
208
+ - fastapi==0.78.0
209
+ - ffmpeg==1.4
210
+ - ffmpy==0.3.0
211
+ - filelock==3.4.0
212
+ - frozenlist==1.3.0
213
+ - fsspec==2022.1.0
214
+ - future==0.18.2
215
+ - fvcore==0.1.5.post20221221
216
+ - gdown==4.2.0
217
+ - google-auth==2.3.3
218
+ - google-auth-oauthlib==0.4.6
219
+ - gradio==3.0.2
220
+ - grpcio==1.43.0
221
+ - h11==0.13.0
222
+ - imageio==2.23.0
223
+ - imageio-ffmpeg==0.4.7
224
+ - importlib-metadata==4.10.0
225
+ - iopath==0.1.10
226
+ - ipywidgets==8.0.4
227
+ - joblib==1.1.0
228
+ - jupyterlab-widgets==3.0.5
229
+ - librosa==0.9.1
230
+ - linkify-it-py==1.0.3
231
+ - lmdb==1.4.1
232
+ - markdown==3.3.6
233
+ - markdown-it-py==2.1.0
234
+ - mdit-py-plugins==0.3.0
235
+ - mdurl==0.1.1
236
+ - mmaction2==0.24.1
237
+ - mmcv==1.7.0
238
+ - mmcv-full==1.4.6
239
+ - monotonic==1.6
240
+ - moviepy==1.0.3
241
+ - multidict==5.2.0
242
+ - oauthlib==3.1.1
243
+ - opencv-contrib-python==4.7.0.68
244
+ - opencv-python==4.5.5.62
245
+ - orjson==3.6.8
246
+ - paramiko==2.11.0
247
+ - pillow==8.3.2
248
+ - pooch==1.6.0
249
+ - portalocker==2.7.0
250
+ - proglog==0.1.10
251
+ - protobuf==3.19.3
252
+ - pyasn1==0.4.8
253
+ - pyasn1-modules==0.2.8
254
+ - pycryptodome==3.14.1
255
+ - pydantic==1.9.0
256
+ - pydeprecate==0.3.1
257
+ - pydub==0.25.1
258
+ - pynacl==1.5.0
259
+ - python-box==6.0.2
260
+ - python-multipart==0.0.5
261
+ - python-speech-features==0.6
262
+ - pytorch-lightning==1.5.8
263
+ - pyyaml==6.0
264
+ - requests==2.26.0
265
+ - requests-oauthlib==1.3.0
266
+ - rsa==4.8
267
+ - scenedetect==0.5.6.1
268
+ - scikit-learn==1.0.1
269
+ - setuptools==60.9.3
270
+ - soundfile==0.10.3.post1
271
+ - soupsieve==2.3.1
272
+ - starlette==0.19.1
273
+ - tabulate==0.9.0
274
+ - tensorboard==2.7.0
275
+ - tensorboard-data-server==0.6.1
276
+ - tensorboard-plugin-wit==1.8.1
277
+ - termcolor==2.2.0
278
+ - threadpoolctl==3.0.0
279
+ - timm==0.4.5
280
+ - torch==1.10.1
281
+ - torchaudio==0.10.1
282
+ - torchlibrosa==0.0.9
283
+ - torchmetrics==0.7.0
284
+ - torchvision==0.11.2
285
+ - tqdm==4.62.3
286
+ - typing-extensions==4.0.1
287
+ - uc-micro-py==1.0.1
288
+ - urllib3==1.26.7
289
+ - uvicorn==0.17.6
290
+ - warmup-scheduler-pytorch==0.1.2
291
+ - werkzeug==2.0.2
292
+ - wget==3.2
293
+ - widgetsnbextension==4.0.5
294
+ - yacs==0.1.8
295
+ - yapf==0.32.0
296
+ - yarl==1.7.2
297
+ - youtube-dl==2021.12.17
298
+ - zipp==3.6.0
legacy/talkNet_multi_multicard.py ADDED
@@ -0,0 +1,124 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss_multi import lossAV, lossA, lossV
8
+ from model.talkNetModel import talkNetModel
9
+
10
+ import pytorch_lightning as pl
11
+ from torch import distributed as dist
12
+
13
+
14
+ class talkNet(pl.LightningModule):
15
+
16
+ def __init__(self, cfg):
17
+ super(talkNet, self).__init__()
18
+ self.model = talkNetModel().cuda()
19
+ self.cfg = cfg
20
+ self.lossAV = lossAV().cuda()
21
+ self.lossA = lossA().cuda()
22
+ self.lossV = lossV().cuda()
23
+ print(
24
+ time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" %
25
+ (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))
26
+
27
+ def configure_optimizers(self):
28
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR)
29
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
30
+ step_size=1,
31
+ gamma=self.cfg.SOLVER.SCHEDULER.GAMMA)
32
+ return {"optimizer": optimizer, "lr_scheduler": scheduler}
33
+
34
+ def training_step(self, batch, batch_idx):
35
+ audioFeature, visualFeature, labels, masks = batch
36
+ b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2]
37
+ audioFeature = audioFeature.repeat(1, s, 1, 1)
38
+ audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:])
39
+ visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:])
40
+ labels = labels.view(b * s, *labels.shape[2:])
41
+ masks = masks.view(b * s, *masks.shape[2:])
42
+
43
+ audioEmbed = self.model.forward_audio_frontend(audioFeature) # feedForward
44
+ visualEmbed = self.model.forward_visual_frontend(visualFeature)
45
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
46
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
47
+ outsA = self.model.forward_audio_backend(audioEmbed)
48
+ outsV = self.model.forward_visual_backend(visualEmbed)
49
+ labels = labels.reshape((-1))
50
+ nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks)
51
+ nlossA = self.lossA.forward(outsA, labels, masks)
52
+ nlossV = self.lossV.forward(outsV, labels, masks)
53
+ loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
54
+ self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
55
+ return loss
56
+
57
+ def training_epoch_end(self, training_step_outputs):
58
+ self.saveParameters(
59
+ os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch)))
60
+
61
+ def evaluate_network(self, loader):
62
+ self.eval()
63
+ predScores = []
64
+ self.model = self.model.cuda()
65
+ self.lossAV = self.lossAV.cuda()
66
+ self.lossA = self.lossA.cuda()
67
+ self.lossV = self.lossV.cuda()
68
+ evalCsvSave = self.cfg.evalCsvSave
69
+ evalOrig = self.cfg.evalOrig
70
+ for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader):
71
+ with torch.no_grad():
72
+ b, s = visualFeature.shape[0], visualFeature.shape[1]
73
+ t = visualFeature.shape[2]
74
+ audioFeature = audioFeature.repeat(1, s, 1, 1)
75
+ audioFeature = audioFeature.view(b * s, *audioFeature.shape[2:])
76
+ visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:])
77
+ labels = labels.view(b * s, *labels.shape[2:])
78
+ masks = masks.view(b * s, *masks.shape[2:])
79
+ audioEmbed = self.model.forward_audio_frontend(audioFeature.cuda())
80
+ visualEmbed = self.model.forward_visual_frontend(visualFeature.cuda())
81
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(
82
+ audioEmbed, visualEmbed)
83
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
84
+ labels = labels.reshape((-1)).cuda()
85
+ outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1)
86
+ labels = labels.view(b, s, t)[:, 0, :].view(b * t)
87
+ masks = masks.view(b, s, t)[:, 0, :].view(b * t)
88
+ _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks)
89
+ predScore = predScore.detach().cpu().numpy()
90
+ predScores.extend(predScore)
91
+ evalLines = open(evalOrig).read().splitlines()[1:]
92
+ labels = []
93
+ labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines])
94
+ scores = pandas.Series(predScores)
95
+ evalRes = pandas.read_csv(evalOrig)
96
+ evalRes['score'] = scores
97
+ evalRes['label'] = labels
98
+ evalRes.drop(['label_id'], axis=1, inplace=True)
99
+ evalRes.drop(['instance_id'], axis=1, inplace=True)
100
+ evalRes.to_csv(evalCsvSave, index=False)
101
+ cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig,
102
+ evalCsvSave)
103
+ mAP = float(
104
+ str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5])
105
+ return mAP
106
+
107
+ def saveParameters(self, path):
108
+ torch.save(self.state_dict(), path)
109
+
110
+ def loadParameters(self, path):
111
+ selfState = self.state_dict()
112
+ loadedState = torch.load(path)
113
+ for name, param in loadedState.items():
114
+ origName = name
115
+ if name not in selfState:
116
+ name = name.replace("module.", "")
117
+ if name not in selfState:
118
+ print("%s is not in the model." % origName)
119
+ continue
120
+ if selfState[name].size() != loadedState[origName].size():
121
+ sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" %
122
+ (origName, selfState[name].size(), loadedState[origName].size()))
123
+ continue
124
+ selfState[name].copy_(param)
legacy/talkNet_multicard.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss import lossAV, lossA, lossV
8
+ from model.talkNetModel import talkNetModel
9
+
10
+ import pytorch_lightning as pl
11
+ from torch import distributed as dist
12
+
13
+
14
+ class talkNet(pl.LightningModule):
15
+
16
+ def __init__(self, cfg):
17
+ super(talkNet, self).__init__()
18
+ self.cfg = cfg
19
+ self.model = talkNetModel()
20
+ self.lossAV = lossAV()
21
+ self.lossA = lossA()
22
+ self.lossV = lossV()
23
+ print(
24
+ time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" %
25
+ (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))
26
+
27
+ def configure_optimizers(self):
28
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.cfg.SOLVER.BASE_LR)
29
+ scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
30
+ step_size=1,
31
+ gamma=self.cfg.SOLVER.SCHEDULER.GAMMA)
32
+ return {"optimizer": optimizer, "lr_scheduler": scheduler}
33
+
34
+ def training_step(self, batch, batch_idx):
35
+ audioFeature, visualFeature, labels = batch
36
+ audioEmbed = self.model.forward_audio_frontend(audioFeature[0]) # feedForward
37
+ visualEmbed = self.model.forward_visual_frontend(visualFeature[0])
38
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
39
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
40
+ outsA = self.model.forward_audio_backend(audioEmbed)
41
+ outsV = self.model.forward_visual_backend(visualEmbed)
42
+ labels = labels[0].reshape((-1))
43
+ nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels)
44
+ nlossA = self.lossA.forward(outsA, labels)
45
+ nlossV = self.lossV.forward(outsV, labels)
46
+ loss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
47
+ self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
48
+
49
+ return loss
50
+
51
+ def training_epoch_end(self, training_step_outputs):
52
+ self.saveParameters(
53
+ os.path.join(self.cfg.WORKSPACE, "model", "{}.pth".format(self.current_epoch)))
54
+
55
+ def validation_step(self, batch, batch_idx):
56
+ audioFeature, visualFeature, labels, indices = batch
57
+ audioEmbed = self.model.forward_audio_frontend(audioFeature[0])
58
+ visualEmbed = self.model.forward_visual_frontend(visualFeature[0])
59
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
60
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
61
+ labels = labels[0].reshape((-1))
62
+ loss, predScore, _, _ = self.lossAV.forward(outsAV, labels)
63
+ predScore = predScore[:, -1:].detach().cpu().numpy()
64
+ # self.log("val_loss", loss)
65
+
66
+ return predScore
67
+
68
+ def validation_epoch_end(self, validation_step_outputs):
69
+ evalCsvSave = self.cfg.evalCsvSave
70
+ evalOrig = self.cfg.evalOrig
71
+ predScores = []
72
+
73
+ for out in validation_step_outputs: # batch size =1
74
+ predScores.extend(out)
75
+
76
+ evalLines = open(evalOrig).read().splitlines()[1:]
77
+ labels = []
78
+ labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines])
79
+ scores = pandas.Series(predScores)
80
+ evalRes = pandas.read_csv(evalOrig)
81
+ print(len(evalRes), len(predScores), len(evalLines))
82
+ evalRes['score'] = scores
83
+ evalRes['label'] = labels
84
+ evalRes.drop(['label_id'], axis=1, inplace=True)
85
+ evalRes.drop(['instance_id'], axis=1, inplace=True)
86
+ evalRes.to_csv(evalCsvSave, index=False)
87
+ cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig,
88
+ evalCsvSave)
89
+ mAP = float(
90
+ str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5])
91
+ print("validation mAP: {}".format(mAP))
92
+
93
+ def saveParameters(self, path):
94
+ torch.save(self.state_dict(), path)
95
+
96
+ def loadParameters(self, path):
97
+ selfState = self.state_dict()
98
+ loadedState = torch.load(path, map_location='cpu')
99
+ for name, param in loadedState.items():
100
+ origName = name
101
+ if name not in selfState:
102
+ name = name.replace("module.", "")
103
+ if name not in selfState:
104
+ print("%s is not in the model." % origName)
105
+ continue
106
+ if selfState[name].size() != loadedState[origName].size():
107
+ sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" %
108
+ (origName, selfState[name].size(), loadedState[origName].size()))
109
+ continue
110
+ selfState[name].copy_(param)
111
+
112
+ def evaluate_network(self, loader):
113
+ self.eval()
114
+ self.model = self.model.cuda()
115
+ self.lossAV = self.lossAV.cuda()
116
+ self.lossA = self.lossA.cuda()
117
+ self.lossV = self.lossV.cuda()
118
+ predScores = []
119
+ evalCsvSave = self.cfg.evalCsvSave
120
+ evalOrig = self.cfg.evalOrig
121
+ for audioFeature, visualFeature, labels in tqdm.tqdm(loader):
122
+ with torch.no_grad():
123
+ audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda())
124
+ visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
125
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(
126
+ audioEmbed, visualEmbed)
127
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
128
+ labels = labels[0].reshape((-1)).cuda()
129
+ _, predScore, _, _ = self.lossAV.forward(outsAV, labels)
130
+ predScore = predScore[:, 1].detach().cpu().numpy()
131
+ predScores.extend(predScore)
132
+ evalLines = open(evalOrig).read().splitlines()[1:]
133
+ labels = []
134
+ labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines])
135
+ scores = pandas.Series(predScores)
136
+ evalRes = pandas.read_csv(evalOrig)
137
+ evalRes['score'] = scores
138
+ evalRes['label'] = labels
139
+ evalRes.drop(['label_id'], axis=1, inplace=True)
140
+ evalRes.drop(['instance_id'], axis=1, inplace=True)
141
+ evalRes.to_csv(evalCsvSave, index=False)
142
+ cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig,
143
+ evalCsvSave)
144
+ mAP = float(
145
+ str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5])
146
+ return mAP
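
The evaluation path shared by these legacy trainers boils down to attaching per-frame scores to the original AVA annotation CSV and calling the official scorer. Below is a condensed sketch of that step; `val_orig.csv` and `val_res.csv` are placeholder paths, and the stdout parsing mirrors the code above.

```
import subprocess
import pandas

def score_predictions(pred_scores, eval_orig='val_orig.csv', eval_csv_save='val_res.csv'):
    eval_res = pandas.read_csv(eval_orig)            # ground-truth rows, one per face-crop frame
    eval_res['score'] = pandas.Series(pred_scores)   # model scores, same order as the CSV rows
    eval_res['label'] = 'SPEAKING_AUDIBLE'           # dummy label column expected by the scorer
    eval_res = eval_res.drop(['label_id', 'instance_id'], axis=1)
    eval_res.to_csv(eval_csv_save, index=False)
    cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s" % (eval_orig, eval_csv_save)
    out = subprocess.run(cmd, shell=True, capture_output=True)
    return float(str(out.stdout).split(' ')[2][:5])  # same stdout parsing as evaluate_network above
```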
legacy/talkNet_orig.py ADDED
@@ -0,0 +1,102 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss import lossAV, lossA, lossV
8
+ from model.talkNetModel import talkNetModel
9
+
10
+
11
+ class talkNet(nn.Module):
12
+
13
+ def __init__(self, lr=0.0001, lrDecay=0.95, **kwargs):
14
+ super(talkNet, self).__init__()
15
+ self.model = talkNetModel().cuda()
16
+ self.lossAV = lossAV().cuda()
17
+ self.lossA = lossA().cuda()
18
+ self.lossV = lossV().cuda()
19
+ self.optim = torch.optim.Adam(self.parameters(), lr=lr)
20
+ self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size=1, gamma=lrDecay)
21
+ print(
22
+ time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" %
23
+ (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))
24
+
25
+ def train_network(self, loader, epoch, **kwargs):
26
+ self.train()
27
+ self.scheduler.step(epoch - 1)
28
+ index, top1, loss = 0, 0, 0
29
+ lr = self.optim.param_groups[0]['lr']
30
+ for num, (audioFeature, visualFeature, labels) in enumerate(loader, start=1):
31
+ self.zero_grad()
32
+ audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda()) # feedForward
33
+ visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
34
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
35
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
36
+ outsA = self.model.forward_audio_backend(audioEmbed)
37
+ outsV = self.model.forward_visual_backend(visualEmbed)
38
+ labels = labels[0].reshape((-1)).cuda() # Loss
39
+ nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels)
40
+ nlossA = self.lossA.forward(outsA, labels)
41
+ nlossV = self.lossV.forward(outsV, labels)
42
+ nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
43
+ loss += nloss.detach().cpu().numpy()
44
+ top1 += prec
45
+ nloss.backward()
46
+ self.optim.step()
47
+ index += len(labels)
48
+ sys.stderr.write(time.strftime("%m-%d %H:%M:%S") + \
49
+ " [%2d] Lr: %5f, Training: %.2f%%, " %(epoch, lr, 100 * (num / loader.__len__())) + \
50
+ " Loss: %.5f, ACC: %2.2f%% \r" %(loss/(num), 100 * (top1/index)))
51
+ sys.stderr.flush()
52
+ sys.stdout.write("\n")
53
+ return loss / num, lr
54
+
55
+ def evaluate_network(self, loader, evalCsvSave, evalOrig, **kwargs):
56
+ self.eval()
57
+ predScores = []
58
+ for audioFeature, visualFeature, labels in tqdm.tqdm(loader):
59
+ with torch.no_grad():
60
+ audioEmbed = self.model.forward_audio_frontend(audioFeature[0].cuda())
61
+ visualEmbed = self.model.forward_visual_frontend(visualFeature[0].cuda())
62
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(
63
+ audioEmbed, visualEmbed)
64
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed)
65
+ labels = labels[0].reshape((-1)).cuda()
66
+ _, predScore, _, _ = self.lossAV.forward(outsAV, labels)
67
+ predScore = predScore[:, 1].detach().cpu().numpy()
68
+ predScores.extend(predScore)
69
+ evalLines = open(evalOrig).read().splitlines()[1:]
70
+ labels = []
71
+ labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines])
72
+ scores = pandas.Series(predScores)
73
+ evalRes = pandas.read_csv(evalOrig)
74
+ evalRes['score'] = scores
75
+ evalRes['label'] = labels
76
+ evalRes.drop(['label_id'], axis=1, inplace=True)
77
+ evalRes.drop(['instance_id'], axis=1, inplace=True)
78
+ evalRes.to_csv(evalCsvSave, index=False)
79
+ cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig,
80
+ evalCsvSave)
81
+ mAP = float(
82
+ str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5])
83
+ return mAP
84
+
85
+ def saveParameters(self, path):
86
+ torch.save(self.state_dict(), path)
87
+
88
+ def loadParameters(self, path):
89
+ selfState = self.state_dict()
90
+ loadedState = torch.load(path)
91
+ for name, param in loadedState.items():
92
+ origName = name
93
+ if name not in selfState:
94
+ name = name.replace("module.", "")
95
+ if name not in selfState:
96
+ print("%s is not in the model." % origName)
97
+ continue
98
+ if selfState[name].size() != loadedState[origName].size():
99
+ sys.stderr.write("Wrong parameter length: %s, model: %s, loaded: %s" %
100
+ (origName, selfState[name].size(), loadedState[origName].size()))
101
+ continue
102
+ selfState[name].copy_(param)
legacy/trainTalkNet_multicard.py ADDED
@@ -0,0 +1,171 @@
1
+ import time, os, torch, argparse, warnings, glob
2
+
3
+ from utils.tools import *
4
+ from dlhammer import bootstrap
5
+ import pytorch_lightning as pl
6
+ from pytorch_lightning import Trainer, seed_everything
7
+ from pytorch_lightning.callbacks import ModelCheckpoint
8
+ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
9
+
10
+
11
+ class MyCollator(object):
12
+
13
+ def __init__(self, cfg):
14
+ self.cfg = cfg
15
+
16
+ def __call__(self, data):
17
+ audiofeatures = [item[0] for item in data]
18
+ visualfeatures = [item[1] for item in data]
19
+ labels = [item[2] for item in data]
20
+ masks = [item[3] for item in data]
21
+ cut_limit = self.cfg.MODEL.CLIP_LENGTH
22
+ # pad audio
23
+ lengths = torch.tensor([t.shape[1] for t in audiofeatures])
24
+ max_len = max(lengths)
25
+ padded_audio = torch.stack([
26
+ torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1)
27
+ for i in audiofeatures
28
+ ], 0)
29
+
30
+ if max_len > cut_limit * 4:
31
+ padded_audio = padded_audio[:, :, :cut_limit * 4, ...]
32
+
33
+ # pad video
34
+ lengths = torch.tensor([t.shape[1] for t in visualfeatures])
35
+ max_len = max(lengths)
36
+ padded_video = torch.stack([
37
+ torch.cat(
38
+ [i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1)
39
+ for i in visualfeatures
40
+ ], 0)
41
+ padded_labels = torch.stack(
42
+ [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0)
43
+ padded_masks = torch.stack(
44
+ [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0)
45
+
46
+ if max_len > cut_limit:
47
+ padded_video = padded_video[:, :, :cut_limit, ...]
48
+ padded_labels = padded_labels[:, :, :cut_limit, ...]
49
+ padded_masks = padded_masks[:, :, :cut_limit, ...]
50
+ return padded_audio, padded_video, padded_labels, padded_masks
51
+
52
+
53
+ class DataPrep(pl.LightningDataModule):
54
+
55
+ def __init__(self, cfg):
56
+ self.cfg = cfg
57
+
58
+ def train_dataloader(self):
59
+ cfg = self.cfg
60
+
61
+ if self.cfg.MODEL.NAME == "baseline":
62
+ from dataLoader import train_loader, val_loader
63
+ loader = train_loader(trialFileName = cfg.trainTrialAVA, \
64
+ audioPath = os.path.join(cfg.audioPathAVA , 'train'), \
65
+ visualPath = os.path.join(cfg.visualPathAVA, 'train'), \
66
+ batchSize=2500
67
+ )
68
+ elif self.cfg.MODEL.NAME == "multi":
69
+ from dataLoader_multiperson import train_loader, val_loader
70
+ loader = train_loader(trialFileName = cfg.trainTrialAVA, \
71
+ audioPath = os.path.join(cfg.audioPathAVA , 'train'), \
72
+ visualPath = os.path.join(cfg.visualPathAVA, 'train'), \
73
+ num_speakers=cfg.MODEL.NUM_SPEAKERS,
74
+ )
75
+ if cfg.MODEL.NAME == "baseline":
76
+ trainLoader = torch.utils.data.DataLoader(
77
+ loader,
78
+ batch_size=1,
79
+ shuffle=True,
80
+ num_workers=4,
81
+ )
82
+ elif cfg.MODEL.NAME == "multi":
83
+ collator = MyCollator(cfg)
84
+ trainLoader = torch.utils.data.DataLoader(loader,
85
+ batch_size=1,
86
+ shuffle=True,
87
+ num_workers=4,
88
+ collate_fn=collator)
89
+
90
+ return trainLoader
91
+
92
+ def val_dataloader(self):
93
+ cfg = self.cfg
94
+ loader = val_loader(trialFileName = cfg.evalTrialAVA, \
95
+ audioPath = os.path.join(cfg.audioPathAVA , cfg.evalDataType), \
96
+ visualPath = os.path.join(cfg.visualPathAVA, cfg.evalDataType), \
97
+ )
98
+ valLoader = torch.utils.data.DataLoader(loader,
99
+ batch_size=cfg.VAL.BATCH_SIZE,
100
+ shuffle=False,
101
+ num_workers=16)
102
+ return valLoader
103
+
104
+
105
+ def main():
106
+ # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer
107
+ cfg = bootstrap(print_cfg=False)
108
+ print(cfg)
109
+
110
+ warnings.filterwarnings("ignore")
111
+ seed_everything(42, workers=True)
112
+
113
+ cfg = init_args(cfg)
114
+
115
+ # checkpoint_callback = ModelCheckpoint(dirpath=os.path.join(cfg.WORKSPACE, "model"),
116
+ # save_top_k=-1,
117
+ # filename='{epoch}')
118
+
119
+ data = DataPrep(cfg)
120
+
121
+ trainer = Trainer(
122
+ gpus=int(cfg.TRAIN.TRAINER_GPU),
123
+ precision=32,
124
+ # callbacks=[checkpoint_callback],
125
+ max_epochs=25,
126
+ replace_sampler_ddp=True)
127
+ # val_trainer = Trainer(deterministic=True, num_sanity_val_steps=-1, gpus=1)
128
+ if cfg.downloadAVA == True:
129
+ preprocess_AVA(cfg)
130
+ quit()
131
+
132
+ # if cfg.RESUME:
133
+ # modelfiles = glob.glob('%s/model_0*.model' % cfg.modelSavePath)
134
+ # modelfiles.sort()
135
+ # if len(modelfiles) >= 1:
136
+ # print("Model %s loaded from previous state!" % modelfiles[-1])
137
+ # epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1
138
+ # s = talkNet(cfg)
139
+ # s.loadParameters(modelfiles[-1])
140
+ # else:
141
+ # epoch = 1
142
+ # s = talkNet(cfg)
143
+ epoch = 1
144
+ if cfg.MODEL.NAME == "baseline":
145
+ from talkNet_multicard import talkNet
146
+ elif cfg.MODEL.NAME == "multi":
147
+ from talkNet_multi import talkNet
148
+
149
+ s = talkNet(cfg)
150
+
151
+ # scoreFile = open(cfg.scoreSavePath, "a+")
152
+
153
+ trainer.fit(s, train_dataloaders=data.train_dataloader())
154
+
155
+ modelfiles = glob.glob('%s/*.pth' % os.path.join(cfg.WORKSPACE, "model"))
156
+
157
+ modelfiles.sort()
158
+ for path in modelfiles:
159
+ s.loadParameters(path)
160
+ prec = trainer.validate(s, data.val_dataloader())
161
+
162
+ # if epoch % cfg.testInterval == 0:
163
+ # s.saveParameters(cfg.modelSavePath + "/model_%04d.model" % epoch)
164
+ # trainer.validate(dataloaders=valLoader)
165
+ # print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, mAP %2.2f%%" % (epoch, mAPs[-1]))
166
+ # scoreFile.write("%d epoch, LOSS %f, mAP %2.2f%%\n" % (epoch, loss, mAPs[-1]))
167
+ # scoreFile.flush()
168
+
169
+
170
+ if __name__ == '__main__':
171
+ main()
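
Both training scripts rely on the same padding-collator idea: every sample carries a variable number of frames, so tensors are zero-padded along the time axis to the longest clip in the batch and then truncated at the clip limit (with 4x that budget for audio, which has four feature frames per video frame). A toy, self-contained sketch with illustrative shapes:

```
import torch

cut_limit = 200                                        # stands in for cfg.MODEL.CLIP_LENGTH
# two samples, each (num_speakers, T*4, n_mels) audio features of different length
audio = [torch.randn(3, 120 * 4, 13), torch.randn(3, 90 * 4, 13)]
max_len = max(a.shape[1] for a in audio)
padded_audio = torch.stack([
    torch.cat([a, a.new_zeros((a.shape[0], max_len - a.shape[1], a.shape[2]))], dim=1)
    for a in audio
], dim=0)
if max_len > cut_limit * 4:                            # truncate over-long clips
    padded_audio = padded_audio[:, :, :cut_limit * 4, ...]
print(padded_audio.shape)                              # torch.Size([2, 3, 480, 13])
```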
legacy/train_multi.py ADDED
@@ -0,0 +1,156 @@
1
+ import time, os, torch, argparse, warnings, glob
2
+
3
+ from dataLoader_multiperson import train_loader, val_loader
4
+ from utils.tools import *
5
+ from talkNet_multi import talkNet
6
+
7
+
8
+ def collate_fn_padding(data):
9
+ audiofeatures = [item[0] for item in data]
10
+ visualfeatures = [item[1] for item in data]
11
+ labels = [item[2] for item in data]
12
+ masks = [item[3] for item in data]
13
+ cut_limit = 200
14
+ # pad audio
15
+ lengths = torch.tensor([t.shape[1] for t in audiofeatures])
16
+ max_len = max(lengths)
17
+ padded_audio = torch.stack([
18
+ torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2]))], 1)
19
+ for i in audiofeatures
20
+ ], 0)
21
+
22
+ if max_len > cut_limit * 4:
23
+ padded_audio = padded_audio[:, :, :cut_limit * 4, ...]
24
+
25
+ # pad video
26
+ lengths = torch.tensor([t.shape[1] for t in visualfeatures])
27
+ max_len = max(lengths)
28
+ padded_video = torch.stack([
29
+ torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1], i.shape[2], i.shape[3]))], 1)
30
+ for i in visualfeatures
31
+ ], 0)
32
+ padded_labels = torch.stack(
33
+ [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in labels], 0)
34
+ padded_masks = torch.stack(
35
+ [torch.cat([i, i.new_zeros((i.shape[0], max_len - i.shape[1]))], 1) for i in masks], 0)
36
+
37
+ if max_len > cut_limit:
38
+ padded_video = padded_video[:, :, :cut_limit, ...]
39
+ padded_labels = padded_labels[:, :, :cut_limit, ...]
40
+ padded_masks = padded_masks[:, :, :cut_limit, ...]
41
+ # print(padded_audio.shape, padded_video.shape, padded_labels.shape, padded_masks.shape)
42
+ return padded_audio, padded_video, padded_labels, padded_masks
43
+
44
+
45
+ def main():
46
+ # The structure of this code is learnt from https://github.com/clovaai/voxceleb_trainer
47
+ warnings.filterwarnings("ignore")
48
+
49
+ parser = argparse.ArgumentParser(description="TalkNet Training")
50
+ # Training details
51
+ parser.add_argument('--lr', type=float, default=0.0001, help='Learning rate')
52
+ parser.add_argument('--lrDecay', type=float, default=0.95, help='Learning rate decay rate')
53
+ parser.add_argument('--maxEpoch', type=int, default=25, help='Maximum number of epochs')
54
+ parser.add_argument('--testInterval',
55
+ type=int,
56
+ default=1,
57
+ help='Test and save every [testInterval] epochs')
58
+ parser.add_argument(
59
+ '--batchSize',
60
+ type=int,
61
+ default=2500,
62
+ help=
63
+ 'Dynamic batch size, default is 2500 frames, other batchsize (such as 1500) will not affect the performance'
64
+ )
65
+ parser.add_argument('--batch_size', type=int, default=1, help='batch_size')
66
+ parser.add_argument('--num_speakers', type=int, default=5, help='num_speakers')
67
+ parser.add_argument('--nDataLoaderThread', type=int, default=4, help='Number of loader threads')
68
+ # Data path
69
+ parser.add_argument('--dataPathAVA',
70
+ type=str,
71
+ default="/data08/AVA",
72
+ help='Save path of AVA dataset')
73
+ parser.add_argument('--savePath', type=str, default="exps/exp1")
74
+ # Data selection
75
+ parser.add_argument('--evalDataType',
76
+ type=str,
77
+ default="val",
78
+ help='Only for AVA, to choose the dataset for evaluation, val or test')
79
+ # For download dataset only, for evaluation only
80
+ parser.add_argument('--downloadAVA',
81
+ dest='downloadAVA',
82
+ action='store_true',
83
+ help='Only download AVA dataset and do related preprocess')
84
+ parser.add_argument('--evaluation',
85
+ dest='evaluation',
86
+ action='store_true',
87
+ help='Only do evaluation by using pretrained model [pretrain_AVA.model]')
88
+ args = parser.parse_args()
89
+ # Data loader
90
+ args = init_args(args)
91
+
92
+ if args.downloadAVA == True:
93
+ preprocess_AVA(args)
94
+ quit()
95
+
96
+ loader = train_loader(trialFileName = args.trainTrialAVA, \
97
+ audioPath = os.path.join(args.audioPathAVA , 'train'), \
98
+ visualPath = os.path.join(args.visualPathAVA, 'train'), \
99
+ # num_speakers = args.num_speakers, \
100
+ **vars(args))
101
+ trainLoader = torch.utils.data.DataLoader(loader,
102
+ batch_size=args.batch_size,
103
+ shuffle=True,
104
+ num_workers=args.nDataLoaderThread,
105
+ collate_fn=collate_fn_padding)
106
+
107
+ loader = val_loader(trialFileName = args.evalTrialAVA, \
108
+ audioPath = os.path.join(args.audioPathAVA , args.evalDataType), \
109
+ visualPath = os.path.join(args.visualPathAVA, args.evalDataType), \
110
+ # num_speakers = args.num_speakers, \
111
+ **vars(args))
112
+ valLoader = torch.utils.data.DataLoader(loader, batch_size=1, shuffle=False, num_workers=16)
113
+
114
+ if args.evaluation == True:
115
+ download_pretrain_model_AVA()
116
+ s = talkNet(**vars(args))
117
+ s.loadParameters('pretrain_AVA.model')
118
+ print("Model %s loaded from previous state!" % ('pretrain_AVA.model'))
119
+ mAP = s.evaluate_network(loader=valLoader, **vars(args))
120
+ print("mAP %2.2f%%" % (mAP))
121
+ quit()
122
+
123
+ modelfiles = glob.glob('%s/model_0*.model' % args.modelSavePath)
124
+ modelfiles.sort()
125
+ if len(modelfiles) >= 1:
126
+ print("Model %s loaded from previous state!" % modelfiles[-1])
127
+ epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1
128
+ s = talkNet(epoch=epoch, **vars(args))
129
+ s.loadParameters(modelfiles[-1])
130
+ else:
131
+ epoch = 1
132
+ s = talkNet(epoch=epoch, **vars(args))
133
+
134
+ mAPs = []
135
+ scoreFile = open(args.scoreSavePath, "a+")
136
+
137
+ while (1):
138
+ loss, lr = s.train_network(epoch=epoch, loader=trainLoader, **vars(args))
139
+
140
+ if epoch % args.testInterval == 0:
141
+ s.saveParameters(args.modelSavePath + "/model_%04d.model" % epoch)
142
+ mAPs.append(s.evaluate_network(epoch=epoch, loader=valLoader, **vars(args)))
143
+ print(time.strftime("%Y-%m-%d %H:%M:%S"),
144
+ "%d epoch, mAP %2.2f%%, bestmAP %2.2f%%" % (epoch, mAPs[-1], max(mAPs)))
145
+ scoreFile.write("%d epoch, LR %f, LOSS %f, mAP %2.2f%%, bestmAP %2.2f%%\n" %
146
+ (epoch, lr, loss, mAPs[-1], max(mAPs)))
147
+ scoreFile.flush()
148
+
149
+ if epoch >= args.maxEpoch:
150
+ quit()
151
+
152
+ epoch += 1
153
+
154
+
155
+ if __name__ == '__main__':
156
+ main()
loconet.py ADDED
@@ -0,0 +1,182 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ import sys, time, numpy, os, subprocess, pandas, tqdm
6
+
7
+ from loss_multi import lossAV, lossA, lossV
8
+ from model.loconet_encoder import locoencoder
9
+
10
+ import torch.distributed as dist
11
+ from xxlib.utils.distributed import all_gather, all_reduce
12
+
13
+
14
+ class Loconet(nn.Module):
15
+
16
+ def __init__(self, cfg):
17
+ super(Loconet, self).__init__()
18
+ self.cfg = cfg
19
+ self.model = locoencoder(cfg)
20
+ self.lossAV = lossAV()
21
+ self.lossA = lossA()
22
+ self.lossV = lossV()
23
+
24
+ def forward(self, audioFeature, visualFeature, labels, masks):
25
+ b, s, t = visualFeature.shape[:3]
26
+ visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:])
27
+ labels = labels.view(b * s, *labels.shape[2:])
28
+ masks = masks.view(b * s, *masks.shape[2:])
29
+
30
+ audioEmbed = self.model.forward_audio_frontend(audioFeature) # B, C, T, 4
31
+ visualEmbed = self.model.forward_visual_frontend(visualFeature)
32
+ audioEmbed = audioEmbed.repeat(s, 1, 1)
33
+
34
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(audioEmbed, visualEmbed)
35
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s)
36
+ outsA = self.model.forward_audio_backend(audioEmbed)
37
+ outsV = self.model.forward_visual_backend(visualEmbed)
38
+
39
+ labels = labels.reshape((-1))
40
+ masks = masks.reshape((-1))
41
+ nlossAV, _, _, prec = self.lossAV.forward(outsAV, labels, masks)
42
+ nlossA = self.lossA.forward(outsA, labels, masks)
43
+ nlossV = self.lossV.forward(outsV, labels, masks)
44
+
45
+ nloss = nlossAV + 0.4 * nlossA + 0.4 * nlossV
46
+
47
+ num_frames = masks.sum()
48
+ return nloss, prec, num_frames
49
+
50
+
51
+ class loconet(nn.Module):
52
+
53
+ def __init__(self, cfg, rank=None, device=None):
54
+ super(loconet, self).__init__()
55
+ self.cfg = cfg
56
+ self.rank = rank
57
+ if rank != None:
58
+ self.rank = rank
59
+ self.device = device
60
+
61
+ self.model = Loconet(cfg).to(device)
62
+ self.model = nn.SyncBatchNorm.convert_sync_batchnorm(self.model)
63
+ self.model = nn.parallel.DistributedDataParallel(self.model,
64
+ device_ids=[rank],
65
+ output_device=rank,
66
+ find_unused_parameters=False)
67
+ self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.SOLVER.BASE_LR)
68
+ self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim,
69
+ step_size=1,
70
+ gamma=self.cfg.SOLVER.SCHEDULER.GAMMA)
71
+ else:
72
+ self.model = locoencoder(cfg).cuda()
73
+ self.lossAV = lossAV().cuda()
74
+ self.lossA = lossA().cuda()
75
+ self.lossV = lossV().cuda()
76
+
77
+ print(
78
+ time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" %
79
+ (sum(param.numel() for param in self.model.parameters()) / 1024 / 1024))
80
+
81
+ def train_network(self, epoch, loader):
82
+ self.model.train()
83
+ self.scheduler.step(epoch - 1)
84
+ index, top1, loss = 0, 0, 0
85
+ lr = self.optim.param_groups[0]['lr']
86
+ loader.sampler.set_epoch(epoch)
87
+ device = self.device
88
+
89
+ pbar = enumerate(loader, start=1)
90
+ if self.rank == 0:
91
+ pbar = tqdm.tqdm(pbar, total=loader.__len__())
92
+
93
+ for num, (audioFeature, visualFeature, labels, masks) in pbar:
94
+
95
+ audioFeature = audioFeature.to(device)
96
+ visualFeature = visualFeature.to(device)
97
+ labels = labels.to(device)
98
+ masks = masks.to(device)
99
+ nloss, prec, num_frames = self.model(
100
+ audioFeature,
101
+ visualFeature,
102
+ labels,
103
+ masks,
104
+ )
105
+
106
+ self.optim.zero_grad()
107
+ nloss.backward()
108
+ self.optim.step()
109
+
110
+ [nloss, prec, num_frames] = all_reduce([nloss, prec, num_frames], average=False)
111
+ top1 += prec.detach().cpu().numpy()
112
+ loss += nloss.detach().cpu().numpy()
113
+ index += int(num_frames.detach().cpu().item())
114
+ if self.rank == 0:
115
+ pbar.set_postfix(
116
+ dict(epoch=epoch,
117
+ lr=lr,
118
+ loss=loss / (num * self.cfg.NUM_GPUS),
119
+ acc=(top1 / index)))
120
+ dist.barrier()
121
+ return loss / num, lr
122
+
123
+ def evaluate_network(self, epoch, loader):
124
+ self.eval()
125
+ predScores = []
126
+ evalCsvSave = os.path.join(self.cfg.WORKSPACE, "{}_res.csv".format(epoch))
127
+ evalOrig = self.cfg.evalOrig
128
+ for audioFeature, visualFeature, labels, masks in tqdm.tqdm(loader):
129
+ with torch.no_grad():
130
+ audioFeature = audioFeature.cuda()
131
+ visualFeature = visualFeature.cuda()
132
+ labels = labels.cuda()
133
+ masks = masks.cuda()
134
+ b, s, t = visualFeature.shape[0], visualFeature.shape[1], visualFeature.shape[2]
135
+ visualFeature = visualFeature.view(b * s, *visualFeature.shape[2:])
136
+ labels = labels.view(b * s, *labels.shape[2:])
137
+ masks = masks.view(b * s, *masks.shape[2:])
138
+ audioEmbed = self.model.forward_audio_frontend(audioFeature)
139
+ visualEmbed = self.model.forward_visual_frontend(visualFeature)
140
+ audioEmbed = audioEmbed.repeat(s, 1, 1)
141
+ audioEmbed, visualEmbed = self.model.forward_cross_attention(
142
+ audioEmbed, visualEmbed)
143
+ outsAV = self.model.forward_audio_visual_backend(audioEmbed, visualEmbed, b, s)
144
+ labels = labels.reshape((-1))
145
+ masks = masks.reshape((-1))
146
+ outsAV = outsAV.view(b, s, t, -1)[:, 0, :, :].view(b * t, -1)
147
+ labels = labels.view(b, s, t)[:, 0, :].view(b * t).cuda()
148
+ masks = masks.view(b, s, t)[:, 0, :].view(b * t)
149
+ _, predScore, _, _ = self.lossAV.forward(outsAV, labels, masks)
150
+ predScore = predScore[:, 1].detach().cpu().numpy()
151
+ predScores.extend(predScore)
152
+ evalLines = open(evalOrig).read().splitlines()[1:]
153
+ labels = []
154
+ labels = pandas.Series(['SPEAKING_AUDIBLE' for line in evalLines])
155
+ scores = pandas.Series(predScores)
156
+ evalRes = pandas.read_csv(evalOrig)
157
+ evalRes['score'] = scores
158
+ evalRes['label'] = labels
159
+ evalRes.drop(['label_id'], axis=1, inplace=True)
160
+ evalRes.drop(['instance_id'], axis=1, inplace=True)
161
+ evalRes.to_csv(evalCsvSave, index=False)
162
+ cmd = "python -O utils/get_ava_active_speaker_performance.py -g %s -p %s " % (evalOrig,
163
+ evalCsvSave)
164
+ mAP = float(
165
+ str(subprocess.run(cmd, shell=True, capture_output=True).stdout).split(' ')[2][:5])
166
+ return mAP
167
+
168
+ def saveParameters(self, path):
169
+ torch.save(self.state_dict(), path)
170
+
171
+ def loadParameters(self, path):
172
+ selfState = self.state_dict()
173
+ loadedState = torch.load(path, map_location='cpu')
174
+ if self.rank != None:
175
+ info = self.load_state_dict(loadedState)
176
+ else:
177
+ new_state = {}
178
+
179
+ for k, v in loadedState.items():
180
+ new_state[k.replace("model.module.", "")] = v
181
+ info = self.load_state_dict(new_state, strict=False)
182
+ print(info)
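
For reference, a hedged single-GPU evaluation sketch of the wrapper above; `cfg`, the checkpoint path, and `val_loader` are placeholders that must be built the same way as in the training entry point.

```
from loconet import loconet

model = loconet(cfg)                              # rank=None -> plain encoder + losses on one GPU
model.loadParameters('path/to/checkpoint.pth')    # placeholder path; strips the "model.module." prefix
mAP = model.evaluate_network(epoch=0, loader=val_loader)
print('mAP: %.2f%%' % mAP)
```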
loss_multi.py ADDED
@@ -0,0 +1,72 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import utils.distributed as du
+
+
+class lossAV(nn.Module):
+
+    def __init__(self):
+        super(lossAV, self).__init__()
+        self.criterion = nn.CrossEntropyLoss(reduction='none')
+        self.FC = nn.Linear(256, 2)
+
+    def forward(self, x, labels=None, masks=None):
+        x = x.squeeze(1)
+        x = self.FC(x)
+        if labels == None:
+            predScore = x[:, 1]
+            predScore = predScore.t()
+            predScore = predScore.view(-1).detach().cpu().numpy()
+            return predScore
+        else:
+            nloss = self.criterion(x, labels) * masks
+
+            num_valid = masks.sum().float()
+            if self.training:
+                [num_valid] = du.all_reduce([num_valid], average=True)
+            nloss = torch.sum(nloss) / num_valid
+
+            predScore = F.softmax(x, dim=-1)
+            predLabel = torch.round(F.softmax(x, dim=-1))[:, 1]
+            correctNum = ((predLabel == labels) * masks).sum().float()
+            return nloss, predScore, predLabel, correctNum
+
+
+class lossA(nn.Module):
+
+    def __init__(self):
+        super(lossA, self).__init__()
+        self.criterion = nn.CrossEntropyLoss(reduction='none')
+        self.FC = nn.Linear(128, 2)
+
+    def forward(self, x, labels, masks=None):
+        x = x.squeeze(1)
+        x = self.FC(x)
+        nloss = self.criterion(x, labels) * masks
+        num_valid = masks.sum().float()
+        if self.training:
+            [num_valid] = du.all_reduce([num_valid], average=True)
+        nloss = torch.sum(nloss) / num_valid
+        #nloss = torch.sum(nloss) / torch.sum(masks)
+        return nloss
+
+
+class lossV(nn.Module):
+
+    def __init__(self):
+        super(lossV, self).__init__()
+
+        self.criterion = nn.CrossEntropyLoss(reduction='none')
+        self.FC = nn.Linear(128, 2)
+
+    def forward(self, x, labels, masks=None):
+        x = x.squeeze(1)
+        x = self.FC(x)
+        nloss = self.criterion(x, labels) * masks
+        # nloss = torch.sum(nloss) / torch.sum(masks)
+        num_valid = masks.sum().float()
+        if self.training:
+            [num_valid] = du.all_reduce([num_valid], average=True)
+        nloss = torch.sum(nloss) / num_valid
+        return nloss
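
A shape-level sketch of the masked losses, assuming the repo's `utils.distributed` module is importable: logits come from 256-d fused features per frame, and `masks` zeroes out padded frames so they contribute neither to the loss normalisation nor to the accuracy count.

```
import torch
from loss_multi import lossAV

criterion = lossAV()
criterion.eval()                       # eval mode skips the distributed all_reduce branch
feats = torch.randn(10, 256)           # 10 frames of fused audio-visual features
labels = torch.randint(0, 2, (10,))    # per-frame speaking / not-speaking targets
masks = torch.ones(10)
masks[8:] = 0                          # last two frames are padding and are ignored

nloss, pred_score, pred_label, correct = criterion(feats, labels, masks)
print(float(nloss), float(correct))
```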
metrics/AverageMeter.py ADDED
@@ -0,0 +1,18 @@
+#taken from pytorch imagenet example
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
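
A typical training-loop sketch for `AverageMeter` (run from the repository root so `metrics` is importable); `n` is the number of samples behind each batch value, keeping `avg` sample-weighted.

```
from metrics.AverageMeter import AverageMeter

loss_meter = AverageMeter()
for batch_loss, batch_size in [(0.9, 32), (0.7, 32), (0.5, 16)]:
    loss_meter.update(batch_loss, n=batch_size)
print('last batch: %.3f  running average: %.3f' % (loss_meter.val, loss_meter.avg))
```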
metrics/__pycache__/.nfs000000035f4a8257000000eb ADDED
Binary file (896 Bytes). View file
 
metrics/__pycache__/AverageMeter.cpython-36.pyc ADDED
Binary file (897 Bytes). View file
 
metrics/__pycache__/AverageMeter.cpython-38.pyc ADDED
Binary file (908 Bytes). View file
 
metrics/__pycache__/accuracy.cpython-36.pyc ADDED
Binary file (870 Bytes). View file
 
metrics/__pycache__/accuracy.cpython-38.pyc ADDED
Binary file (876 Bytes). View file
 
metrics/accuracy.py ADDED
@@ -0,0 +1,20 @@
+import torch
+
+accuracy = lambda output,target : acc_topk(output, target)[0]
+
+#taken from pytorch imagenet example
+def acc_topk(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(1.0 / batch_size))
+        return res
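
A quick sketch of `acc_topk` on dummy logits: it returns one tensor per requested `k`, each holding the fraction of samples whose target falls in the top-k predictions.

```
import torch
from metrics.accuracy import acc_topk, accuracy

logits = torch.tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
targets = torch.tensor([1, 0, 0])
(top1,) = acc_topk(logits, targets, topk=(1,))
print(float(top1))                       # 2 of 3 targets match the argmax here -> ~0.667
print(float(accuracy(logits, targets)))  # same value via the convenience lambda
```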
model/.DS_Store ADDED
Binary file (6.15 kB). View file
 
model/__init__.py ADDED
@@ -0,0 +1,5 @@
+from model.transformer.position_encoding import PositionalEncoding
+from model.transformer.transformer import Transformer
+from model.transformer.transformer import TransformerEncoder, TransformerEncoderLayer
+from model.transformer.transformer import TransformerDecoder, TransformerDecoderLayer
+from model.transformer.utils import layer_norm, generate_square_subsequent_mask, generate_proposal_mask
model/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (561 Bytes). View file
 
model/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (573 Bytes). View file
 
model/__pycache__/attentionLayer.cpython-37.pyc ADDED
Binary file (1.38 kB). View file
 
model/__pycache__/convLayer.cpython-37.pyc ADDED
Binary file (1.32 kB). View file
 
model/__pycache__/loconet_encoder.cpython-37.pyc ADDED
Binary file (3.21 kB). View file
 
model/__pycache__/position_encoding.cpython-36.pyc ADDED
Binary file (1.26 kB). View file
 
model/__pycache__/talkNetModel.cpython-37.pyc ADDED
Binary file (6.33 kB). View file
 
model/__pycache__/transformer.cpython-36.pyc ADDED
Binary file (8.84 kB). View file
 
model/__pycache__/utils.cpython-36.pyc ADDED
Binary file (1.08 kB). View file