import os import sys import math import tarfile class DataManager: def __init__(self, dataset_path): self.dataset_path = dataset_path def extract_dataset(self, compressed_dataset_file_name, dataset_directory): try: # extract files to dataset folder tar = tarfile.open(compressed_dataset_file_name, "r:gz") tar.extractall(dataset_directory) tar.close() print("Files extraction was successfull ...") except: print("Ecxception raised: No extraction was done ...") def make_folder(self, folder_path): try: os.mkdir(folder_path) print(folder_path, "was created ...") except: print("Ecxception raised: ", folder_path, "could not be created ...") def move_files(self, src, dst, group): for fname in group: os.rename(src + '/' + fname, dst + '/' + fname) def get_fnames_from_dict(self, dataset_dict, f_or_m): training_data, testing_data = [], [] for i in range(1,5): length_data = len(dataset_dict[f_or_m +"000" + str(i)]) length_separator = math.trunc(length_data*2/3) training_data += dataset_dict[f_or_m + "000" + str(i)][:length_separator] testing_data += dataset_dict[f_or_m + "000" + str(i)][length_separator:] return training_data, testing_data def manage(self): # read config file and get path to compressed dataset compressed_dataset_file_name = self.dataset_path dataset_directory = compressed_dataset_file_name.split(".")[0] # create a folder for the data try: os.mkdir(dataset_directory) except: pass # extract dataset self.extract_dataset(compressed_dataset_file_name, dataset_directory) # select females files and males files file_names = [fname for fname in os.listdir(dataset_directory) if ("f0" in fname or "m0" in fname)] dataset_dict = {"f0001": [], "f0002": [], "f0003": [], "f0004": [], "f0005": [], "m0001": [], "m0002": [], "m0003": [], "m0004": [], "m0005": [], } # fill in dictionary for fname in file_names: dataset_dict[fname.split('_')[0]].append(fname) # divide and group file names training_set, testing_set = {},{} training_set["females"], testing_set["females"] = self.get_fnames_from_dict(dataset_dict, "f") training_set["males" ], testing_set["males" ] = self.get_fnames_from_dict(dataset_dict, "m") # make training and testing folders self.make_folder("TrainingData") self.make_folder("TestingData") self.make_folder("TrainingData/females") self.make_folder("TrainingData/males") self.make_folder("TestingData/females") self.make_folder("TestingData/males") # move files self.move_files(dataset_directory, "TrainingData/females", training_set["females"]) self.move_files(dataset_directory, "TrainingData/males", training_set["males"]) self.move_files(dataset_directory, "TestingData/females", testing_set["females"]) self.move_files(dataset_directory, "TestingData/males", testing_set["males"]) if __name__== "__main__": data_manager = DataManager("SLR45.tgz") data_manager.manage()