# -*- coding: utf-8 -*-
# file: compress_datasets.py
# time: 19:13 2023/2/5
# author: yangheng
# github: https://github.com/yangheng95
# huggingface: https://huggingface.co/yangheng
# google scholar: https://scholar.google.com/citations?user=NPq5a_0AAAAJ&hl=en
# Copyright (C) 2021. All Rights Reserved.

import os
import shutil
import zipfile
from pathlib import Path

import findfile

from pyabsa.utils.pyabsa_utils import fprint


def cascade_zip_datasets():
    """Zip every dataset directory found under ``integrated_datasets``.

    For each task folder discovered by ``findfile`` and each dataset
    directory inside it, an archive named
    ``integrated_datasets/<task>.<dataset>.zip`` (lowercased) is created
    containing every file beneath that dataset directory.

    Files are read from their real on-disk paths; only the *archive*
    member names are lowercased. (Lowercasing the read path, as older
    revisions did, raises ``FileNotFoundError`` on case-sensitive
    filesystems for any path containing uppercase characters.)
    """
    # NOTE(review): findfile.find_dirs signature assumed to be
    # (root, key, ...) returning matching directory paths — confirm
    # against the findfile package docs.
    datasets = findfile.find_dirs("integrated_datasets", "datasets", recursive=1)
    for dataset in datasets:
        # Skip the container folder itself (and a stray zip of it).
        if dataset in [
            "integrated_datasets",
            "integrated_datasets.zip",
        ]:
            continue
        task_name = Path(dataset).name
        for d in findfile.find_dirs(dataset, ""):
            fprint(f"compressing dataset: {d}")
            dataset_name = Path(d).name
            archive_name = f"integrated_datasets/{task_name}.{dataset_name}.zip".lower()
            # "with" guarantees the archive is closed (and flushed) even if
            # walking or writing raises, avoiding leaked handles and
            # truncated zip files.
            with zipfile.ZipFile(archive_name, "w", zipfile.ZIP_DEFLATED) as zip_file:
                for root, _dirs, files in os.walk(d):
                    for file in files:
                        src = os.path.join(root, file)
                        # Read from the real path; store under the
                        # lowercased member name to keep archive layout
                        # identical to the previous behavior.
                        zip_file.write(src, arcname=src.lower())


if __name__ == "__main__":
    # if os.path.exists('integrated_datasets'):
    #     try:
    #         shutil.rmtree('integrated_datasets')
    #     except:
    #         os.system('rm -rf integrated_datasets')
    #
    # from pyabsa import download_all_available_datasets
    #
    # download_all_available_datasets()

    cascade_zip_datasets()