| import os
|
| import urllib.request
|
| import tarfile
|
| from pathlib import Path
|
| import shutil
|
| import zipfile
|
| import os
|
|
|
def get_archive(path, url, Set):
    """
    Download and save an archive file from a given URL.

    Inputs:
        path (str): The directory path where the archive will be saved. It is
            created, together with any missing parent directories, when it
            does not exist yet.
        url (str): The URL of the archive to download.
        Set (str): The name to be used for the saved archive file. The file is
            always written as ``<path>/<Set>.tar``, whatever extension the
            URL itself carries.

    Returns:
        None

    Raises:
        OSError: If ``path`` cannot be created, e.g. it already exists but is
            not a directory. Errors raised by the download itself are not
            caught here and propagate to the caller.
    """
    # exist_ok=True tolerates an already-existing directory without hiding
    # genuine failures (permissions, a file in the way) the way a bare
    # ``except`` would.
    os.makedirs(path, exist_ok=True)

    urllib.request.urlretrieve(url, f"{path}/{Set}.tar")
|
|
|
def extract_tar(tar_file):
    """
    Extract contents of a tar file and remove the original tar file.

    The archive is read from ``<cwd>/data/raw/<tar_file>.tar`` and unpacked
    into the directory ``<cwd>/data/raw/<tar_file>``.

    Inputs:
        tar_file (str): The name of the tar file to extract (without .tar extension).

    Returns:
        None
    """
    # Build both paths from a single working-directory lookup.
    extract_path = f'{os.getcwd()}/data/raw/{tar_file}'
    archive_path = f'{extract_path}.tar'

    # Progress output; the trailing '\r' lets the next print overwrite it.
    print(archive_path, end='\r')

    # The context manager closes the archive even when extraction fails.
    # NOTE(review): depending on the Python version, extractall() may not
    # sanitise member paths; only feed it archives from a trusted source.
    with tarfile.open(archive_path) as archive:
        archive.extractall(extract_path)

    # Reached only after a successful extraction, so a failed run keeps the
    # downloaded archive on disk.
    os.remove(archive_path)
|
|
|
def make_dir(target_dir):
    """
    Ensure ``target_dir`` exists as a freshly created directory.

    Inputs:
        target_dir (str): The path of the directory to create. If a directory
            is already present at this path it is deleted, contents included,
            before being created again.

    Returns:
        None
    """
    directory = Path(target_dir)

    # is_dir() is only true for a path that exists, so it covers the
    # "exists and is a directory" condition in one check.
    if directory.is_dir():
        shutil.rmtree(directory)

    # Also creates any missing parent directories.
    os.makedirs(target_dir, exist_ok=True)
|
|
|
def combine_dirs(source_dirs, target_dir=None):
    """
    Combine contents of multiple directories into a single target directory.

    Each name in ``source_dirs`` is resolved under ``<cwd>/data/raw/``. Every
    file below that directory whose name ends in ``.jpg`` is copied into
    ``target_dir``, after which the source directory is deleted. Source
    directories that do not exist are skipped.

    Inputs:
        source_dirs (list): A list of directory names to combine.
        target_dir (str, optional): Directory that receives the copied files.
            When omitted, the module-level ``target_dir`` variable is used,
            which keeps existing single-argument callers working.

    Returns:
        None

    Raises:
        NameError: If ``target_dir`` is omitted and no module-level
            ``target_dir`` variable is defined.
    """
    if target_dir is None:
        # Fall back to the global that the script entry point sets up. The
        # lookup happens before any file is touched, so a missing destination
        # fails early.
        try:
            target_dir = globals()['target_dir']
        except KeyError:
            raise NameError("name 'target_dir' is not defined") from None

    raw_root = os.getcwd() + '/data/raw/'

    for source_dir in source_dirs:
        source_path = raw_root + source_dir

        for subdir, _dirs, files in os.walk(source_path):
            for file in files:
                # Test the file name's suffix rather than searching the whole
                # path, so a '.jpg' inside a directory name or in the middle
                # of a file name does not count as a match.
                if file.endswith('.jpg'):
                    # All matches land directly in target_dir; the source
                    # sub-directory layout is not reproduced there.
                    shutil.copy(subdir + os.sep + file, target_dir)

        # Remove the source tree once its images have been copied out.
        if Path(source_path).exists():
            shutil.rmtree(Path(source_path))
|
|
|
def unzip_file(zip_file_path, extract_to):
    """
    Unpack every member of a zip archive into a directory.

    Inputs:
        zip_file_path (str): The path to the zip file to be extracted.
        extract_to (str): The directory where the contents should be
            extracted. It is created, with any missing parents, if absent;
            an existing directory is reused as is.

    Returns:
        None
    """
    # Make sure the destination exists before opening the archive.
    os.makedirs(extract_to, exist_ok=True)

    with zipfile.ZipFile(file=zip_file_path, mode='r') as archive:
        archive.extractall(path=extract_to)
|
|
|
|
|
if __name__ == '__main__':
    # Resolve the working directory once; nothing below changes it.
    cwd = os.getcwd()
    raw_dir = cwd + '/data/raw'

    # Start from fresh project directories (make_dir deletes existing ones).
    for project_dir in ('/data/raw', '/data/processed', '/data/outputs', '/models'):
        make_dir(cwd + project_dir)

    base_url = 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0'
    train_sets = [f'train{index}' for index in range(7)]

    # (remote archive stem, local archive name) pairs, in download order.
    downloads = (
        [('labels', 'label')]
        + [(f'train-{index}', f'train{index}') for index in range(7)]
        + [('val', 'val'), ('test', 'test')]
    )
    for remote_stem, local_name in downloads:
        get_archive(raw_dir, f'{base_url}/{remote_stem}.tar.gz', local_name)

    # Unpack each archive into data/raw/<name>; extract_tar removes the
    # archive file afterwards.
    for archive_name in train_sets + ['label', 'val', 'test']:
        extract_tar(archive_name)

    # combine_dirs reads this module-level name as its copy destination.
    target_dir = cwd + '/data/raw/train/publaynet/train/'
    make_dir(target_dir)

    source_dirs = train_sets
    combine_dirs(source_dirs)

    # NOTE(review): target_dir is unchanged at this point, so the val and
    # test images are copied into the same directory as the training images
    # -- confirm this is intended.
    source_dirs = ['val', 'test']
    combine_dirs(source_dirs)

    # The zip path is relative, so it is looked up in the working directory.
    unzip_file('hand_labeled_tables.zip', cwd + '/data/processed/hand_labeled_tables')