# Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import glob import argparse from utils.dedup import deup import sys WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exitting..."') sys.exit(-1) def main(): parser = argparse.ArgumentParser() parser.add_argument("--from-folder", type=str, required=True, help="the data folder to be dedup") parser.add_argument("--to-folder", type=str, required=True, help="the data folder to save deduped data") parser.add_argument('--directions', type=str, default=None, required=False) args = parser.parse_args() if args.directions is None: raw_files = glob.glob(f'{args.from_folder}/train*') directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files] else: directions = args.directions.split(',') directions = sorted(set(directions)) for direction in directions: src, tgt = direction.split('-') src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}' tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}' src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}' tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}' assert src_file != src_file_out assert tgt_file != tgt_file_out print(f'deduping {src_file}, {tgt_file}') deup(src_file, tgt_file, src_file_out, tgt_file_out) if __name__ == "__main__": main()