"""Download, decompress and preprocess the raw sentence-compression data into CSV files."""

import csv
import glob
import gzip
import json
import os
import shutil

import yaml
from git import Repo

# Column names for the preprocessed CSV files.
fieldnames = ['original', 'compressed']


def to_csv_record(writer, buffer):
    """Parse one blank-line-delimited JSON record and write it as a CSV row."""
    record = json.loads(buffer)
    writer.writerow(dict(
        original=record['graph']['sentence'],
        compressed=record['compression']['text']))


def write_records(writer, raw_contents):
    """Stream blank-line-delimited JSON records from a file object into the CSV writer."""
    buffer = ''
    for line in raw_contents:
        if line.strip() == '':
            if buffer:
                to_csv_record(writer, buffer)
            buffer = ''
        else:
            buffer += line
    if buffer:
        to_csv_record(writer, buffer)


def build_dataset(rawdata_dir, preprocessed_data_dir):
    """Convert the decompressed raw JSON files into training and evaluation CSVs."""
    print("Data Preparation...")
    os.makedirs(preprocessed_data_dir, exist_ok=True)

    # Training split: every decompressed *train*.json file under data/.
    with open(os.path.join(preprocessed_data_dir, 'training_data.csv'), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for rawdata_file in glob.glob(os.path.join(rawdata_dir, 'data', '*train*.json')):
            with open(rawdata_file) as raw_contents:
                write_records(writer, raw_contents)

    # Evaluation split: the single comp-data.eval.json file.
    with open(os.path.join(preprocessed_data_dir, 'eval_data.csv'), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        with open(os.path.join(rawdata_dir, 'data', 'comp-data.eval.json')) as raw_contents:
            write_records(writer, raw_contents)


def decompressing_rawdata(rawdata_dir):
    """Decompress every .json.gz archive under data/ in place and delete the archives."""
    print("Decompression...")
    compressed_files = glob.glob(os.path.join(rawdata_dir, 'data', '*.json.gz'))
    for compressed_file_path in compressed_files:
        output_file_path = os.path.splitext(compressed_file_path)[0]  # strip the .gz suffix
        with gzip.open(compressed_file_path, 'rb') as comp_file:
            decompressed_content = comp_file.read()
        with open(output_file_path, 'wb') as output_file:
            output_file.write(decompressed_content)
        os.remove(compressed_file_path)


def download_rawdata(git_url, rawdata_dir):
    """Clone the raw-data git repository into rawdata_dir."""
    os.makedirs(rawdata_dir, exist_ok=True)
    print("Data Cloning...")
    try:
        Repo.clone_from(git_url, rawdata_dir)
    except Exception as e:
        print("Error:", e)


if __name__ == "__main__":
    with open("config.yaml", "r") as config_file:
        config = yaml.safe_load(config_file)

    sc_config = config["SENTENCE_COMPRESSION"]
    # PROJECT_DIR is stored in the config as a Python expression and evaluated here,
    # mirroring the original behaviour; config.yaml must therefore be trusted.
    PROJECT_DIR = eval(sc_config["PROJECT_DIR"])
    rawdata_git = sc_config["DATA"]["RAW_DATA"]
    preprocessed_data_dir = os.path.join(PROJECT_DIR, sc_config["DATA"]["CLEAN_DATA"])
    rawdata_dir = os.path.join(PROJECT_DIR, sc_config["DATA"]["RAW_DIR"])

    download_rawdata(rawdata_git, rawdata_dir)
    decompressing_rawdata(rawdata_dir)
    build_dataset(rawdata_dir, preprocessed_data_dir)
    # The raw checkout is no longer needed once the CSVs have been written.
    shutil.rmtree(rawdata_dir)
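

# Illustrative config.yaml layout. This is a sketch inferred from the lookups in
# __main__ above: the key names come from the code, the values are assumptions.
#
# SENTENCE_COMPRESSION:
#   PROJECT_DIR: "os.getcwd()"          # a Python expression, consumed by eval() above
#   DATA:
#     RAW_DATA: "<git URL of the raw sentence-compression dataset repository>"
#     RAW_DIR: "data/raw"
#     CLEAN_DATA: "data/preprocessed"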