"""Normalize a semicolon-separated report file in place.

The script sorts the rows, drops duplicate rows, rewrites arXiv PDF links to
their abstract-page form, and writes the result back with a blank line
between groups of rows that share their first two columns.
"""


def _row_key(row):
    """Return the fields that identify a row: columns 0-3 plus the last one."""
    return (row[0], row[1], row[2], row[3], row[-1])


def load_file(filename):
    """Read a semicolon-separated file into a header and a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(header, rows)`` tuple. ``header`` is the first line split on
        ``";"``. ``rows`` is a list with one entry per following non-blank
        line, each split on ``";"`` after stripping surrounding whitespace.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(filename, "r", encoding="utf-8") as f:
        header = f.readline().strip().split(";")
        rows = [line.strip().split(";") for line in f if line.strip()]
    return header, rows


def remove_duplicates(data):
    """Return ``data`` without rows whose identifying fields were already seen.

    Two rows are duplicates when columns 0-3 and the last column all match;
    the first occurrence is kept and the input order is preserved.

    Args:
        data: List of rows, each a list of strings with at least four fields.

    Returns:
        A new list containing the first row for every distinct key.
    """
    seen = set()
    unique = []
    for item in data:
        key = _row_key(item)
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique


def fix_arxiv_links(data):
    """Rewrite ``arxiv.org/pdf`` to ``arxiv.org/abs`` in each row's second-to-last field.

    Args:
        data: List of rows, each a list of strings with at least two fields.

    Returns:
        A new list of new row lists; the input rows are not modified.
    """
    return [
        [*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]]
        for item in data
    ]


def sort_data(data):
    """Return the rows sorted by columns 0-3 and then by the last column.

    Args:
        data: List of rows, each a list of strings with at least four fields.

    Returns:
        A new sorted list; rows with equal keys keep their relative order.
    """
    return sorted(data, key=_row_key)


def main(filename="contamination_report.csv"):
    """Clean up the report at ``filename`` and overwrite it with the result.

    Args:
        filename: Path of the report to read and rewrite. Defaults to
            ``contamination_report.csv`` in the current working directory.
    """
    header, data = load_file(filename)
    # Sort first so that de-duplication keeps the first row in sorted order.
    data = sort_data(data)
    data = remove_duplicates(data)
    data = fix_arxiv_links(data)
    print("Total datapoints:", len(data))

    with open(filename, "w", encoding="utf-8") as f:
        f.write(";".join(header) + "\n")
        past_key = None
        for line in data:
            key = (line[0], line[1])
            if key != past_key:
                # Blank separator line before every new (column 0, column 1)
                # group, including the first one right after the header.
                f.write("\n")
                past_key = key
            # NOTE(review): this inserts an empty field at index 3 on every
            # run; if the input rows already carry that empty field, repeated
            # runs would keep adding columns -- confirm against the data file.
            line = line[:3] + [""] + line[3:]
            f.write(";".join(line) + "\n")


if __name__ == "__main__":
    main()