import tqdm import json def filtered(): import json with open("data/lean4_basic/1k_test.jsonl", "r") as f: test_data = json.load(f) test_data = [item['statement_poof'] for item in test_data] # Function to filter items based on existence in test data with open("data/lean4_random/5k_second.json", "r") as f: second_5k = json.load(f) def filter_items(data, test_data): filtered_data = [item for item in tqdm.tqdm(data) if item['statement_poof'][:-2] not in test_data] return filtered_data # Filter and save filtered data filtered_second_5k = filter_items(second_5k, test_data) with open("data/lean4_random/5k_second_filtered.json", "w") as f: json.dump(filtered_second_5k, f, ensure_ascii=False, indent=2) print("Filtered second file length:", len(filtered_second_5k)) def insert_label_for_autoformalization(): input_lists = ["data/lean4_statement_translate/15k_state_problem_translation.json"] for input_file in input_lists: with open(input_file, "r") as f: test_data = json.load(f) for item in test_data: item['task']='statement_form' with open("data/lean4_statement_translate/15k_state_problem_translation_statement_form.json", "w") as f: json.dump(test_data, f, ensure_ascii=False, indent=2) input_lists = ["data/lean4_statement_translate/15k_state_problem_translation.json"] for input_file in input_lists: with open(input_file, "r") as f: test_data = json.load(f) for item in test_data: item['task']='statementproof_inform' with open("data/lean4_statement_translate/15k_state_problem_translation_statementproof_inform.json", "w") as f: json.dump(test_data, f, ensure_ascii=False, indent=2) input_lists = ["all_theorem.jsonl"] for input_file in input_lists: with open(input_file, "r") as f: test_data = json.load(f) for item in test_data: item['task']='solver' with open("data/all_theorem_solver.jsonl", "w") as f: json.dump(test_data, f, ensure_ascii=False, indent=2) if __name__ == '__main__': insert_label_for_autoformalization() # filtered()