"""Utils for data load, save, and process (e.g., prompt construction)"""

import ast
import json
import os
import re

try:
    import yaml
except ImportError:  # PyYAML is only needed by load_yaml(); keep the rest usable.
    yaml = None


# Top-level domain -> list of subject names belonging to that domain.
DOMAIN_CAT2SUB_CAT = {
    'Art and Design': ['Art', 'Art_Theory', 'Design', 'Music'],
    'Business': ['Accounting', 'Economics', 'Finance', 'Manage', 'Marketing'],
    'Science': ['Biology', 'Chemistry', 'Geography', 'Math', 'Physics'],
    'Health and Medicine': [
        'Basic_Medical_Science',
        'Clinical_Medicine',
        'Diagnostics_and_Laboratory_Medicine',
        'Pharmacy',
        'Public_Health',
    ],
    'Humanities and Social Science': [
        'History',
        'Literature',
        'Sociology',
        'Psychology',
    ],
    'Tech and Engineering': [
        'Agriculture',
        'Architecture_and_Engineering',
        'Computer_Science',
        'Electronics',
        'Energy_and_Power',
        'Materials',
        'Mechanical_Engineering',
    ],
}

# Short subject code -> full subject name (as used in DOMAIN_CAT2SUB_CAT).
CAT_SHORT2LONG = {
    'acc': 'Accounting',
    'agri': 'Agriculture',
    'arch': 'Architecture_and_Engineering',
    'art': 'Art',
    'art_theory': 'Art_Theory',
    'bas_med': 'Basic_Medical_Science',
    'bio': 'Biology',
    'chem': 'Chemistry',
    'cli_med': 'Clinical_Medicine',
    'cs': 'Computer_Science',
    'design': 'Design',
    'diag_med': 'Diagnostics_and_Laboratory_Medicine',
    'econ': 'Economics',
    'elec': 'Electronics',
    'ep': 'Energy_and_Power',
    'fin': 'Finance',
    'geo': 'Geography',
    'his': 'History',
    'liter': 'Literature',
    'manage': 'Manage',
    'mark': 'Marketing',
    'mate': 'Materials',
    'math': 'Math',
    'mech': 'Mechanical_Engineering',
    'music': 'Music',
    'phar': 'Pharmacy',
    'phys': 'Physics',
    'psy': 'Psychology',
    'pub_health': 'Public_Health',
    'socio': 'Sociology',
}

# Matches an inline image reference of the form <img='path'> and captures the path.
# NOTE(review): the pattern in the reviewed source was an empty string, which
# matches at every position and only ever yields empty strings. It has been
# restored to the tag form below -- confirm against the dataset's option markup.
_IMG_TAG_RE = re.compile(r"<img='(.*?)'>")


# DATA SAVING
def save_json(filename, ds):
    """Write ``ds`` to ``filename`` as JSON indented by four spaces."""
    with open(filename, 'w') as f:
        json.dump(ds, f, indent=4)


def get_multi_choice_info(options):
    """
    Given the list of options for multiple choice question
    Return the index2ans and all_choices

    Choices are lettered consecutively starting at 'A', in the order the
    options are given.
    """
    all_choices = [chr(ord('A') + i) for i in range(len(options))]
    index2ans = dict(zip(all_choices, options))
    return index2ans, all_choices


def load_yaml(file_path):
    """Load a YAML file with ``yaml.safe_load`` and return the parsed content.

    Raises:
        ImportError: if PyYAML is not installed.
        yaml.YAMLError: if the file cannot be parsed. The error is printed and
            then re-raised; previously the function fell through to returning
            an unbound variable and failed with ``UnboundLocalError``.
    """
    if yaml is None:
        raise ImportError('PyYAML is required to use load_yaml()')
    with open(file_path, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise


def parse_img_path(text):
    """Return every image path referenced in ``text`` via ``<img='path'>`` tags.

    The result is a list of the captured paths in order of appearance; it is
    empty when ``text`` contains no such tag.
    """
    return _IMG_TAG_RE.findall(text)


def process_single_sample(data):
    """Reduce a raw sample dict to the fields used downstream.

    The returned dict has the keys ``id``, ``question``, ``options``,
    ``answer``, ``image`` and ``question_type``. ``image`` is ``None`` when
    the options reference more than one image, otherwise ``data['image_1']``.

    NOTE(review): ``data['options']`` is iterated element by element. If it is
    a stringified list (the form ``construct_prompt`` parses), this walks over
    single characters and no image tag can match -- confirm what callers pass.
    """
    o_imgs_paths = []
    for option in data['options']:
        o_imgs_paths.extend(parse_img_path(option))

    # Multiple images in options, used for random selection: no single image
    # applies. 'image_1' is only read in the other case, so it may be absent here.
    image = None if len(o_imgs_paths) > 1 else data['image_1']

    return {
        'id': data['id'],
        'question': data['question'],
        'options': data['options'],
        'answer': data['answer'],
        'image': image,
        'question_type': data['question_type'],
    }


# DATA SAVING
def save_jsonl(filename, data):
    """
    Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.

    Each entry becomes one line holding a single-key JSON object. Non-ASCII
    characters are written as-is (UTF-8), not escaped.

    Args:
        filename (str): The path to the file where the data should be saved.
        data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for img_path, caption in data.items():
            # Only the base name of the image path is used as the key.
            base_filename = os.path.basename(img_path)
            json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
            f.write(json_record + '\n')


def save_args(args, path_dir):
    """Dump the attributes of ``args`` to ``<path_dir>setting.txt``.

    One ``name : value`` line is written per attribute, between a start and an
    end banner.

    NOTE(review): the target path is built by plain string concatenation, so
    ``path_dir`` must already end with a path separator (or be an intended
    filename prefix). Kept as-is for backward compatibility.
    """
    args_dict = vars(args)
    with open(path_dir + 'setting.txt', 'w') as f:
        f.write('------------------ start ------------------' + '\n')
        for arg_name, value in args_dict.items():
            f.write(arg_name + ' : ' + str(value) + '\n')
        f.write('------------------- end -------------------')


# DATA PROCESSING
def _parse_options(raw_options):
    """Turn the stored options into a Python object without executing code."""
    if isinstance(raw_options, str):
        # SECURITY: this used to be eval(), which executes arbitrary code found
        # in the dataset. literal_eval only accepts Python literals and raises
        # ValueError/SyntaxError for anything else.
        return ast.literal_eval(raw_options)
    # Already-parsed options (e.g. a list) are accepted unchanged.
    return raw_options


def construct_prompt(sample, config):
    """Build the prompt and bookkeeping fields for one sample.

    Args:
        sample (dict): Must provide ``question``, ``options`` (a list, or the
            string form of a Python list literal), ``answer`` and
            ``question_type``.
        config (dict): Must provide ``task_instructions`` plus the format
            template for the question type: ``multi_choice_example_format``
            (two positional fields: question, lettered options) or
            ``short_ans_example_format`` (one positional field: question).

    Returns:
        dict: ``empty_prompt``, ``final_input_prompt`` and ``gt_content`` for
        every sample; additionally ``index2ans``, ``correct_choice`` and
        ``all_choices`` for multiple-choice samples. All keys of ``sample``
        are merged in last, so they win on a name clash.

    Raises:
        ValueError, SyntaxError: if ``options`` is a string that is not a
            valid Python literal.
    """
    question = sample['question']
    options = _parse_options(sample['options'])
    res_dict = {}

    if sample['question_type'] == 'multiple-choice':
        all_choices = []
        index2ans = {}
        option_lines = []
        for offset, option in enumerate(options):
            letter = chr(ord('A') + offset)
            all_choices.append(letter)
            index2ans[letter] = option
            option_lines.append(f"({letter}) {option}\n")
        example = "".join(option_lines)

        empty_prompt = config['multi_choice_example_format'].format(question, example)
        res_dict['index2ans'] = index2ans
        res_dict['correct_choice'] = sample['answer']
        res_dict['all_choices'] = all_choices
        # The answer letter is mapped back to the option text it designates.
        gt_content = options[ord(sample['answer'].upper()) - ord('A')]
    else:
        empty_prompt = config['short_ans_example_format'].format(question)
        gt_content = sample['answer']

    res_dict['empty_prompt'] = empty_prompt
    if config['task_instructions']:
        res_dict['final_input_prompt'] = (
            config['task_instructions'].strip() + '\n\n' + empty_prompt
        )
    else:
        res_dict['final_input_prompt'] = empty_prompt
    res_dict['gt_content'] = gt_content

    res_dict.update(sample)
    return res_dict