"""Sample a model card and a QA entry from the on-disk dataset and format them for display."""

import random
import numpy as np
import os
import json
from Config import *  # presumably supplies DATASETS, TOPICS and MODELS used below
import pandas as pd


def format_card_str(card):
    """Render every criterion of a card as a bullet-point string.

    Args:
        card: Mapping of criterion name to either a plain string or a dict
            holding the keys ``overview``, ``thinking_pattern``, ``strength``
            and ``weakness``.

    Returns:
        A list with one formatted string per rendered criterion, in the
        mapping's iteration order. Despite the function name, the result is
        a list, not a single string.

    Raises:
        ValueError: If a value is neither a ``str`` nor a ``dict``.
    """
    entries = []
    for name, value in card.items():
        if isinstance(value, str):
            entry = f'- {name}: {value}\n'
        elif isinstance(value, dict):
            entry = f"- {name}: {value['overview']}\n"
            # NOTE(review): a criterion whose three detail fields are all
            # empty is dropped entirely, including its overview line.
            # Confirm this is intended rather than emitting overview only.
            if value['thinking_pattern'] + value['strength'] + value['weakness'] == '':
                continue
            entry += f" - Thinking Patterns: {value['thinking_pattern']}\n"
            entry += f" - Strength: {value['strength']}\n"
            entry += f" - Weakness: {value['weakness']}\n"
        else:
            raise ValueError(f'Unknown type: {type(value)}')
        entries.append(entry)
    return entries


def format_qa_entry(qa):
    """Build the display text for a multiple-choice question.

    Args:
        qa: Mapping with the keys ``question`` (str), ``choices`` (sequence)
            and ``ground truth`` (0-based index of the correct choice).

    Returns:
        The question, the lettered choices (``A.``, ``B.``, ...) and a
        ``Ground Truth: <letter>`` line, separated by blank lines.
    """
    question = qa['question']
    choices = qa['choices']
    ground_truth = qa['ground truth']

    # Choice positions 0..n-1 are shown as the letters A, B, C, ...
    choice_str = '\n'.join(
        f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)
    )

    return (
        question
        + '\n\n'
        + choice_str
        + '\n\n'
        + f'Ground Truth: {chr(65 + ground_truth)}'
    )


def sample_random_entry(dataset='', topic='', model='', n=1):
    """Sample one card and one QA entry and prepare them for display.

    Any of ``dataset``, ``topic`` or ``model`` left as ``''`` is chosen at
    random from ``DATASETS``, ``TOPICS[dataset]`` and ``MODELS`` respectively.

    Args:
        dataset: Dataset name, or ``''`` for a random one.
        topic: Topic name, or ``''`` for a random one.
        model: Model name, or ``''`` for a random one.
        n: Accepted for interface compatibility; currently unused, a single
            entry is always sampled.

    Returns:
        ``(display_dict, info_dict)`` as produced by ``process_for_display``,
        with ``info_dict['index']`` set to the sampled row's index.
    """
    if dataset == '':
        dataset = random.choice(DATASETS)
    if topic == '':
        topic = random.choice(TOPICS[dataset])
    if model == '':
        model = random.choice(MODELS)

    card_lst = sample_card(dataset, topic, model)
    qa, index = sample_QA_entry(dataset, topic, model)

    display_dict, info_dict = process_for_display(card_lst, qa)
    info_dict['index'] = index
    return display_dict, info_dict


def process_for_display(card_lst, qa):
    """Split a sampled card/QA pair into display text and metadata.

    Args:
        card_lst: List of formatted card entries (see ``format_card_str``).
        qa: QA record mapping; it is not modified.

    Returns:
        ``(display_dict, info_dict)`` where ``display_dict`` has the keys
        ``card`` and ``qa`` (both strings) and ``info_dict`` is a shallow
        copy of ``qa`` without the ``question`` and ``choices`` keys.
    """
    qa_entry = format_qa_entry(qa)
    display_dict = {
        'card': select_entry(qa_entry, card_lst),
        'qa': qa_entry,
    }

    # Copy first so the caller's record keeps its question and choices.
    info_dict = {**qa}
    info_dict.pop('question')
    info_dict.pop('choices')
    return display_dict, info_dict


def select_entry(qa_entry, card_lst):
    """Pick the card entries to show alongside a QA entry.

    Args:
        qa_entry: Formatted QA text; currently unused.
        card_lst: List of formatted card entries.

    Returns:
        The first two card entries joined with a newline.
    """
    # TODO: Automatically select the most relevant criterion.
    # Placeholder: no relevance ranking yet, return the first two entries.
    return '\n'.join(card_lst[:2])


def sample_card(dataset='', topic='', model='', card_cnt=2):
    """Load one randomly chosen card file and format it.

    Args:
        dataset: Dataset directory name under ``dataset/``.
        topic: Topic name, used for both the directory and the file name.
        model: Model name embedded in the file name.
        card_cnt: Number of card files available; the index is drawn
            uniformly from ``0 .. card_cnt - 1``.

    Returns:
        The list of formatted card entries from ``format_card_str``.
    """
    card_index = random.randint(0, card_cnt - 1)
    path = f'dataset/{dataset}/cards/{topic}/{topic}_{model}_{card_index}.jsonl'

    # Despite the .jsonl extension the file is parsed as one JSON document.
    # JSON text is UTF-8, so do not depend on the locale's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return format_card_str(data)


def sample_QA_entry(dataset='', topic='', model='', n=1):
    """Sample one QA record for ``model`` from the topic's test set.

    Args:
        dataset: Dataset directory name under ``dataset/``.
        topic: Topic name, used for both the directory and the file name.
        model: Only records whose ``model`` field equals this are eligible.
        n: Accepted for interface compatibility; currently unused, exactly
            one record is sampled.

    Returns:
        ``(sample, sample_idx)`` where ``sample`` is the record as a dict and
        ``sample_idx`` is its row index in the loaded file.

    Raises:
        ValueError: If the file contains no record for ``model``.
    """
    path = f'dataset/{dataset}/{topic}/{topic}_test.jsonl'

    # One JSON object per line; blank lines are skipped so that stray empty
    # lines do not abort the load with a JSONDecodeError.
    with open(path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    df = df[df['model'] == model]
    if df.empty:
        # DataFrame.sample would raise a ValueError here anyway; raise it
        # with a message that names the actual problem.
        raise ValueError(f'No QA entries found for model {model!r} in {path}')

    sample = df.sample(1)
    sample_idx = sample.index[0]
    sample = sample.to_dict(orient='records')[0]
    return sample, sample_idx


if __name__ == '__main__':
    sample_random_entry(n=5)