File size: 3,284 Bytes
abedf13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91143ec
abedf13
 
91143ec
abedf13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07a63f0
 
 
 
 
 
abedf13
 
 
 
 
 
 
 
91143ec
abedf13
91143ec
abedf13
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import random
import numpy as np
import os
import json
from Config import *
import pandas as pd

def format_card_str(card):
    """Render every criterion of a card as a bullet-list text entry.

    Args:
        card: Mapping from criterion name to either a plain string or a dict
            with the keys ``overview``, ``thinking_pattern``, ``strength``
            and ``weakness``.

    Returns:
        A list with one formatted string per criterion, in the mapping's
        iteration order. Every string ends with a newline.

    Raises:
        ValueError: If a value is neither a ``str`` nor a ``dict``.
        KeyError: If a dict value lacks one of the expected keys.
    """
    entries = []
    for name, value in card.items():
        if isinstance(value, str):
            entries.append(f'- {name}: {value}\n')
            continue
        if not isinstance(value, dict):
            raise ValueError(f'Unknown type: {type(value)}')

        parts = [f"- {name}: {value['overview']}\n"]
        # r += f"- {k}:\n"
        details = (value['thinking_pattern'], value['strength'], value['weakness'])
        # When no detail field is filled in, only the sub-bullets are omitted.
        # The overview line is still emitted; previously the whole criterion
        # was dropped after its overview had already been formatted.
        if any(details):
            parts.append(f"    - Thinking Patterns: {value['thinking_pattern']}\n")
            parts.append(f"    - Strength: {value['strength']}\n")
            parts.append(f"    - Weakness: {value['weakness']}\n")

        entries.append(''.join(parts))
    return entries

def format_qa_entry(qa):
    """Build the display text for one multiple-choice question.

    Args:
        qa: Mapping with the keys ``question`` (text), ``choices`` (sequence
            of option texts) and ``ground truth`` (zero-based index of the
            correct option).

    Returns:
        The question, the lettered options (``A.``, ``B.``, ...) one per line,
        and a ``Ground Truth: <letter>`` line, separated by blank lines.
    """
    prompt = qa['question']
    options = qa['choices']
    answer_index = qa['ground truth']

    # Option positions 0..n map onto the letters A..Z (65 is ord('A')).
    option_block = '\n'.join(
        f"{chr(65 + position)}. {text}" for position, text in enumerate(options)
    )
    answer_line = f'Ground Truth: {chr(65 + answer_index)}'

    return '\n\n'.join([prompt, option_block, answer_line])


def sample_random_entry(dataset='', topic='', model='', n=1):
    """Sample one card and one QA entry and package them for display.

    Any selector left as the empty string is chosen at random from
    ``DATASETS``, ``TOPICS[dataset]`` or ``MODELS`` (presumably provided by
    the ``Config`` star import -- confirm there).

    Args:
        dataset: Dataset name, or ``''`` to pick one at random.
        topic: Topic name, or ``''`` to pick one at random for the dataset.
        model: Model name, or ``''`` to pick one at random.
        n: Accepted but not used by this function.

    Returns:
        A ``(display_dict, info_dict)`` tuple as produced by
        ``process_for_display``, with the sampled row index stored under
        ``info_dict['index']``.
    """
    # The dataset must be resolved first: the topic pool is looked up by it.
    dataset = random.choice(DATASETS) if dataset == '' else dataset
    topic = random.choice(TOPICS[dataset]) if topic == '' else topic
    model = random.choice(MODELS) if model == '' else model

    cards = sample_card(dataset, topic, model)
    qa_record, row_index = sample_QA_entry(dataset, topic, model)

    shown, meta = process_for_display(cards, qa_record)
    meta['index'] = row_index
    return shown, meta


def process_for_display(card_lst, qa):
    """Split a sampled card/QA pair into display text and leftover metadata.

    Args:
        card_lst: List of formatted card entries.
        qa: Mapping describing one QA record; must contain ``question`` and
            ``choices`` in addition to what ``format_qa_entry`` reads.

    Returns:
        A ``(display_dict, info_dict)`` tuple. ``display_dict`` has the keys
        ``card`` (text from ``select_entry``) and ``qa`` (text from
        ``format_qa_entry``). ``info_dict`` is a shallow copy of ``qa``
        without the ``question`` and ``choices`` keys.

    Raises:
        KeyError: If ``qa`` lacks ``question`` or ``choices``.
    """
    qa_text = format_qa_entry(qa)
    display_dict = {
        'card': select_entry(qa_text, card_lst),
        'qa': qa_text,
    }

    # Copy first so the caller's record is left untouched.
    info_dict = dict(qa)
    for shown_key in ('question', 'choices'):
        del info_dict[shown_key]

    return display_dict, info_dict
    


def select_entry(qa_entry, card_lst):
    """Return the card text to show alongside a QA entry.

    Args:
        qa_entry: Formatted QA text; currently ignored.
        card_lst: List of formatted card entries.

    Returns:
        The first two entries of ``card_lst`` (fewer if the list is shorter)
        joined with a newline.
    """
    # TODO: Automatically select the most relevant criterion.
    # Placeholder: no relevance ranking yet, just take the leading entries.
    leading_entries = card_lst[:2]
    return '\n'.join(leading_entries)


def sample_card(dataset='', topic='', model='', card_cnt=2):
    """Load one randomly chosen card file and format its criteria.

    Args:
        dataset: Dataset directory name under ``dataset/``.
        topic: Topic name; used for both the sub-directory and the file name.
        model: Model name embedded in the file name.
        card_cnt: Number of card files to choose between; the index is drawn
            uniformly from ``0`` to ``card_cnt - 1``.

    Returns:
        The list of formatted entries produced by ``format_card_str``.

    Raises:
        FileNotFoundError: If the selected card file does not exist.
        json.JSONDecodeError: If the file is not a single valid JSON document.
    """
    card_index = random.randint(0, card_cnt - 1)
    path = f'dataset/{dataset}/cards/{topic}/{topic}_{model}_{card_index}.jsonl'

    # Despite the .jsonl suffix the file is parsed as ONE JSON document.
    # The encoding is pinned to UTF-8 so decoding does not depend on the
    # platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return format_card_str(data)
   

def sample_QA_entry(dataset='', topic='', model='', n=1):
    """Pick one random QA record for ``model`` from a topic's test file.

    Args:
        dataset: Dataset directory name under ``dataset/``.
        topic: Topic name; used for both the sub-directory and the file name.
        model: Value the record's ``model`` field must equal.
        n: Accepted but not used; exactly one record is sampled.

    Returns:
        A ``(sample, sample_idx)`` tuple: the chosen record as a dict and its
        row label in the DataFrame built from the file (the position of the
        record among the file's non-blank lines).

    Raises:
        FileNotFoundError: If the test file does not exist.
        KeyError: If the records have no ``model`` field.
        ValueError: If no record matches ``model``.
    """
    path = f'dataset/{dataset}/{topic}/{topic}_test.jsonl'

    # JSON Lines: one JSON document per line. Iterate the file directly
    # instead of materialising every line, skip blank lines (a stray empty
    # line would otherwise crash json.loads), and pin the encoding so
    # decoding does not depend on the platform's locale default.
    with open(path, 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(records)

    # Keep only the rows produced by the requested model.
    matches = df[df['model'] == model]
    if matches.empty:
        # DataFrame.sample would also raise ValueError here, but with a
        # message that says nothing about the file or the model.
        raise ValueError(f'No QA entries for model {model!r} in {path}')

    picked = matches.sample(1)
    sample_idx = picked.index[0]
    sample = picked.to_dict(orient='records')[0]
    return sample, sample_idx

if __name__ == '__main__':
    # Manual smoke run when executed as a script: sample one entry with random
    # dataset/topic/model. The result is discarded, and `n` is accepted by
    # sample_random_entry but not used there.
    sample_random_entry(n=5)