"""Gradio demo for browsing Skill-Mix generations and their gradings.

The module loads generation/grading CSV records into a pandas DataFrame,
and builds a Gradio ``Blocks`` UI with cascading dropdowns (k -> topic+skills
-> model -> experiment -> grader -> grading run) that display the matching
generation and its grade.

NOTE(review): this file was reconstructed from a whitespace-collapsed copy.
Statement nesting was inferred from syntax; spots where the original
indentation could not be determined unambiguously are marked below.
"""

import csv
import os
import pickle
from functools import partial

import gradio as gr
import pandas as pd
from Levenshtein import distance


def _format_point(point):
    """Drop a trailing ``.0`` so that e.g. ``'1.0'`` is displayed as ``'1'``."""
    return point[:-2] if point[-2:] == '.0' else point


def _split_skills(cell):
    """Split a comma-separated skills cell into stripped skill names."""
    return [skill.strip() for skill in cell.split(',')]


def load_results(gen_file, grader_file, exp_name, model_name, grader):
    """Join one generation CSV with one grading CSV into display records.

    Column layout, as noted by the original author:

    generation file:
        ['k', 'skills', 'topic', 'system prompt', '[INST]_0', '[/INST]_0',
         'model_input_0', 'text_0', '[INST]_1', '[/INST]_1', 'model_input_1',
         'text_1']
    grading file:
        ['k', 'skills', 'topic', 'system prompt', 'user_0', 'assistant_0',
         'model_input_0', 'score_0', 'score_extracted_0', 'points_0',
         'num_sentences_manual_in_student_answer_0', 'true_sentence_lim_pt_0',
         'num_sentences_extracted_eq_num_sentences_model_0']

    Args:
        gen_file: Path to the generation ``records.csv``.
        grader_file: Path to the grading ``records.csv`` for the same run.
        exp_name: Experiment directory name. For llama models it is split on
            ``'_'``; field 0 is compared with ``'k2'`` and field 2 with
            ``'v8'``/``'v9'``.
        model_name: Model directory name.
        grader: Grading-run directory name; a name containing ``'gpt-4'`` is
            labelled as the ``gpt-4`` grader, anything else as
            ``llama-2-70b``.

    Returns:
        A list of dicts, one per data row. The list is empty when the
        llama experiment is filtered out or when the two files have a
        different number of rows.

    Raises:
        ValueError: If a row's skills or topic differ between the two files.
    """
    if 'llama' in model_name:
        fields = exp_name.split('_')
        version = fields[2]
        k = fields[0]
        # Only (k2, v9) and (non-k2, non-v9) llama experiments are kept.
        if k == 'k2' and version == 'v8':
            return []
        if k != 'k2' and version == 'v9':
            return []

    with open(gen_file, 'r') as file:
        gen = list(csv.reader(file))
    with open(grader_file, 'r') as file:
        grade = list(csv.reader(file))

    if len(gen) != len(grade):
        # The files cannot be aligned row by row; skip this pair.
        return []

    record = []
    # Row 0 is the header in both files.
    for row_idx, (gen_row, grade_row) in enumerate(zip(gen[1:], grade[1:]), start=1):
        skills = _split_skills(gen_row[1])
        topic = gen_row[2]
        # Explicit checks instead of ``assert`` so they survive ``python -O``.
        if skills != _split_skills(grade_row[1]):
            raise ValueError(
                f"skills mismatch at row {row_idx}: {gen_file} vs {grader_file}"
            )
        if topic != grade_row[2]:
            raise ValueError(
                f"topic mismatch at row {row_idx}: {gen_file} vs {grader_file}"
            )

        points = grade_row[9].split(',')
        if len(points) < len(skills):
            points = points + ['0.0'] * (len(skills) - len(points))
        # The last entry is replaced by the 'true_sentence_lim_pt_0' column.
        # NOTE(review): the original indentation was lost; this assignment is
        # assumed to be unconditional (not nested in the padding branch above)
        # - confirm against the upstream file.
        points[-1] = grade_row[11]

        # Zero the point of any skill whose bare name (text before the first
        # '(') literally appears in the final answer text.
        points_no_skill_name = points.copy()
        answer = gen_row[-1]
        for skill_id, skill in enumerate(skills):
            simple_skill = skill.split('(')[0].strip()
            if simple_skill in answer:
                points_no_skill_name[skill_id] = '0.0'

        gen_prompt = (
            gen_row[4]
            .split('examples for the concepts:')[1]
            .split('Please start the minimal natural')[0]
            .replace('\n', '\n\n')
        )
        record.append({
            'k': gen_row[0],
            'exp_name': exp_name,
            'model': model_name,
            'grader': 'gpt-4' if 'gpt-4' in grader else 'llama-2-70b',
            'grade_run': grader,
            'skills': '\n\n'.join(skills),
            'topic': topic,
            'topic+skills': '+'.join([topic] + sorted(skills)),
            'gen_prompt': gen_prompt,
            # Single newlines are doubled so Markdown renders paragraph breaks.
            'gen': gen_row[-3].replace('\n', '\n\n'),
            'grade': grade_row[5].replace('\n', '\n\n'),
            'points': ' '.join(_format_point(g) for g in points),
            'points_no_skill_name': ' '.join(
                _format_point(g) for g in points_no_skill_name
            ),
        })
    return record


def load_all_results(path='final'):
    """Collect records from every ``<exp>/<model>/graded/<grader>`` run.

    Expected layout under ``path``::

        <exp_name>/<model_name>/records.csv
        <exp_name>/<model_name>/graded/<grader>/records.csv

    Directory listings are sorted so the row order of the result is
    deterministic across platforms.

    Args:
        path: Root directory containing one sub-directory per experiment.

    Returns:
        A ``pandas.DataFrame`` with one row per record from `load_results`.
    """
    all_results = []
    for exp_name in sorted(os.listdir(path)):
        exp_dir = os.path.join(path, exp_name)
        if os.path.isfile(exp_dir):
            continue
        for model_name in sorted(os.listdir(exp_dir)):
            gen_file = os.path.join(exp_dir, model_name, "records.csv")
            graded_dir = os.path.join(exp_dir, model_name, 'graded')
            if not (os.path.exists(gen_file) and os.path.isdir(graded_dir)):
                continue
            for grader in sorted(os.listdir(graded_dir)):
                grader_file = os.path.join(graded_dir, grader, "records.csv")
                if os.path.exists(grader_file):
                    all_results += load_results(
                        gen_file, grader_file, exp_name, model_name, grader
                    )
    return pd.DataFrame(all_results)


# CSS for the Markdown panels; the ids are referenced via ``elem_id``.
block_css = """
#a {
    color: black;
    background-color: #DEEBF7;
    font-size: 20px;
}
#b {
    color: black;
    background-color: #E2F0D9;
    font-size: 20px;
}
#c {
    color: black;
    background-color: #FFF2CC;
    font-size: 20px;
}
#d {
    color: black;
    background-color: #FBE5D6;
    font-size: 20px;
}
"""


def best_match(comb, comb_list):
    """Return the entry of ``comb_list`` closest to ``comb``.

    Both sides are split on ``'+'`` and compared with the Levenshtein
    distance over the resulting token lists. Ties resolve to the earliest
    entry. An empty ``comb`` selects the first entry.
    """
    if comb == '':
        return comb_list[0]
    target = comb.split('+')
    # ``min`` returns the first minimal element, matching index-of-min.
    return min(comb_list, key=lambda other: distance(target, other.split('+')))


class Tracker:
    """Selection state and Gradio components for each UI slot.

    A "slot" is one independently browsable view. Slot 0 backs the
    single-generation tab and slots 1-2 back the two columns of the
    comparison tab. Slots 3-4 are reserved for a currently disabled
    "One Generation Two Grading" tab and stay unpopulated.

    Attributes:
        df: DataFrame of records (columns as produced by `load_results`).
        value: One state dict per slot (selected values, ``*_list`` choices
            and rendered Markdown strings).
        component: One dict per slot mapping a key to its Gradio component,
            or ``''`` while the component has not been created.
    """

    _NUM_SLOTS = 5
    # Dropdown-backed keys, in cascade order; these are the handler inputs.
    _SELECTOR_KEYS = ('k', 'comb', 'model', 'exp_name', 'grader', 'grader_run')
    # Every component of a slot, in handler-output order.
    _COMPONENT_KEYS = (
        'k', 'comb', 'model', 'exp_name', 'topic', 'skills', 'gen_prompt',
        'gen', 'grader', 'grader_run', 'points', 'grade',
    )
    _STATE_KEYS = (
        'k', 'k_list', 'comb', 'comb_list', 'model', 'model_list',
        'exp_name', 'exp_name_list', 'topic', 'skills', 'gen_prompt', 'gen',
        'grader', 'grader_list', 'grader_run', 'grader_run_list', 'points',
        'grade',
    )

    def __init__(self, df) -> None:
        self.df = df
        initial = self.update({key: '' for key in self._STATE_KEYS})
        self.value = [initial.copy() for _ in range(self._NUM_SLOTS)]
        self.component = [
            {key: '' for key in self._COMPONENT_KEYS}
            for _ in range(self._NUM_SLOTS)
        ]

    @staticmethod
    def _narrow(value, cdf, key, column, fallback=None):
        """Resolve ``value[key]`` against ``cdf[column]`` and filter on it.

        Stores the sorted unique choices in ``value[key + '_list']``. When
        the current selection is not among them it is replaced by
        ``fallback(current, choices)``, or by the first choice when no
        fallback is given. Returns ``cdf`` restricted to the selection.
        """
        choices = sorted(list(cdf[column].unique()))
        current = value[key]
        if current not in choices:
            current = choices[0] if fallback is None else fallback(current, choices)
        value[key] = current
        value[key + '_list'] = choices
        return cdf[cdf[column] == current]

    def update(self, value):
        """Make ``value`` consistent with the data and fill display fields.

        Walks the cascade k -> comb -> model -> exp_name -> grader ->
        grader_run, each step narrowing the rows seen by the next one.
        An invalid ``comb`` falls back to its closest match (see
        `best_match`); every other invalid selection falls back to the
        first available choice.

        Args:
            value: State dict holding at least the six selector keys. It is
                modified in place.

        Returns:
            The same ``value`` dict, with selections, ``*_list`` choices and
            the Markdown strings filled in.
        """
        cdf = self.df
        cdf = self._narrow(value, cdf, 'k', 'k')
        cdf = self._narrow(value, cdf, 'comb', 'topic+skills', fallback=best_match)
        cdf = self._narrow(value, cdf, 'model', 'model')
        cdf = self._narrow(value, cdf, 'exp_name', 'exp_name')

        # Generation-side fields are read before narrowing by grader.
        value['topic'] = "*Topic*: " + cdf['topic'].unique()[0]
        value['skills'] = "*Skills*: \n\n" + cdf['skills'].unique()[0]
        value['gen_prompt'] = (
            "*Skill Definition and Example*:\n\n" + cdf['gen_prompt'].unique()[0]
        )
        value['gen'] = "*Model Answer*:\n\n" + cdf['gen'].unique()[0]

        cdf = self._narrow(value, cdf, 'grader', 'grader')
        cdf = self._narrow(value, cdf, 'grader_run', 'grade_run')

        value['points'] = (
            "Points: " + cdf['points'].unique()[0]
            + "\n\n(After deducting points for explicitly mentioning skill names: "
            + cdf['points_no_skill_name'].unique()[0] + ")"
        )
        value['grade'] = cdf['grade'].unique()[0]
        return value

    def _locate(self, c):
        """Return ``(slot, key)`` of the first slot that registered ``c``."""
        for slot, components in enumerate(self.component):
            for key, component in components.items():
                if component is c:
                    return slot, key
        raise ValueError("component is not registered with this Tracker")

    def procedure(self, c):
        """Describe the handler chain to run when component ``c`` changes.

        Args:
            c: A Gradio component previously stored in ``self.component``.

        Returns:
            ``(input_list, output_list, fn_list)``: three parallel lists, one
            entry per chained step, suitable for
            ``c.change(fn, inputs, outputs).then(...)``.

        Raises:
            ValueError: If ``c`` is not registered in any slot.
        """
        input_list = []
        output_list = []
        fn_list = []

        # Cross-slot bindings, each entry being ``[key, [slot, ...]]``: a
        # change of ``key`` in one listed slot is mirrored to the others.
        # Currently disabled (no bindings).
        binding = []

        idx, key = self._locate(c)

        sync_list = []
        sync_component = []
        for bound_key, bound_slots in binding:
            if key == bound_key and idx in bound_slots:
                sync_list = [j for j in bound_slots if j != idx]
                sync_component = [self.component[j][key] for j in sync_list]

        def sync(v, sync_list=(0,)):
            # One update per mirrored dropdown.
            return [gr.Dropdown.update(value=v) for _ in sync_list]

        if sync_list:
            input_list.append(c)
            output_list.append(sync_component)
            # Bind a copy: the handler must emit exactly one update per
            # mirrored component, independent of later list manipulation.
            fn_list.append(partial(sync, sync_list=list(sync_list)))

        def update(k, comb, model, exp_name, grader, grader_run):
            value = {state_key: '' for state_key in self._STATE_KEYS}
            value.update(
                k=k, comb=comb, model=model, exp_name=exp_name,
                grader=grader, grader_run=grader_run,
            )
            value = self.update(value)
            # NOTE(review): ``gr.Dropdown.update`` is the per-component update
            # API this file was written against; newer Gradio releases may
            # require ``gr.update`` instead - confirm the pinned version.
            return [
                gr.Dropdown.update(value=value[out_key], choices=value[out_key + '_list'])
                if out_key in self._SELECTOR_KEYS else value[out_key]
                for out_key in self._COMPONENT_KEYS
            ]

        # Slots to refresh: the changed slot, any mirrored slots, and every
        # slot sharing the very same component object for ``key``.
        group = sync_list + [idx]
        update_list = [
            slot for slot in range(self._NUM_SLOTS)
            if any(self.component[j][key] is self.component[slot][key] for j in group)
        ]
        for slot in update_list:
            components = self.component[slot]
            input_list.append([components[name] for name in self._SELECTOR_KEYS])
            output_list.append([components[name] for name in self._COMPONENT_KEYS])
            fn_list.append(update)

        return input_list, output_list, fn_list


# NOTE(review): line breaks inside the two Markdown texts below were lost in
# the collapsed source and have been reconstructed - confirm the rendering.
_INTRO_MARKDOWN = """
# Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models

By [Princeton Language and Intelligence (PLI), Princeton University](https://pli.princeton.edu/) and [Google DeepMind](https://www.deepmind.com/)

### This is a demonstration of the Skill-Mix evaluation.

Paper link: [https://arxiv.org/abs/2310.17567](https://arxiv.org/abs/2310.17567)

### Samples are generated using 10% of the full set of skills and topics. Click the second tab for comparison between two generations.

Coming soon: generation by more models; grading by LLaMA-2.
"""

_CITATIONS_MARKDOWN = '''### Citations
```
@article{yu2023skillmix,
    title={Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models},
    author={Yu, Dingli and Kaur, Simran and Gupta, Arushi and Brown-Cohen, Jonah and Goyal, Anirudh and Arora, Sanjeev},
    journal={arXiv preprint arXiv:2310.17567},
    year={2023}
}
```
```
@misc{openai2023gpt4,
    title={GPT-4 Technical Report},
    author={OpenAI},
    year={2023},
    eprint={2303.08774},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```
```
@article{touvron2023llama,
    title={Llama 2: Open foundation and fine-tuned chat models},
    author={Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
    journal={arXiv preprint arXiv:2307.09288},
    year={2023}
}
```
'''

# CSS id (see ``block_css``) used for each Markdown panel.
_MARKDOWN_ELEM_ID = {
    'topic': 'a',
    'skills': 'a',
    'gen_prompt': 'a',
    'gen': 'b',
    'points': 'c',
    'grade': 'd',
}


def _add_dropdown(tracker, slots, key, state):
    """Create the dropdown for ``key`` and register it in every given slot."""
    label = "topic+skills" if key == 'comb' else key
    dropdown = gr.Dropdown(choices=state[key + '_list'], value=state[key], label=label)
    for slot in slots:
        tracker.component[slot][key] = dropdown
    return dropdown


def _add_markdown(tracker, slots, key, state):
    """Create the Markdown panel for ``key`` and register it in every slot."""
    panel = gr.Markdown(value=state[key], elem_id=_MARKDOWN_ELEM_ID[key])
    for slot in slots:
        tracker.component[slot][key] = panel
    return panel


def build_demo(df):
    """Build the Skill-Mix browsing UI.

    NOTE(review): the Row/Column nesting of some panels (e.g. whether the
    answer panel sits inside the topic/skills row) was ambiguous in the
    collapsed source; only the visual layout is affected - confirm.

    Args:
        df: DataFrame of records as returned by `load_all_results`.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    tracker = Tracker(df)

    with gr.Blocks(
        title="Skill-Mix: a Flexible and Expandable Family of Evaluations for AI models",
        theme=gr.themes.Base(text_size=gr.themes.sizes.text_lg),
        css=block_css,
    ) as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Tab('Browse Single Generation'):
            v = tracker.value[0]
            with gr.Row():
                _add_dropdown(tracker, [0], 'k', v)
                _add_dropdown(tracker, [0], 'comb', v)
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        _add_dropdown(tracker, [0], 'model', v)
                        _add_dropdown(tracker, [0], 'exp_name', v)
                    with gr.Row():
                        _add_markdown(tracker, [0], 'topic', v)
                        _add_markdown(tracker, [0], 'skills', v)
                    _add_markdown(tracker, [0], 'gen', v)
                    _add_markdown(tracker, [0], 'gen_prompt', v)
                with gr.Column():
                    with gr.Row():
                        _add_dropdown(tracker, [0], 'grader', v)
                        _add_dropdown(tracker, [0], 'grader_run', v)
                    _add_markdown(tracker, [0], 'points', v)
                    _add_markdown(tracker, [0], 'grade', v)

        with gr.Tab('Compare Two Generations'):
            v = tracker.value[1]
            with gr.Row():
                # Both columns share the same k / topic+skills dropdowns.
                _add_dropdown(tracker, [1, 2], 'k', v)
                _add_dropdown(tracker, [1, 2], 'comb', v)
            with gr.Row():
                for col in range(1, 3):
                    v = tracker.value[col]
                    with gr.Column():
                        with gr.Row():
                            _add_dropdown(tracker, [col], 'model', v)
                            _add_dropdown(tracker, [col], 'exp_name', v)
                        with gr.Row():
                            _add_markdown(tracker, [col], 'topic', v)
                            _add_markdown(tracker, [col], 'skills', v)
                        _add_markdown(tracker, [col], 'gen', v)
                        with gr.Row():
                            _add_dropdown(tracker, [col], 'grader', v)
                            _add_dropdown(tracker, [col], 'grader_run', v)
                        _add_markdown(tracker, [col], 'points', v)
                        _add_markdown(tracker, [col], 'gen_prompt', v)
                        _add_markdown(tracker, [col], 'grade', v)

        # Slots 3 and 4 (a "One Generation Two Grading" tab) are disabled, so
        # their entries remain '' and are filtered out here.
        registered = [
            c
            for slot in range(len(tracker.component))
            for c in tracker.component[slot].values()
            if c != ''
        ]
        # De-duplicate shared components while keeping creation order so
        # handlers are registered deterministically.
        all_components = list(dict.fromkeys(registered))

        for c in all_components:
            input_list, output_list, fn_list = tracker.procedure(c)
            if not fn_list:
                continue
            steps = zip(fn_list, input_list, output_list)
            fn, inputs, outputs = next(steps)
            event = c.change(fn, inputs, outputs)
            # Chain any remaining steps so they run sequentially.
            for fn, inputs, outputs in steps:
                event = event.then(fn, inputs, outputs)

        gr.Markdown(_CITATIONS_MARKDOWN)

    return demo


if __name__ == '__main__':
    # To regenerate the cached DataFrame from raw records:
    #   df = load_all_results(path="../../on_released_topics_and_skills")
    #   with open('on_released_topics_and_skills.pkl', 'wb') as f:
    #       pickle.dump(df, f)
    #
    # SECURITY: unpickling executes arbitrary code; only load a pickle file
    # that was produced locally by the snippet above.
    with open('on_released_topics_and_skills.pkl', 'rb') as f:
        df = pickle.load(f)
    demo = build_demo(df)
    # demo.launch(share=True) exposes a public link instead.
    demo.launch()