import argparse
import copy
import os
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd

# Project-local dependency; only the CLI entry point (outside this chunk) needs it.
try:
    from data_preprocessor import DataPreprocessor
except ImportError:  # allows importing this module without the project package
    DataPreprocessor = None


def prepare_pretraining_files(data_processor, options):
    """Write pretraining step sequences for the "ratio_proportion_change3" workspace.

    Iterates log chunks from ``data_processor.load_file_iterator()`` and, for
    each (student, problem) group that was completed (contains a "Done"
    action) and has at least 4 distinct non-opt steps, emits one
    tab-separated line of consecutive-duplicate-free step names.  Sequences
    are split randomly: proba <= 0.8 -> train, proba > 0.9 -> test, else val.

    The companion info line per sequence is:
    progress, problem, student, auto_complete flag, length,
    ME flag (1 = means-and-extremes detected), outcome seq, help-level seq,
    opt-step encoding.

    Args:
        data_processor: object exposing ``load_file_iterator()`` yielding
            pandas DataFrame chunks of the raw tutor log.
        options: namespace providing the six output paths
            (``train/val/test`` x ``file/info``) plus ``opt_step1`` and
            ``opt_step2`` step-name lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            # NOTE(review): workspace name is hard-coded rather than taken from
            # options.workspace_name — confirm this is intentional.
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem counts as completed only when the student clicked Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        # Events arriving < 1800 s after the previous one mark
                        # timestamps of potential auto-completed opt steps.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        step_names_token = []
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'CF (Etalon)', 'Outcome',
                                                       'Help Level', 'CF (Workspace Progress Status)']].iterrows():
                            step = row["Step Name"]
                            progress = row["CF (Workspace Progress Status)"]
                            etalon = row["CF (Etalon)"]

                            if pd.isna(step):
                                continue

                            # A float (non-integer) etalon on an opt-step-1 row marks the
                            # "means and extremes" strategy.
                            if step in options.opt_step1 and not means_and_extremes:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass

                            # Opt steps at a "too fast" timestamp are treated as
                            # auto-completed by the tutor and skipped.
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                # Collapse immediate repeats; concatenate their outcomes.
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))

                        # Keep only sequences with more than 4 distinct non-opt steps.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            info_line = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                f"{1 if means_and_extremes else 0}",
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                            ])

                            proba = random.random()
                            if proba <= 0.8:
                                out_file, out_info = train_file, train_info
                            elif proba > 0.9:
                                out_file, out_info = test_file, test_info
                            else:
                                out_file, out_info = val_file, val_info
                            out_file.write("\t".join(step_names_token))
                            out_file.write("\n")
                            out_info.write(info_line)
                            out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_pretraining_files(data_processor, options):
    """Write per-school pretraining step sequences with a random train/val/test split.

    For every (school, class, student, problem) group, builds
    ``step:action:attempt`` tokens from non-autofilled rows and keeps only
    "strategy" sequences: an opt-step-1/2 start token followed later by
    another opt step of the same family, with the final token carrying a
    "Done" action.  Kept tokens are stripped back to bare step names,
    consecutive duplicates collapsed, and written tab-separated.
    Split: proba <= 0.8 -> train, proba > 0.9 -> test, else val.

    Info line per sequence: school, class, student, progress, problem,
    scenario, ME flag, length, opt-step encoding.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``
            yielding pandas DataFrame chunks.
        options: namespace with the six output paths,
            ``opt_step1``/``opt_step2`` step lists, and an optional
            ``school`` collection filter (falsy = keep all schools).
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                for student, student_groups in class_group.groupby("Anon Student Id"):
                    # BUG FIX: the original called sort_values(by="Time") and
                    # discarded the result, so rows were never time-ordered.
                    student_groups = student_groups.sort_values(by="Time")
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step',
                                                       'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            progress = row["CF (Workspace Progress Status)"]
                            action = row["Action"]
                            attempt = row["Attempt At Step"]
                            autofilled = row["CF (Is Autofilled)"]
                            step = row["Step Name"]
                            scenario = row['CF (Problem Scenario Tags)']

                            if pd.isna(step):
                                continue

                            # Etalon looks like "{key=value}"; a float value marks
                            # the means-and-extremes strategy.
                            if step in options.opt_step1 and not means_and_extremes:
                                etalon = row["CF (Etalon)"]
                                if not pd.isna(etalon):
                                    etalon = etalon.strip('{}')
                                    key, value = etalon.split('=')
                                    etalon = value
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass

                            if not autofilled:
                                step_names_token.append(f"{step}:{action}:{attempt}")

                        if not step_names_token:
                            continue

                        where_opt = []
                        step1 = False
                        step2 = False
                        strategy_data = False
                        for step_oh in step_names_token:
                            parts = step_oh.split(":")
                            # Only the last two fields are action/attempt; step names
                            # themselves may contain one ':'.
                            if len(parts) == 3:
                                step = parts[0]
                            else:
                                step = ":".join(parts[:2])

                            if step == options.opt_step1[0]:
                                where_opt.append("_1")
                                step1 = True
                            elif step == options.opt_step2[0]:
                                where_opt.append("_2")
                                step2 = True
                            elif step in options.opt_step1[1:]:
                                where_opt.append("1")
                                if step1:
                                    strategy_data = True
                            elif step in options.opt_step2[1:]:
                                where_opt.append("2")
                                if step2:
                                    strategy_data = True
                            else:
                                where_opt.append("0")

                        # The sequence only counts if it ends with a Done action.
                        if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                            strategy_data = False

                        if not strategy_data:
                            continue

                        proba = random.random()
                        # Strip ":action:attempt" and collapse consecutive duplicates.
                        stripped = [":".join(s.split(":")[:-2]) for s in step_names_token]
                        step_names_token = []
                        for s in stripped:
                            if s != "nan" and (not step_names_token or s != step_names_token[-1]):
                                step_names_token.append(s)

                        info_line = ",".join([
                            str(school), str(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, where_opt)),
                        ])
                        if proba <= 0.8:
                            out_file, out_info = train_file, train_info
                        elif proba > 0.9:
                            out_file, out_info = test_file, test_info
                        else:
                            out_file, out_info = val_file, val_info
                        out_file.write("\t".join(step_names_token))
                        out_file.write("\n")
                        out_info.write(info_line)
                        out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_coded_pretraining_files(data_processor, options):
    """Write pretraining sequences with error/hint-coded step tokens.

    Rows are filtered to first encounters (`CF (Is StepByStep)` False,
    `CF (Encounter)` 0, `CF (Is Review Mode)` -1) and, per student, to the
    later half of problems whose status is GRADUATED.  Each kept non-opt step
    is coded as ``step-2`` (failed attempt), ``step-1`` (hint) or ``step-0``
    (otherwise); opt steps keep their bare name.  Sequences are split
    randomly (<= 0.8 train, > 0.9 test, else val).

    Info line: school, classes, student, progress, problem, scenario, ME
    flag, length, original "step-action-attempt-helplevel-outcome" tokens.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``.
        options: namespace with output paths, ``opt_step1``/``opt_step2``
            lists and an optional ``school`` filter.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            print(f"{school} : {school_group.shape}")
            school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                        (school_group['CF (Encounter)'] == 0) &
                                        (school_group['CF (Is Review Mode)'] == -1)]
            print(f"{school} : {school_group.shape}")
            for student, student_groups in school_group.groupby("Anon Student Id"):
                student_groups.sort_values(by="Time", inplace=True)
                # Later half of this student's GRADUATED problem rows (all of
                # them when there are fewer than two such rows).
                prob_list = list(student_groups[student_groups["CF (Workspace Progress Status)"] == "GRADUATED"]["Problem Name"])
                prob_list = prob_list[-int(len(prob_list) / 2):]
                for prob, prob_groups in student_groups.groupby("Problem Name"):
                    if prob not in prob_list:
                        continue
                    progress = list(pd.unique(prob_groups["CF (Workspace Progress Status)"]))[0]
                    if progress != "GRADUATED":
                        continue
                    # A problem must be completed by the student clicking Done.
                    actions = list(prob_groups["Action"])
                    if "Done" not in actions:
                        continue
                    unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                    unique_steps_len = len([s for s in unique_steps
                                            if not pd.isna(s)
                                            and s not in options.opt_step1
                                            and s not in options.opt_step2])
                    if unique_steps_len < 4:
                        continue
                    class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                    step_names_token = []
                    original_steps_actions_attempts_help_levels_outcomes = []
                    original_steps = []
                    means_and_extremes = False
                    for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                   'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                   'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                        step = row["Step Name"]
                        action = row["Action"]        # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                        attempt = row["Attempt At Step"]
                        outcome = row["Outcome"]      # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        help_level = row["Help Level"]
                        scenario = row['CF (Problem Scenario Tags)']

                        if pd.isna(step):
                            continue

                        # "{key=value}" etalon; a float value marks means-and-extremes.
                        if step in options.opt_step1 and not means_and_extremes:
                            etalon = row["CF (Etalon)"]
                            if not pd.isna(etalon):
                                etalon = etalon.strip('{}')
                                key, value = etalon.split('=')
                                etalon = value
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                        if row['CF (Is Autofilled)'] == True:
                            continue

                        prev = step_names_token[-1] if step_names_token else ""
                        # NOTE(review): assumes step names contain no '-'; otherwise
                        # splitting off the code suffix is ambiguous — confirm schema.
                        prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                        if not step_names_token or step != prev_step:
                            if step in options.opt_step1 or step in options.opt_step2:
                                new_step = step
                            elif action == "Attempt" and outcome != "OK":
                                new_step = step + "-2"
                            elif "Hint" in action:
                                new_step = step + "-1"
                            else:
                                new_step = step + "-0"
                            step_names_token.append(new_step)
                        elif not (step in options.opt_step1 or step in options.opt_step2):
                            if action == "Attempt" and outcome != "OK":
                                new_step = step + "-2"
                            elif "Hint" in action:
                                new_step = step + "-1"
                            else:
                                new_step = step + "-0"
                            # Keep the "worst" code for a repeated step
                            # (lexicographic: step-2 > step-1 > step-0).
                            if prev < new_step:
                                step_names_token[-1] = new_step
                        original_steps_actions_attempts_help_levels_outcomes.append(
                            f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                        original_steps.append(step)

                    # NOTE(review): this counts occurrences, not distinct names —
                    # unlike the unique-step pre-filter above; confirm intended.
                    unique_steps_len = len([s for s in original_steps
                                            if s not in options.opt_step1 and s not in options.opt_step2])
                    if step_names_token and unique_steps_len > 4:
                        info_line = ",".join([
                            str(school), "\t".join(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)),
                        ])
                        proba = random.random()
                        if proba <= 0.8:
                            out_file, out_info = train_file, train_info
                        elif proba > 0.9:
                            out_file, out_info = test_file, test_info
                        else:
                            out_file, out_info = val_file, val_info
                        out_file.write("\t".join(step_names_token))
                        out_file.write("\n")
                        out_info.write(info_line)
                        out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_attention_files(data_processor, options):
    """Write attention-analysis sequences split deterministically by progress.

    Sequence construction is identical to prepare_school_pretraining_files
    (only "strategy" sequences that end with a Done action, tokens stripped
    back to bare step names), but the split is by workspace progress:
    GRADUATED problems go to the train file, PROMOTED problems to the test
    file, everything else is dropped.  The val files are opened and closed
    for interface parity but never written.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``.
        options: namespace with output paths, ``opt_step1``/``opt_step2``
            lists and an optional ``school`` filter.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                for student, student_groups in class_group.groupby("Anon Student Id"):
                    # BUG FIX: the original discarded the sort_values() result,
                    # so rows were never time-ordered.
                    student_groups = student_groups.sort_values(by="Time")
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step',
                                                       'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            progress = row["CF (Workspace Progress Status)"]
                            action = row["Action"]
                            attempt = row["Attempt At Step"]
                            autofilled = row["CF (Is Autofilled)"]
                            step = row["Step Name"]
                            scenario = row['CF (Problem Scenario Tags)']

                            if pd.isna(step):
                                continue

                            # "{key=value}" etalon; a float marks means-and-extremes.
                            if step in options.opt_step1 and not means_and_extremes:
                                etalon = row["CF (Etalon)"]
                                if not pd.isna(etalon):
                                    etalon = etalon.strip('{}')
                                    key, value = etalon.split('=')
                                    etalon = value
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass

                            if not autofilled:
                                step_names_token.append(f"{step}:{action}:{attempt}")

                        if not step_names_token:
                            continue

                        where_opt = []
                        step1 = False
                        step2 = False
                        strategy_data = False
                        for step_oh in step_names_token:
                            parts = step_oh.split(":")
                            # Only the last two fields are action/attempt.
                            if len(parts) == 3:
                                step = parts[0]
                            else:
                                step = ":".join(parts[:2])

                            if step == options.opt_step1[0]:
                                where_opt.append("_1")
                                step1 = True
                            elif step == options.opt_step2[0]:
                                where_opt.append("_2")
                                step2 = True
                            elif step in options.opt_step1[1:]:
                                where_opt.append("1")
                                if step1:
                                    strategy_data = True
                            elif step in options.opt_step2[1:]:
                                where_opt.append("2")
                                if step2:
                                    strategy_data = True
                            else:
                                where_opt.append("0")

                        # Only sequences ending with a Done action count.
                        if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                            strategy_data = False

                        if not strategy_data:
                            continue

                        # Strip ":action:attempt" and collapse consecutive duplicates.
                        stripped = [":".join(s.split(":")[:-2]) for s in step_names_token]
                        step_names_token = []
                        for s in stripped:
                            if s != "nan" and (not step_names_token or s != step_names_token[-1]):
                                step_names_token.append(s)

                        info_line = ",".join([
                            str(school), str(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, where_opt)),
                        ])
                        if progress == "GRADUATED":
                            train_file.write("\t".join(step_names_token))
                            train_file.write("\n")
                            train_info.write(info_line)
                            train_info.write("\n")
                        elif progress == "PROMOTED":
                            test_file.write("\t".join(step_names_token))
                            test_file.write("\n")
                            test_info.write(info_line)
                            test_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_finetuning_10per_files(data_processor, options):
    """Build a balanced 10% finetuning train split and a balanced test split.

    Used for the L@S paper.  Only two strategy labels are defined:
        0: non-opt strategy
        1: opt used strategy
    All qualifying sequences are collected first; then ``sample_size``
    (half of 10% of the total) indices are sampled from each label for
    train, and ``min(#zeros, #ones)`` indices from each label for test.

    NOTE(review): train and test indices are sampled independently, so a
    test-sampled index that was also train-sampled is consumed by the train
    branch below — the written test set can be smaller than balanced.

    Args:
        data_processor: object exposing ``load_file_iterator()``.
        options: namespace with ``train/test`` ``file/info/label`` paths and
            ``opt_step1``/``opt_step2`` lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem must be completed by the student clicking Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        step_names_token = []

                        # Events < 1800 s after the previous one flag auto-completion.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                       'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if pd.isna(step):
                                continue
                            # A float etalon on an opt-step-1 row marks means-and-extremes.
                            if step in options.opt_step1:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))
                        # 4 and more distinct non-opt steps in sequence.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Label 1 when any non-initial opt step occurs
                            # (substring match against each step name).
                            label_opt = "0"
                            if options.opt_step1:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step1[1:]):
                                    label_opt = "1"
                            if options.opt_step2:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step2[1:]):
                                    label_opt = "1"

                            # progress, problem, student, auto_complete, length,
                            # outcome seq, help-level seq, opt encoding, ME flag.
                            info = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                                f"{1 if means_and_extremes else 0}",
                            ])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(label_opt)

    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])

    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len / 2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))

    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_instances:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")

    train_file.close()
    train_info.close()
    train_label.close()

    test_file.close()
    test_info.close()
    test_label.close()
def prepare_finetuning_IS_FS_files(data_processor, options):
    """Build finetuning splits from initial vs final problems per student.

    Used for the L@S paper.  This function gathers the first three problems
    of each student as train (IS) and the last three as test (FS).  Only two
    strategy labels are defined:
        0: non-opt strategy
        1: opt used strategy

    Students with fewer than 3 distinct problems are skipped.  When a problem
    appears in both the first and last lists it is written to train only
    (the ``if`` branch wins).

    Args:
        data_processor: object exposing ``load_file_iterator()``.
        options: namespace with ``train/test`` ``file/info/label`` paths and
            ``opt_step1``/``opt_step2`` lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)

                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    if len(prob_list) < 3:
                        continue
                    selected = 3
                    first_prob_list = prob_list[:selected]
                    last_prob_list = prob_list[-selected:]

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem must be completed by the student clicking Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        step_names_token = []

                        # Events < 1800 s after the previous one flag auto-completion.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                       'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if pd.isna(step):
                                continue
                            # A float etalon on an opt-step-1 row marks means-and-extremes.
                            if step in options.opt_step1:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))
                        # 4 and more distinct non-opt steps in sequence.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Label 1 when any non-initial opt step occurs
                            # (substring match against each step name).
                            label_opt = "0"
                            if options.opt_step1:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step1[1:]):
                                    label_opt = "1"
                            if options.opt_step2:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step2[1:]):
                                    label_opt = "1"

                            # progress, problem, student, auto_complete, length,
                            # outcome seq, help-level seq, opt encoding, ME flag.
                            info = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                                f"{1 if means_and_extremes else 0}",
                            ])
                            if prob in first_prob_list:
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_info.write(info)
                                train_info.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                            elif prob in last_prob_list:
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_info.write(info)
                                test_info.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")

    train_file.close()
    train_info.close()
    train_label.close()

    test_file.close()
    test_info.close()
    test_label.close()


def prepare_finetuning_IS_files_old(data_processor, opts):
    '''Used for L@S paper. This function gathers first three problems of each student.'''
    # NOTE: the body of this function is continued in the original source
    # (truncated in this chunk).
+ Only two strategies were defined as: + 0: non-opt strategy + 1: opt used strategy + ''' + + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/IS/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + trainr_label = open(options.trainr_label_path, "w") + train_gt_label = open(options.train_gt_label_path, "w") + + # test_file = open(options.test_file_path, "w") + # test_info = open(options.test_info_path, "w") + # test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if options.workspace_name == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + if len(prob_list) < 3: + continue + + first_prob_list = prob_list[:3] +# last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in first_prob_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + 
if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + finals = len(options.final_step) + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + if finals == 0: + totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (finals and step in options.final_step) or totals > 0: + out = out.split(":") + if any(any(ind in o for o in out) for ind in error_ind): + errors +=1 + + if finals: + totals = finals + # 4 and more in sequence + if step_names_token: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if 
options.opt_step1: + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + if any_opt1: + label_opt = "1" + + + if options.opt_step2: + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "1" + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) + + overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) + overall_labels.append(label_opt) + + overall_data.append('') + overall_labels.append('') + +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) + +# zeros_instances_size = int(1 * len(indices_of_zeros)) +# ones_instances_size = int(1 * len(indices_of_ones)) +# sample_size = min(zeros_instances_size, ones_instances_size) +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + strat_correct = all_data[1] + info = all_data[2] + me_opt = all_data[3] + + # if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + trainr_label.write(strat_correct) + trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + train_gt_label.write(me_opt) + 
train_gt_label.write("\n") + # else: + # writtenTest = True + # test_file.write(steps_seq) + # test_file.write("\n") + # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + # test_label.write(label) + # test_label.write("\n") + # # testr_label.write(str(correctness)) + # testr_label.write(strat_correct) + # testr_label.write("\n") + # test_info.write(info) + # test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + train_gt_label.write("\n") + # if writtenTest: + # writtenTest = False + # test_file.write("\n") + # test_info.write("\n") + # test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + train_gt_label.close() + + # test_file.close() + # test_info.close() + # test_label.close() + # testr_label.close() + # test_gt_label.close() + +def prepare_finetuning_FS_files_old(data_processor, opts): + ''' + Used for L@S paper. This function gathers last three problems of each student. 
+ Only two strategies were defined as: + 0: non-opt strategy + 1: opt used strategy + ''' + + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/FS/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + trainr_label = open(options.trainr_label_path, "w") + train_gt_label = open(options.train_gt_label_path, "w") + + # test_file = open(options.test_file_path, "w") + # test_info = open(options.test_info_path, "w") + # test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if options.workspace_name == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + if len(prob_list) < 3: + continue + + # first_prob_list = prob_list[:3] + last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in last_prob_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + 
if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + finals = len(options.final_step) + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + if finals == 0: + totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (finals and step in options.final_step) or totals > 0: + out = out.split(":") + if any(any(ind in o for o in out) for ind in error_ind): + errors +=1 + + if finals: + totals = finals + # 4 and more in sequence + if step_names_token: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if 
options.opt_step1: + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + if any_opt1: + label_opt = "1" + + + if options.opt_step2: + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "1" + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) + + overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) + overall_labels.append(label_opt) + + overall_data.append('') + overall_labels.append('') + +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) + +# zeros_instances_size = int(0.10 * len(indices_of_zeros)) +# ones_instances_size = int(0.10 * len(indices_of_ones)) +# sample_size = min(zeros_instances_size, ones_instances_size) +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + strat_correct = all_data[1] + info = all_data[2] + me_opt = all_data[3] + + # if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + trainr_label.write(strat_correct) + trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + 
train_gt_label.write(me_opt) + train_gt_label.write("\n") + # else: + # writtenTest = True + # test_file.write(steps_seq) + # test_file.write("\n") + # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + # test_label.write(label) + # test_label.write("\n") + # # testr_label.write(str(correctness)) + # testr_label.write(strat_correct) + # testr_label.write("\n") + # test_info.write(info) + # test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + train_gt_label.write("\n") + # if writtenTest: + # writtenTest = False + # test_file.write("\n") + # test_info.write("\n") + # test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + train_gt_label.close() + + # test_file.close() + # test_info.close() + # test_label.close() + # testr_label.close() + # test_gt_label.close() + + +def prepare_finetuning_correctness_files(data_processor, options): + ''' + Ongoing research. Student strategy learning/predicting. 
+ FinalAnswer step + Correct: 1 , correctness of final strategy > 0.75 + Incorrect: 0 , else < 0.75 + ''' + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if "ratio_proportion_change3" == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + for prob, prob_groups in student_groups.groupby("Problem Name"): + + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups["Step Name"])) + unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) + if unique_steps_len < 4: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 1800: + time_stamps_list.add(time_stamps[i+1]) + + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + final_correct = 0 + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + auto_complete = True + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + if step == "FinalAnswer": + final_correct += 1 + unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) + # 4 and more in sequence + if step_names_token and unique_steps_len > 4: + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") 
+ elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if final_correct == 1: + label_opt = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), + "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), + f"{1 if means_and_extremes else 0}"]) + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(label_opt) + + # overall_data.append('') + # overall_labels.append('') + + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/2) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + # writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + else: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() 
+ test_info.close() + test_label.close() + +def prepare_finetuning_correctness_files_old(data_processor, opts): + ''' + Ongoing research. Student strategy learning/predicting. + Correct, 1: correctness of final strategy > 0.75 + Incorrect, 0: else < 0.75 + ''' + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/fa_correctness/"+f_path[2] + # f_path = f_path[0]+"/"+f_path[1]+"/check2/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + # trainr_label = open(options.trainr_label_path, "w") + # train_gt_label = open(options.train_gt_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + ws = "_".join(options.workspace_name.split("_")[:-1]) + print("Workspace: ", ws) + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if ws == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + # if len(prob_list) < 3: + # continue + +# first_prob_list = prob_list[:3] + # last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), 
len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only +# if not prob in last_prob_list: +# continue + # print(options.final_step in list(prob_groups["Step Name"])) + # if not (options.final_step in list(prob_groups["Step Name"])): + # continue + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + # finals = len(options.final_step) + + + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + # if finals == 0: + # totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (step in 
options.final_step):# or totals > 0: + out = out.split(":") + totals = len(out) + # print(totals) + for ind in error_ind: + if ind in out: + errors +=1 + + # if finals: + # totals = finals + # 4 and more in sequence + if step_names_token and totals>0: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if options.opt_step1: + all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + + if any_opt1: + label_opt = "2" + if all_opt1: + label_opt = "1" + + + if options.opt_step2: + all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "4" + if all_opt2: + label_opt = "3" + if any_opt1 and any_opt2: + label_opt = "5" + if any_opt1 and all_opt2: + label_opt = "6" + if all_opt1 and any_opt2: + label_opt = "7" + if all_opt1 and all_opt2: + label_opt = "8" + + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # if not means_and_extremes and label_opt == "2": + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}"]) + + overall_data.append(["\t".join(step_names_token), label_opt, info]) + overall_labels.append(strat_correct) + + overall_data.append('') + overall_labels.append('') + + overall_labels = np.array(overall_labels, 
dtype=str) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + per = 0.20 + zeros_instances_size = int(per * len(indices_of_zeros)) + ones_instances_size = int(per * len(indices_of_ones)) + + sample_size = min(zeros_instances_size, ones_instances_size) + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + label_opt = all_data[1] + info = all_data[2] + # me_opt = all_data[3] + + if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + # trainr_label.write(label_opt) + # trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + # train_gt_label.write(me_opt) + # train_gt_label.write("\n") + else: + writtenTest = True + test_file.write(steps_seq) + test_file.write("\n") + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + test_label.write(label) + test_label.write("\n") + # testr_label.write(str(correctness)) + # testr_label.write(label_opt) + # testr_label.write("\n") + test_info.write(info) + test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + # trainr_label.write("\n") + # train_gt_label.write("\n") + if writtenTest: + writtenTest = False + test_file.write("\n") + test_info.write("\n") + test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + 
# trainr_label.close() + # train_gt_label.close() + + test_file.close() + test_info.close() + test_label.close() + # testr_label.close() + # test_gt_label.close() + +def prepare_finetuning_correctness_aaai_files(data_processor, opts): + ''' + Ongoing research. Student strategy learning/predicting. + Correct, 1: correctness of final strategy > 0.75 + Incorrect, 0: else < 0.75 + ''' + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test") or k.startswith("val"): + if v: + f_path = v.split("/") + # f_path = f_path[0]+"/"+f_path[1]+"/correctness/"+f_path[2] + f_path = f_path[0]+"/"+f_path[1]+"/aaai/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + + val_file = open(options.val_file_path, "w") + val_info = open(options.val_info_path, "w") + val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + high_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_high_performers.pkl", "rb")) + mid_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_mid_performers.pkl", "rb")) + low_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_low_performers.pkl", "rb")) + prob_sel_list = pickle.load(open(f"{options.workspace_name}/aaai/change3_problem_list.pkl", "rb")) + + ws = "_".join(options.workspace_name.split("_")[:-1]) + + print(ws, len(high_performer), len(mid_performer), len(low_performer), len(prob_sel_list)) + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for 
section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + # if options.workspace_name == section: + if ws == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + if student in high_performer or student in mid_performer or student in low_performer: + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in prob_sel_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = 
['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (step in options.final_step): + out = out.split(":") + totals = len(out) + # print(totals) + for ind in error_ind: + if ind in out: + errors +=1 + + # 4 and more in sequence + if step_names_token and totals>0: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # if not means_and_extremes and label_opt == "2": + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress), str(correctness), f"{1 if means_and_extremes else 0}",str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]) + + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(strat_correct) + + # overall_data.append('') + # overall_labels.append('') + + overall_labels = np.array(overall_labels) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + info = all_data[1] + student = info.split(",")[4] + + if student in high_performer: + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + train_info.write(info) + train_info.write("\n") + elif student in mid_performer: + val_file.write(steps_seq) + val_file.write("\n") + val_label.write(label) + val_label.write("\n") + val_info.write(info) + val_info.write("\n") + elif student in low_performer: + test_file.write(steps_seq) + test_file.write("\n") + test_label.write(label) + test_label.write("\n") + 
def prepare_finetuning_SL_files(data_processor, opts):
    """Build balanced train/test files for the strategy-learning (SL) task.

    Nine strategies are encoded as labels:
        UU;0  CU;1  PU;2  UC;3  UP;4  PP;5  PC;6  CP;7  CC;8

    Output paths are derived from ``opts`` by inserting an ``SL/`` folder
    segment into every train*/test* path.  For each (student, problem) pair a
    step sequence, a correctness flag, an info line and a strategy label are
    collected; 20% of each label class — truncated to the rarest class so the
    split is balanced — is sampled into train, the remainder goes to test.

    Parameters:
        data_processor: provides ``load_file_iterator()`` yielding DataFrame
            chunks of the interaction log.
        opts: argparse-style namespace with file paths, ``workspace_name``,
            ``opt_step1``/``opt_step2``/``final_step`` and ``dataset_folder``.
    Returns: None (writes the train/test files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0] + "/" + f_path[1] + "/SL/" + f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was opened and leaked); value is unused
    # but the load is kept so a missing file still fails fast here.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True returned a new
                    # frame that was discarded, so rows were NOT time-ordered
                    # (the pretraining variant correctly uses inplace=True).
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Rows whose timestamp follows the previous one by less
                        # than 2000 units are treated as auto-completed fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Etalon parsing as float-but-not-int marks the
                                    # Means-and-Extremes problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: fold outcome/help level into the last entry.
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Strategy label: 1/2 = all/any of opt_step1 used,
                            # 3/4 = all/any of opt_step2, 5-8 = combinations.
                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"

                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete),
                                             str(len(step_names_token)), "\t".join(map(str, outcome)),
                                             "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             str(correctness)])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info,
                                                 f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_opt)

                    # Empty record marks the boundary between students in the output.
                    overall_data.append('')
                    overall_labels.append('')

    overall_labels = np.array(overall_labels)
    # Index lists per strategy label '0'..'8' (empty separator records match none).
    label_indices = [list(np.where(overall_labels == str(lbl))[0]) for lbl in range(9)]

    per = 0.20
    sample_size = min(int(per * len(idx)) for idx in label_indices)
    print(f"Sample size.... {sample_size}")
    sampled_instances = []
    for idx in label_indices:
        sampled_instances.extend(random.sample(idx, sample_size))
    # FIX: membership below was an O(len) list scan per record — use a set.
    sampled_instances = set(sampled_instances)

    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq, strat_correct, info, me_opt = all_data
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq + "\n")
                train_label.write(label + "\n")
                trainr_label.write(strat_correct + "\n")
                train_info.write(info + "\n")
                train_gt_label.write(me_opt + "\n")
            else:
                writtenTest = True
                test_file.write(steps_seq + "\n")
                test_label.write(label + "\n")
                testr_label.write(strat_correct + "\n")
                test_info.write(info + "\n")
                test_gt_label.write(me_opt + "\n")
        else:
            # Separator record: emit one blank line to whichever split
            # received the preceding student's records.
            if writtenTrain:
                writtenTrain = False
                for f in (train_file, train_info, train_label, trainr_label, train_gt_label):
                    f.write("\n")
            if writtenTest:
                writtenTest = False
                for f in (test_file, test_info, test_label, testr_label, test_gt_label):
                    f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label, train_gt_label,
              test_file, test_info, test_label, testr_label, test_gt_label):
        f.close()
def prepare_finetuning_effectiveness_files(data_processor, opts):
    """Build balanced train/test files for the strategy-effectiveness task.

    Strategy notation (UU;0 CU;1 PU;2 UC;3 UP;4 PP;5 PC;6 CP;7 CC;8).
    A record is a positive instance (label "1") when:
      * strategy in {UU, CU, PU} and ground truth is ER and answer correct, or
      * strategy in {UU, UC, UP} and ground truth is ME and answer correct;
    otherwise it is negative ("0").

    Output paths are derived from ``opts`` by inserting an ``effectiveness/``
    folder segment.  Equal numbers of positive and negative records (20% of
    the rarer class) go to train; everything else goes to test.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0] + "/" + f_path[1] + "/effectiveness/" + f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"

                            # Effective iff the strategy matches the ground-truth
                            # variant (ER vs ME) and the attempt was correct.
                            label_effectiveness = "0"
                            if label_opt in ["0", "1", "2"] and not means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"
                            elif label_opt in ["0", "3", "4"] and means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"

                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness, strategy, GT
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete),
                                             str(len(step_names_token)), "\t".join(map(str, outcome)),
                                             "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             str(correctness), label_opt, f"{1 if means_and_extremes else 0}"])

                            overall_data.append(["\t".join(step_names_token), strat_correct, info,
                                                 f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_effectiveness)

                    # Empty record marks the boundary between students in the output.
                    overall_data.append('')
                    overall_labels.append('')

    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])

    # Balanced train split: 20% of the rarer class from each class.
    per = 0.20
    sample_size = min(int(per * len(indices_of_zeros)), int(per * len(indices_of_ones)))
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # FIX: membership below was an O(len) list scan per record — use a set.
    sampled_instances = set(sampled_instances)

    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq, strat_correct, info, me_opt = all_data
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq + "\n")
                train_label.write(label + "\n")
                trainr_label.write(strat_correct + "\n")
                train_info.write(info + "\n")
                train_gt_label.write(me_opt + "\n")
            else:
                writtenTest = True
                test_file.write(steps_seq + "\n")
                test_label.write(label + "\n")
                testr_label.write(strat_correct + "\n")
                test_info.write(info + "\n")
                test_gt_label.write(me_opt + "\n")
        else:
            # Separator record: blank line to whichever split got the
            # preceding student's records.
            if writtenTrain:
                writtenTrain = False
                for f in (train_file, train_info, train_label, trainr_label, train_gt_label):
                    f.write("\n")
            if writtenTest:
                writtenTest = False
                for f in (test_file, test_info, test_label, testr_label, test_gt_label):
                    f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label, train_gt_label,
              test_file, test_info, test_label, testr_label, test_gt_label):
        f.close()
def prepare_attn_test_files(data_processor, opts):
    """Write attention-analysis sequences split by the ``options.code`` filter.

    A record goes to the *train* files when it passes the code filter:
      * "full"     — everything (no test files are opened/written),
      * "gt"       — ground truth is Equivalent-Ratios (not Means-and-Extremes),
      * "correct"  — correctness above 0.75,
      * "progress" — workspace progress status is "GRADUATED";
    all remaining records go to the *test* files.  Records with strategy
    label "0" are dropped from the train side.

    Output paths get a ``/<code>/`` folder segment; the folder is created
    under the workspace directory if missing.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)

    if options.code:
        new_folder = f"{options.workspace_name}/{options.code}"
        # FIX: exists-check + makedirs was racy; exist_ok makes it atomic.
        os.makedirs(new_folder, exist_ok=True)

    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = (f"/{options.code}/").join(v.split("/"))
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    if options.code != "full":
        test_file = open(options.test_file_path, "w")
        test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            opt_correct = "0"
                            if correctness > 0.75:
                                opt_correct = "1"

                            # Value unused, but kept so the module-level RNG
                            # stream stays identical for subsequent callers.
                            proba = random.random()

                            if options.code == "full" or \
                               (options.code == "gt" and not means_and_extremes) or \
                               (options.code == "correct" and opt_correct == "1") or \
                               (options.code == "progress" and progress == "GRADUATED"):
                                if label_opt == "0":
                                    continue
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                # progress, problem name, student id, auto-complete flag, steps length,
                                # outcome seq, help_level seq, opt-step encoding, correctness, GT, strategy
                                train_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                           str(len(step_names_token)),
                                                           "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                           "\t".join(map(str, where_opt)),
                                                           str(correctness), f"{1 if means_and_extremes else 0}", label_opt]))
                                train_info.write("\n")
                            else:
                                # Only reachable when code != "full", so the test
                                # handles are guaranteed to exist here.
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                          str(len(step_names_token)),
                                                          "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                          "\t".join(map(str, where_opt)),
                                                          str(correctness), f"{1 if means_and_extremes else 0}", label_opt]))
                                test_info.write("\n")

    train_file.close()
    train_info.close()

    if options.code != "full":
        test_file.close()
        test_info.close()
def prepare_finetuning_future_files(data_processor, opts):
    """Split sequences by ground-truth variant for the "future" finetuning task.

    Equivalent-Ratios problems (etalon parses as int) go to the train files;
    Means-and-Extremes problems (etalon parses only as float) go to the test
    files.  Each record carries the strategy label and a quartile-binned
    correctness label; a blank line separates consecutive students.

    NOTE(review): the path rewrite inserts "effectiveness", not "future" —
    this looks copy-pasted from prepare_finetuning_effectiveness_files and
    makes both functions write into the same folder; confirm before changing.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = ("/effectiveness/").join(v.split("/"))
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    writtenTrain = False
                    writtenTest = False

                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        errors = 0
                        totals = 0
                        means_and_extremes = False

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        # Unlike the SL/effectiveness variants, errors are
                        # counted over ALL steps, not just final steps.
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        for out in outcome:
                            out = out.split(":")
                            if any(any(ind in o for o in out) for ind in error_ind):
                                errors += 1

                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            # Quartile-binned correctness label "0".."3".
                            correctness = 1 - errors / totals
                            if correctness < 0.25:
                                opt_correct = "0"
                            elif correctness < 0.5:
                                opt_correct = "1"
                            elif correctness < 0.75:
                                opt_correct = "2"
                            else:
                                opt_correct = "3"

                            # Value unused, but kept so the module-level RNG
                            # stream stays identical for subsequent callers.
                            proba = random.random()

                            if not means_and_extremes:
                                writtenTrain = True
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                                trainr_label.write(opt_correct)
                                trainr_label.write("\n")
                                # progress, problem name, student id, auto-complete flag,
                                # steps length, outcome seq, help_level seq, opt-step encoding
                                train_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                           str(len(step_names_token)),
                                                           "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                           "\t".join(map(str, where_opt))]))
                                train_info.write("\n")

                            if means_and_extremes:
                                writtenTest = True
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")
                                testr_label.write(opt_correct)
                                testr_label.write("\n")
                                test_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                          str(len(step_names_token)),
                                                          "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                          "\t".join(map(str, where_opt))]))
                                test_info.write("\n")

                    # Blank line marks the boundary between students in each
                    # split that received records for this student.
                    if writtenTrain:
                        for f in (train_file, train_info, train_label, trainr_label):
                            f.write("\n")
                    if writtenTest:
                        for f in (test_file, test_info, test_label, testr_label):
                            f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label,
              test_file, test_info, test_label, testr_label):
        f.close()
seq, help_level seq, encoding in steps length + test_label.write(label_opt) + test_label.write("\n") + # testr_label.write(str(correctness)) + testr_label.write(opt_correct) + testr_label.write("\n") + test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), + "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) + test_info.write("\n") + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + if writtenTest: + test_file.write("\n") + test_info.write("\n") + test_label.write("\n") + testr_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + + test_file.close() + test_info.close() + test_label.close() + testr_label.close() + +def prepare_school_coded_finetuning_partial_seq_files(data_processor, options): + ''' + Ongoing research. 
+ FinalAnswer step correctness + Correct: 0 if attempt at step>1 + 1 if attempt at step==1 + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + for prob, prob_groups in student_groups.groupby("Problem Name"): + + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + correctness = "0" + opt_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in options.opt_step1 or step in 
options.opt_step2: + new_step = step + opt_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if step != "FinalAnswer": + step_names_token.append(new_step) + else: + step_names_token.append("FinalAnswer") + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + if step == "FinalAnswer" and opt_used: + if attempt == 1 and outcome == "OK": + correctness = "1" + else: + correctness = "0" + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(correctness) +# proba = random.random() +# # if prob in first_prob_list: +# if proba <= 0.8: +# train_file.write("\t".join(step_names_token)) +# train_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, 
original_steps_actions_attempts_help_levels_outcomes))])) +# train_info.write("\n") + +# elif proba > 0.9: +# # elif prob in last_prob_list: +# test_file.write("\t".join(step_names_token)) +# test_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# test_info.write("\n") + +# else: +# val_file.write("\t".join(step_names_token)) +# val_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# val_info.write("\n") + # break + # break + # break + # break + # break + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/2) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] + indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] + + balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) + print(f"balanced_test: {balanced_test}") + test_sampled_instances = 
random.sample(indices_of_zeros, balanced_test) + test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + elif index in test_sampled_instances: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_opts_files(data_processor, options): + ''' + Ongoing research. 
+ Labels: + 0 - Opt 1 + 1 - Opt 2 + 2 - Both Opt + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) + # prob_list = prob_list[-int(len(prob_list)/2):] + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: + # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in 
options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + if (not opt1_used) and (not opt2_used): + continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + overall_data.append(["\t".join(step_names_token), info]) + label = None + if opt1_used and opt2_used: + label = "2" + if (not opt1_used) and opt2_used: + label = "1" + if opt1_used and (not opt2_used): + label = "0" + print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") + overall_labels.append(label) +# proba = random.random() +# # if prob in first_prob_list: +# if proba <= 0.8: +# train_file.write("\t".join(step_names_token)) +# train_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# train_info.write(",".join([str(school), 
"\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# train_info.write("\n") + +# elif proba > 0.9: +# # elif prob in last_prob_list: +# test_file.write("\t".join(step_names_token)) +# test_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# test_info.write("\n") + +# else: +# val_file.write("\t".join(step_names_token)) +# val_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# val_info.write("\n") + # break + # break + # break + # break + # break + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + indices_of_twos = list(np.where(overall_labels == '2')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/3) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + sampled_instances.extend(random.sample(indices_of_twos, sample_size)) + + indices_of_zeros = 
[i for i in indices_of_zeros if not i in sampled_instances ] + indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] + indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] + + balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) + print(f"balanced_test: {balanced_test}") + test_sampled_instances = random.sample(indices_of_zeros, balanced_test) + test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) + test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + elif index in test_sampled_instances: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_opts_intentional_files(data_processor, options): + ''' + Ongoing research. 
+ Labels: + 0 - Opt 1 + 1 - Opt 2 + 2 - Both Opt + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + + val_file = open(options.val_file_path, "w") + val_info = open(options.val_info_path, "w") + val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + # overall_data = [] + # overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + prob_list= list(pd.unique(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])) + # prob_list = prob_list[-int(len(prob_list)/2):] + if len(prob_list) == 0: + continue + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: + # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + # print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in 
options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + # if (not opt1_used) and (not opt2_used): + # continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + # overall_data.append(["\t".join(step_names_token), info]) + # label = None + # if opt1_used and opt2_used: + # label = "2" + # if (not opt1_used) and opt2_used: + # label = "1" + # if opt1_used and (not opt2_used): + # label = "0" + # print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") + # overall_labels.append(label) + + proba = random.random() + # if prob in first_prob_list: + if proba <= 0.8: + train_file.write("\t".join(step_names_token)) + train_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + train_info.write(",".join([str(school), 
"\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + train_info.write("\n") + + elif proba > 0.9: + # elif prob in last_prob_list: + test_file.write("\t".join(step_names_token)) + test_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + test_info.write("\n") + + else: + val_file.write("\t".join(step_names_token)) + val_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + val_info.write("\n") + # break + # break + # break + # break + # break +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) +# indices_of_twos = list(np.where(overall_labels == '2')[0]) + +# train_len = int(len(overall_labels) * 0.10) +# sample_size = int(train_len/3) +# print(f"sample_size: {sample_size}") +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) +# sampled_instances.extend(random.sample(indices_of_twos, sample_size)) + +# indices_of_zeros = [i for i in 
indices_of_zeros if not i in sampled_instances ] +# indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] +# indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] + +# balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) +# print(f"balanced_test: {balanced_test}") +# test_sampled_instances = random.sample(indices_of_zeros, balanced_test) +# test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) +# test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) + +# for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + +# steps_seq = all_data[0] +# info = all_data[1] + +# if index in sampled_instances: +# train_file.write(steps_seq) +# train_file.write("\n") + +# train_info.write(info) +# train_info.write("\n") + +# train_label.write(label) +# train_label.write("\n") +# elif index in test_sampled_instances: +# # proba = random.random() +# # if proba <0.5: +# test_file.write(steps_seq) +# test_file.write("\n") + +# test_info.write(info) +# test_info.write("\n") + +# test_label.write(label) +# test_label.write("\n") +# # else: +# # val_file.write(steps_seq) +# # val_file.write("\n") + +# # val_info.write(info) +# # val_info.write("\n") + +# # val_label.write(label) +# # val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + val_file.close() + val_info.close() + val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options): + ''' + Ongoing research. 
+ FinalAnswer step correctness + Correctness after opts: + 0 if attempt at step>1 + 1 if attempt at step==1 + ''' + kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + kcs = [kc for kc in kcs if not pd.isna(kc)] + kcs = np.array(sorted(list(kcs))) + print(kcs, type(kcs)) + print(f"KCs: {kcs}") + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) + # prob_list = prob_list[-int(len(prob_list)/2):] + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: 
+ # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. + if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + # print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + final_after_opts = False + correctness = "0" + kcs_skills = [0 for i in kcs] + diff_skills = [0 for i in kcs] + finalanswer_skill = [0 for i in kcs] + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + kc = row['KC Model(MATHia)'] + prev_skill = row['CF (Skill Previous p-Known)'] + curr_skill = row['CF (Skill New p-Known)'] + # print(kc, prev_skill) + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + 
if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts: + final_after_opts = True + if outcome == "OK": + correctness = "1" + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + if not pd.isna(kc): + index = np.argwhere(kcs==kc).flatten()[0] + # print(index, type(index)) + kcs_skills[index] = prev_skill + diff_skills[index] = prev_skill - curr_skill + if step == "FinalAnswer": + finalanswer_skill[index] = prev_skill + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + if (not opt1_used) and (not opt2_used): + continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + label = None + if opt1_used and opt2_used: + label = "2" 
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # Info record: school, class, student id, progress, problem name,
                            # scenario, preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome sequence, opt label,
                            # then per-KC skill, skill-delta and FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
# NOTE: a commented-out random 80/10/10 train/test/val writer (superseded by the
# label-balanced sampling below) was removed here for readability.
    # Label-balanced sampling over the collected instances.
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # indices_of_twos = list(np.where(overall_labels == '2')[0])

    # Train = 10% of all instances, half from each correctness class.
    # NOTE(review): random.sample raises ValueError if a class has fewer
    # instances than sample_size — assumes both classes are large enough.
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len/2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # sampled_instances.extend(random.sample(indices_of_twos, sample_size))

    # Remove the training picks, then build a balanced test set from the rest.
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
    # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

    # Write each sampled instance to its split's seq/info/label files.
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")

            train_info.write(info)
            train_info.write("\n")

            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_instances:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")

            test_info.write(info)
            test_info.write("\n")

            test_label.write(label)
            test_label.write("\n")
# NOTE: a commented-out "else" branch writing unsampled instances to the
# val_* files was removed here for readability.

    train_file.close()
    train_info.close()
    train_label.close()

    # val_file.close()
    # val_info.close()
    # val_label.close()

    test_file.close()
    test_info.close()
    test_label.close()

def prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
    Correctness after opts:
    0 if attempt at step>1
    1 if attempt at step==1

    Variant notes (vs. *_per_files): the train/test split is decided per
    student by a 50/50 coin flip, and per-KC skill deltas are computed
    ACROSS problems (this problem's prior p-Known minus the previous
    problem's) instead of within a single row.
    '''
    # Knowledge components for this workspace.
    # NOTE(review): hard-coded path — presumably should track
    # options.dataset_folder; confirm before reusing for other workspaces.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    # val_* outputs are not produced by this variant.

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    train_data = []
    train_labels = []

    test_data = []
    test_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            # Empty options.school means "all schools".
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    # Per-student 50/50 train/test assignment.
                    train = True
                    proba = random.random()
                    if proba < 0.5:
                        train = False
                    # Per-KC skill estimate carried over from the previous problem.
                    prev_kcs_skills = [0 for i in kcs]
                    for pi, (prob, prob_groups) in enumerate(student_groups.groupby("Problem Name")):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require >= 4 distinct non-optional, non-autofilled steps ...
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        # ... and >= 2 optional steps beyond each list's head entry.
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False
                        correctness = "0"
                        kcs_skills = [0 for i in kcs]
                        diff_skills = [0 for i in kcs]
                        finalanswer_skill = [0 for i in kcs]
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"] # number
                            outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"] # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            # print(kc, prev_skill)
                            if not pd.isna(step):
                                # Fractional etalon on an opt_step1 row flags the
                                # "means and extremes" (ME) strategy.
                                if step in options.opt_step1 and not means_and_extremes:
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                # Autofilled rows are tutor-generated: skip.
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim (no suffix).
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): [2:] here vs [1:] above — confirm intentional.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Suffix: -2 failed attempt, -1 hint, -0 otherwise.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                    # Label: first FinalAnswer after any optional step;
                                    # "1" only when that first outcome is OK.
                                    if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                        final_after_opts = True
                                        if outcome == "OK":
                                            correctness = "1"
                                    step_names_token.append(new_step)

                                else:
                                    # Repeat of the same step: upgrade the stored suffix
                                    # if the new interaction is "worse" (lexicographic
                                    # "X-0" < "X-1" < "X-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    index = np.argwhere(kcs==kc).flatten()[0]
                                    # print(index, type(index))
                                    kcs_skills[index] = prev_skill
                                    # Across-problem delta: this problem's prior p-Known
                                    # minus the previous problem's (0 for first problem).
                                    if pi != 0:
                                        diff_skills[index] = prev_skill - prev_kcs_skills[index]
                                    prev_kcs_skills[index] = prev_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[index] = prev_skill

                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Only keep problems where some optional step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # Opt-usage label: "0" = opt1 only, "1" = opt2 only, "2" = both.
                            # (At least one opt was used, so label is never left None.)
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # school, class, student id, progress, problem name, scenario,
                            # preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome, opt label,
                            # per-KC skill / skill-delta / FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            if train:
                                train_data.append(["\t".join(step_names_token), info])
                                train_labels.append(correctness)
                            else:
                                test_data.append(["\t".join(step_names_token), info])
                                test_labels.append(correctness)
# NOTE: commented-out random 80/10/10 writer and label-balanced sampling blocks
# (used by sibling variants) were removed here for readability.

    # Flush the per-student-split instances to the output files.
    for index, (all_data, label) in enumerate(zip(train_data, train_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        train_file.write(steps_seq)
        train_file.write("\n")

        train_info.write(info)
        train_info.write("\n")

        train_label.write(label)
        train_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()

    for index, (all_data, label) in enumerate(zip(test_data, test_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        test_file.write(steps_seq)
        test_file.write("\n")

        test_info.write(info)
        test_info.write("\n")

        test_label.write(label)
        test_label.write("\n")
    test_file.close()
    test_info.close()
    test_label.close()

def prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
    Correctness after opts:
    0 if attempt at step>1
    1 if attempt at step==1

    Variant notes: takes options.per as the train fraction (per == 1 means
    "largest balanced train set"; per > 1 means an absolute per-class count),
    and appends the chosen split to the val_* files (opened in append mode).
    '''
    # Knowledge components for this workspace.
    # NOTE(review): hard-coded path — presumably should track
    # options.dataset_folder; confirm before reusing for other workspaces.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    # Append mode: val files accumulate across runs/percentage settings.
    val_file = open(options.val_file_path, "a")
    val_info = open(options.val_info_path, "a")
    val_label = open(options.val_label_path, "a")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            # Empty options.school means "all schools".
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require >= 4 distinct non-optional, non-autofilled steps ...
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        # ... and >= 2 optional steps beyond each list's head entry.
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False
                        correctness = "0"
                        kcs_skills = [0 for i in kcs]
                        diff_skills = [0 for i in kcs]
                        finalanswer_skill = [0 for i in kcs]
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"] # number
                            outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"] # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            # print(kc, prev_skill)
                            if not pd.isna(step):
                                # Fractional etalon on an opt_step1 row flags the
                                # "means and extremes" (ME) strategy.
                                if step in options.opt_step1 and not means_and_extremes:
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                # Autofilled rows are tutor-generated: skip.
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim (no suffix).
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): [2:] here vs [1:] above — confirm intentional.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Suffix: -2 failed attempt, -1 hint, -0 otherwise.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                    # Label: first FinalAnswer after any optional step;
                                    # "1" only when that first outcome is OK.
                                    if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                        final_after_opts = True
                                        if outcome == "OK":
                                            correctness = "1"
                                    step_names_token.append(new_step)

                                else:
                                    # Repeat of the same step: upgrade the stored suffix
                                    # if worse (lexicographic "X-0" < "X-1" < "X-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    index = np.argwhere(kcs==kc).flatten()[0]
                                    # print(index, type(index))
                                    kcs_skills[index] = prev_skill
                                    # Within-row delta (prior p-Known minus new p-Known).
                                    diff_skills[index] = prev_skill - curr_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[index] = prev_skill

                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Only keep problems where some optional step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # Opt-usage label: "0" = opt1 only, "1" = opt2 only, "2" = both.
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # school, class, student id, progress, problem name, scenario,
                            # preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome, opt label,
                            # per-KC skill / skill-delta / FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
# NOTE: a commented-out random 80/10/10 train/test/val writer was removed here
# for readability.
    # Label-balanced sampling: train fraction is options.per of all instances.
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # indices_of_twos = list(np.where(overall_labels == '2')[0])

    # train_len = int(len(overall_labels) * 0.10)
    train_len = int(len(overall_labels) * float(options.per))

    # per == 1  -> largest balanced train set;
    # per > 1   -> absolute per-class sample count;
    # otherwise -> half of the per-fraction from each class.
    sample_size = int(train_len/2)
    if float(options.per) == 1:
        sample_size = min(len(indices_of_zeros), len(indices_of_ones))
    elif float(options.per) > 1:
        sample_size = int(options.per)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # sampled_instances.extend(random.sample(indices_of_twos, sample_size))

    # Remove train picks, then build a balanced test set from the remainder.
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
    # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")

            train_info.write(info)
            train_info.write("\n")

            train_label.write(label)
            train_label.write("\n")
            # When per == 1 the train picks are mirrored into val; otherwise
            # the test picks are (see below).
            if float(options.per) == 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")

                val_info.write(info)
                val_info.write("\n")

                val_label.write(label)
                val_label.write("\n")

        elif index in test_sampled_instances:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")

            test_info.write(info)
            test_info.write("\n")

            test_label.write(label)
            test_label.write("\n")

            if float(options.per) != 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")

                val_info.write(info)
                val_info.write("\n")

                val_label.write(label)
                val_label.write("\n")


    train_file.close()
    train_info.close()
    train_label.close()

    val_file.close()
    val_info.close()
    val_label.close()

    test_file.close()
    test_info.close()
    test_label.close()



def prepare_pretraining_vocab_file(options):
    '''
    Write the BERT-style vocab file: special tokens first, then every step
    name in sorted order — optional steps verbatim, all other steps expanded
    into their three outcome-suffixed variants (step-0, step-1, step-2).
    '''
    # NOTE: earlier commented-out variants (KC tokens, step-token dicts,
    # action/attempt step list) were removed here for readability.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb"))

    print("No of unique steps ", len(steps))

    ordered_steps = sorted(list(steps))

    with (open(options.vocab_file_path,"w")) as vb_file:
        vb_file.write("[PAD]\n")
        vb_file.write("[UNK]\n")
        vb_file.write("[MASK]\n")
        vb_file.write("[CLS]\n")
        vb_file.write("[SEP]\n")
        for step in ordered_steps:
            if step in options.opt_step1 or step in options.opt_step2:
                vb_file.write(f"{step}\n")
            else:
                # Matches the -0/-1/-2 outcome suffixes used when tokenizing.
                for i in range(3):
                    vb_file.write(f"{step}-{i}\n")
        vb_file.close()  # redundant: the with-block closes the file
    # Read back and echo the vocab for a quick sanity check.
    with open(options.vocab_file_path,"r") as f:
        l = f.readlines()
        print(l, len(l))
        f.close()  # redundant: the with-block closes the file


def main(opt):
    '''
    Entry point: optionally analyze the raw dataset, rewrite all *path
    options to live under the workspace/school/task folder layout, then
    dispatch to the pretraining or fine-tuning preparation routine.
    '''
    # Work on a copy so path rewriting does not mutate the parsed args.
    options = copy.deepcopy(opt)
    if opt.workspace_name:
        options.dataset_folder = opt.dataset_folder+opt.workspace_name+"/"

    data_processor = DataPreprocessor(input_file_path=opt.dataset)

    if opt.analyze_dataset_by_section:
        print(f"Analyzing dataset by section for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_section(opt.workspace_name)

        # Persist the unique-value inventories gathered during analysis.
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_prob_hierarchy, open(f"{options.dataset_folder}unique_hierarchy_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))

    if opt.analyze_dataset_by_school:
        print(f"Analyzing dataset of all school for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_school(opt.workspace_name)

        if not os.path.exists(options.dataset_folder):
            os.makedirs(options.dataset_folder)
        pickle.dump(data_processor.unique_schools, open(f"{options.dataset_folder}unique_schools_list.pkl", "wb"))
        pickle.dump(data_processor.unique_class, open(f"{options.dataset_folder}unique_class_list.pkl", "wb"))
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt_kcs, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_kcs.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_kcs, open(f"{options.dataset_folder}unique_new_steps_w_kcs_list.pkl", "wb"))

    # Rewrite every "*path" option so outputs land under
    # <workspace>/[sch_largest_N-coded/[school_folder/]]<pretraining|code|finetuning/task>/.
    if opt.workspace_name:
        for k,v in vars(opt).items():
            if 'path' in k:
                if v:
                    redirect_path = opt.workspace_name+"/"
                    if opt.school and opt.pretrain:
                        sch = f"sch_largest_{len(opt.school)}-coded" #f"sch_largest_655"
                        redirect_path = redirect_path + sch+"/"
                        # NOTE(review): nesting of the school_folder segment under
                        # the school-and-pretrain case inferred — confirm.
                        if opt.school_folder:
                            redirect_path = redirect_path + opt.school_folder+"/"
                    # else:
                    #     sch = "sch_largest_655"
                    if k != "vocab_file_path":
                        if opt.pretrain:
                            redirect_path = redirect_path + "pretraining/"
                        else:
                            if opt.code:
                                redirect_path = redirect_path + f"{opt.code}/"
                            elif opt.finetune_task:
                                # With -diff_val_folder, val files stay directly
                                # under finetuning/ instead of the task folder.
                                if opt.diff_val_folder and "val" in v:
                                    redirect_path = redirect_path + f"finetuning/"
                                else:
                                    redirect_path = redirect_path + f"finetuning/{opt.finetune_task}/"
                        if not os.path.exists(redirect_path):
                            os.makedirs(redirect_path)
                    else:
                        # vocab_file_path's default already contains "pretraining/",
                        # so only the directory needs to exist.
                        if not os.path.exists(redirect_path+"/pretraining/"):
                            os.makedirs(redirect_path+"/pretraining/")
                    setattr(options, f"{k}", redirect_path+v)
                    # setattr(options, f"{k}", opt.workspace_name+"/check/"+v)
                    print(f"options.{k} : {getattr(options, f'{k}')}")



    if options.pretrain:
        print("Preparing vocab...")
        prepare_pretraining_vocab_file(options)
        print("Preparing pre-training dataset...")
        # old non-repeated steps
        # prepare_pretraining_files(data_processor, options)
        # coded
        # prepare_school_coded_pretraining_files(data_processor, options)
        prepare_school_coded_finetuning_opts_intentional_files(data_processor, options)
        # prepare_pretraining_files(data_processor, options)
        # prepare_school_pretraining_files(data_processor, options)
    # else:
    #     print("Preparing attention dataset...")
    #     prepare_school_attention_files(data_processor, options)
    else:
        print("Preparing fine-tuning dataset...")
        # _1920
        # prepare_finetuning_10per_files(data_processor, options)
        # prepare_finetuning_IS_FS_files(data_processor, options)
        # prepare_finetuning_correctness_files(data_processor, options)

        # _2223
        # prepare_school_coded_finetuning_partial_seq_files(data_processor, options)
        # prepare_school_coded_finetuning_opts_files(data_processor, options)
        prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options)
        # prepare_finetuning_IS_files(data_processor, options)
        # # prepare_finetuning_FS_files(data_processor, options)
        # prepare_finetuning_correctness_aaai_files(data_processor, options)
        # # prepare_finetuning_SL_files(data_processor, options)
        # # prepare_finetuning_effectiveness_files(data_processor, options)
        # prepare_attn_test_files(data_processor, options)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset_folder', type=str, default="dataset/CL4999_1920/")

    # NOTE(review): argparse type=bool treats ANY non-empty string as True,
    # so "-analyze_dataset_by_section False" still enables the flag; only
    # omitting the option yields False. Same caveat applies to the other
    # type=bool options below.
    parser.add_argument('-analyze_dataset_by_section', type=bool, default=False)
    parser.add_argument('-analyze_dataset_by_school', type=bool, default=False)
    parser.add_argument('-workspace_name', type=str, default=None)
    parser.add_argument('-school', nargs='+', type=str, default=None)
    parser.add_argument('-school_folder', type=str, default=None)

    # parser.add_argument('-highGRschool', nargs='+', type=str, default=None)
    # parser.add_argument('-lowGRschool', nargs='+', type=str, default=None)

    parser.add_argument('-code', type=str, default=None)
    parser.add_argument('-finetune_task', type=str, default=None)

    # Train fraction (<1), balanced max (==1) or absolute per-class count (>1).
    parser.add_argument('-per', type=float, default=None)
    parser.add_argument("-diff_val_folder", type=bool, default=False, help="use for different val folder")

    parser.add_argument('-opt_step1', nargs='+', type=str, help='List of optional steps 1')
    parser.add_argument('-opt_step2', nargs='+', type=str, help='List of optional steps 2')
    parser.add_argument('-final_step', nargs='+', type=str, help='List of final step')

    parser.add_argument('-dataset', type=str, default="dataset/CL4999_1920/course2_1920_4999_students_datashop.txt")

    parser.add_argument('-pretrain', type=bool, default=False)
    parser.add_argument('-vocab_file_path', type=str, default="pretraining/vocab.txt") #pretraining/vocab.txt

    # Prepare for pretraining
    parser.add_argument('-train_file_path', type=str, default="train.txt") #pretraining/pretrain.txt
    parser.add_argument('-train_info_path', type=str, default="train_info.txt") #pretraining/pretrain_info.txt
    parser.add_argument('-train_label_path', type=str, default="train_label.txt") #finetuning/train_label.txt

    parser.add_argument('-val_file_path', type=str, default="val.txt") #pretraining/val.txt
    parser.add_argument('-val_info_path', type=str, default="val_info.txt") #pretraining/val_info.txt
    parser.add_argument('-val_label_path', type=str, default="val_label.txt") #finetuning/val_label.txt

    parser.add_argument('-test_file_path', type=str, default="test.txt") #pretraining/test.txt
    parser.add_argument('-test_info_path', type=str, default="test_info.txt") #pretraining/test_info.txt
    parser.add_argument('-test_label_path', type=str, default="test_label.txt") #finetuning/test_label.txt


#     parser.add_argument('-train_gt_label_path', type=str, default="finetuning/train_gt_label.txt")
#     parser.add_argument('-test_gt_label_path', type=str, default="finetuning/test_gt_label.txt")


    options = parser.parse_args()
    # Normalize the optional-step lists to [] so "in" checks work downstream.
    if not options.opt_step1:
        setattr(options, "opt_step1", [])
    print("Optional steps 1: ", options.opt_step1)

    if not options.opt_step2:
        setattr(options, "opt_step2", [])
    print("Optional steps 2: ", options.opt_step2)

    if not options.final_step:
        setattr(options, "final_step", [])
    print("Final steps: ", options.final_step)

    main(options)