import argparse
import copy
import os
import pickle
import random
from collections import Counter

import numpy as np
import pandas as pd

# Project-local dependency; only the CLI entry point (outside this chunk) needs it.
try:
    from data_preprocessor import DataPreprocessor
except ImportError:  # allows importing this module without the project package
    DataPreprocessor = None


def prepare_pretraining_files(data_processor, options):
    """Write pretraining step sequences for the "ratio_proportion_change3" workspace.

    Iterates log chunks from ``data_processor.load_file_iterator()`` and, for
    each (student, problem) group that was completed (contains a "Done"
    action) and has at least 4 distinct non-opt steps, emits one
    tab-separated line of consecutive-duplicate-free step names.  Sequences
    are split randomly: proba <= 0.8 -> train, proba > 0.9 -> test, else val.

    The companion info line per sequence is:
    progress, problem, student, auto_complete flag, length,
    ME flag (1 = means-and-extremes detected), outcome seq, help-level seq,
    opt-step encoding.

    Args:
        data_processor: object exposing ``load_file_iterator()`` yielding
            pandas DataFrame chunks of the raw tutor log.
        options: namespace providing the six output paths
            (``train/val/test`` x ``file/info``) plus ``opt_step1`` and
            ``opt_step2`` step-name lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            # NOTE(review): workspace name is hard-coded rather than taken from
            # options.workspace_name — confirm this is intentional.
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem counts as completed only when the student clicked Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        # Events arriving < 1800 s after the previous one mark
                        # timestamps of potential auto-completed opt steps.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        step_names_token = []
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'CF (Etalon)', 'Outcome',
                                                       'Help Level', 'CF (Workspace Progress Status)']].iterrows():
                            step = row["Step Name"]
                            progress = row["CF (Workspace Progress Status)"]
                            etalon = row["CF (Etalon)"]

                            if pd.isna(step):
                                continue

                            # A float (non-integer) etalon on an opt-step-1 row marks the
                            # "means and extremes" strategy.
                            if step in options.opt_step1 and not means_and_extremes:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass

                            # Opt steps at a "too fast" timestamp are treated as
                            # auto-completed by the tutor and skipped.
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                # Collapse immediate repeats; concatenate their outcomes.
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))

                        # Keep only sequences with more than 4 distinct non-opt steps.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            info_line = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                f"{1 if means_and_extremes else 0}",
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                            ])

                            proba = random.random()
                            if proba <= 0.8:
                                out_file, out_info = train_file, train_info
                            elif proba > 0.9:
                                out_file, out_info = test_file, test_info
                            else:
                                out_file, out_info = val_file, val_info
                            out_file.write("\t".join(step_names_token))
                            out_file.write("\n")
                            out_info.write(info_line)
                            out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_pretraining_files(data_processor, options):
    """Write per-school pretraining step sequences with a random train/val/test split.

    For every (school, class, student, problem) group, builds
    ``step:action:attempt`` tokens from non-autofilled rows and keeps only
    "strategy" sequences: an opt-step-1/2 start token followed later by
    another opt step of the same family, with the final token carrying a
    "Done" action.  Kept tokens are stripped back to bare step names,
    consecutive duplicates collapsed, and written tab-separated.
    Split: proba <= 0.8 -> train, proba > 0.9 -> test, else val.

    Info line per sequence: school, class, student, progress, problem,
    scenario, ME flag, length, opt-step encoding.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``
            yielding pandas DataFrame chunks.
        options: namespace with the six output paths,
            ``opt_step1``/``opt_step2`` step lists, and an optional
            ``school`` collection filter (falsy = keep all schools).
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                for student, student_groups in class_group.groupby("Anon Student Id"):
                    # BUG FIX: the original called sort_values(by="Time") and
                    # discarded the result, so rows were never time-ordered.
                    student_groups = student_groups.sort_values(by="Time")
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step',
                                                       'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            progress = row["CF (Workspace Progress Status)"]
                            action = row["Action"]
                            attempt = row["Attempt At Step"]
                            autofilled = row["CF (Is Autofilled)"]
                            step = row["Step Name"]
                            scenario = row['CF (Problem Scenario Tags)']

                            if pd.isna(step):
                                continue

                            # Etalon looks like "{key=value}"; a float value marks
                            # the means-and-extremes strategy.
                            if step in options.opt_step1 and not means_and_extremes:
                                etalon = row["CF (Etalon)"]
                                if not pd.isna(etalon):
                                    etalon = etalon.strip('{}')
                                    key, value = etalon.split('=')
                                    etalon = value
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass

                            if not autofilled:
                                step_names_token.append(f"{step}:{action}:{attempt}")

                        if not step_names_token:
                            continue

                        where_opt = []
                        step1 = False
                        step2 = False
                        strategy_data = False
                        for step_oh in step_names_token:
                            parts = step_oh.split(":")
                            # Only the last two fields are action/attempt; step names
                            # themselves may contain one ':'.
                            if len(parts) == 3:
                                step = parts[0]
                            else:
                                step = ":".join(parts[:2])

                            if step == options.opt_step1[0]:
                                where_opt.append("_1")
                                step1 = True
                            elif step == options.opt_step2[0]:
                                where_opt.append("_2")
                                step2 = True
                            elif step in options.opt_step1[1:]:
                                where_opt.append("1")
                                if step1:
                                    strategy_data = True
                            elif step in options.opt_step2[1:]:
                                where_opt.append("2")
                                if step2:
                                    strategy_data = True
                            else:
                                where_opt.append("0")

                        # The sequence only counts if it ends with a Done action.
                        if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                            strategy_data = False

                        if not strategy_data:
                            continue

                        proba = random.random()
                        # Strip ":action:attempt" and collapse consecutive duplicates.
                        stripped = [":".join(s.split(":")[:-2]) for s in step_names_token]
                        step_names_token = []
                        for s in stripped:
                            if s != "nan" and (not step_names_token or s != step_names_token[-1]):
                                step_names_token.append(s)

                        info_line = ",".join([
                            str(school), str(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, where_opt)),
                        ])
                        if proba <= 0.8:
                            out_file, out_info = train_file, train_info
                        elif proba > 0.9:
                            out_file, out_info = test_file, test_info
                        else:
                            out_file, out_info = val_file, val_info
                        out_file.write("\t".join(step_names_token))
                        out_file.write("\n")
                        out_info.write(info_line)
                        out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_coded_pretraining_files(data_processor, options):
    """Write pretraining sequences with error/hint-coded step tokens.

    Rows are filtered to first encounters (`CF (Is StepByStep)` False,
    `CF (Encounter)` 0, `CF (Is Review Mode)` -1) and, per student, to the
    later half of problems whose status is GRADUATED.  Each kept non-opt step
    is coded as ``step-2`` (failed attempt), ``step-1`` (hint) or ``step-0``
    (otherwise); opt steps keep their bare name.  Sequences are split
    randomly (<= 0.8 train, > 0.9 test, else val).

    Info line: school, classes, student, progress, problem, scenario, ME
    flag, length, original "step-action-attempt-helplevel-outcome" tokens.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``.
        options: namespace with output paths, ``opt_step1``/``opt_step2``
            lists and an optional ``school`` filter.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            print(f"{school} : {school_group.shape}")
            school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                        (school_group['CF (Encounter)'] == 0) &
                                        (school_group['CF (Is Review Mode)'] == -1)]
            print(f"{school} : {school_group.shape}")
            for student, student_groups in school_group.groupby("Anon Student Id"):
                student_groups.sort_values(by="Time", inplace=True)
                # Later half of this student's GRADUATED problem rows (all of
                # them when there are fewer than two such rows).
                prob_list = list(student_groups[student_groups["CF (Workspace Progress Status)"] == "GRADUATED"]["Problem Name"])
                prob_list = prob_list[-int(len(prob_list) / 2):]
                for prob, prob_groups in student_groups.groupby("Problem Name"):
                    if prob not in prob_list:
                        continue
                    progress = list(pd.unique(prob_groups["CF (Workspace Progress Status)"]))[0]
                    if progress != "GRADUATED":
                        continue
                    # A problem must be completed by the student clicking Done.
                    actions = list(prob_groups["Action"])
                    if "Done" not in actions:
                        continue
                    unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                    unique_steps_len = len([s for s in unique_steps
                                            if not pd.isna(s)
                                            and s not in options.opt_step1
                                            and s not in options.opt_step2])
                    if unique_steps_len < 4:
                        continue
                    class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                    step_names_token = []
                    original_steps_actions_attempts_help_levels_outcomes = []
                    original_steps = []
                    means_and_extremes = False
                    for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                   'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                   'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                        step = row["Step Name"]
                        action = row["Action"]        # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                        attempt = row["Attempt At Step"]
                        outcome = row["Outcome"]      # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        help_level = row["Help Level"]
                        scenario = row['CF (Problem Scenario Tags)']

                        if pd.isna(step):
                            continue

                        # "{key=value}" etalon; a float value marks means-and-extremes.
                        if step in options.opt_step1 and not means_and_extremes:
                            etalon = row["CF (Etalon)"]
                            if not pd.isna(etalon):
                                etalon = etalon.strip('{}')
                                key, value = etalon.split('=')
                                etalon = value
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                        if row['CF (Is Autofilled)'] == True:
                            continue

                        prev = step_names_token[-1] if step_names_token else ""
                        # NOTE(review): assumes step names contain no '-'; otherwise
                        # splitting off the code suffix is ambiguous — confirm schema.
                        prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                        if not step_names_token or step != prev_step:
                            if step in options.opt_step1 or step in options.opt_step2:
                                new_step = step
                            elif action == "Attempt" and outcome != "OK":
                                new_step = step + "-2"
                            elif "Hint" in action:
                                new_step = step + "-1"
                            else:
                                new_step = step + "-0"
                            step_names_token.append(new_step)
                        elif not (step in options.opt_step1 or step in options.opt_step2):
                            if action == "Attempt" and outcome != "OK":
                                new_step = step + "-2"
                            elif "Hint" in action:
                                new_step = step + "-1"
                            else:
                                new_step = step + "-0"
                            # Keep the "worst" code for a repeated step
                            # (lexicographic: step-2 > step-1 > step-0).
                            if prev < new_step:
                                step_names_token[-1] = new_step
                        original_steps_actions_attempts_help_levels_outcomes.append(
                            f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                        original_steps.append(step)

                    # NOTE(review): this counts occurrences, not distinct names —
                    # unlike the unique-step pre-filter above; confirm intended.
                    unique_steps_len = len([s for s in original_steps
                                            if s not in options.opt_step1 and s not in options.opt_step2])
                    if step_names_token and unique_steps_len > 4:
                        info_line = ",".join([
                            str(school), "\t".join(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)),
                        ])
                        proba = random.random()
                        if proba <= 0.8:
                            out_file, out_info = train_file, train_info
                        elif proba > 0.9:
                            out_file, out_info = test_file, test_info
                        else:
                            out_file, out_info = val_file, val_info
                        out_file.write("\t".join(step_names_token))
                        out_file.write("\n")
                        out_info.write(info_line)
                        out_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_school_attention_files(data_processor, options):
    """Write attention-analysis sequences split deterministically by progress.

    Sequence construction is identical to prepare_school_pretraining_files
    (only "strategy" sequences that end with a Done action, tokens stripped
    back to bare step names), but the split is by workspace progress:
    GRADUATED problems go to the train file, PROMOTED problems to the test
    file, everything else is dropped.  The val files are opened and closed
    for interface parity but never written.

    Args:
        data_processor: object exposing ``load_file_iterator(sep=",")``.
        options: namespace with output paths, ``opt_step1``/``opt_step2``
            lists and an optional ``school`` filter.
    """
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    val_file = open(options.val_file_path, "w")
    val_info = open(options.val_info_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            if options.school and school not in options.school:
                continue
            for class_id, class_group in school_group.groupby('CF (Anon Class Id)'):
                for student, student_groups in class_group.groupby("Anon Student Id"):
                    # BUG FIX: the original discarded the sort_values() result,
                    # so rows were never time-ordered.
                    student_groups = student_groups.sort_values(by="Time")
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []
                        means_and_extremes = False
                        for index, row in prob_groups[['Time', 'Step Name', 'Action', 'Attempt At Step',
                                                       'CF (Is Autofilled)', 'CF (Workspace Progress Status)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            progress = row["CF (Workspace Progress Status)"]
                            action = row["Action"]
                            attempt = row["Attempt At Step"]
                            autofilled = row["CF (Is Autofilled)"]
                            step = row["Step Name"]
                            scenario = row['CF (Problem Scenario Tags)']

                            if pd.isna(step):
                                continue

                            # "{key=value}" etalon; a float marks means-and-extremes.
                            if step in options.opt_step1 and not means_and_extremes:
                                etalon = row["CF (Etalon)"]
                                if not pd.isna(etalon):
                                    etalon = etalon.strip('{}')
                                    key, value = etalon.split('=')
                                    etalon = value
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass

                            if not autofilled:
                                step_names_token.append(f"{step}:{action}:{attempt}")

                        if not step_names_token:
                            continue

                        where_opt = []
                        step1 = False
                        step2 = False
                        strategy_data = False
                        for step_oh in step_names_token:
                            parts = step_oh.split(":")
                            # Only the last two fields are action/attempt.
                            if len(parts) == 3:
                                step = parts[0]
                            else:
                                step = ":".join(parts[:2])

                            if step == options.opt_step1[0]:
                                where_opt.append("_1")
                                step1 = True
                            elif step == options.opt_step2[0]:
                                where_opt.append("_2")
                                step2 = True
                            elif step in options.opt_step1[1:]:
                                where_opt.append("1")
                                if step1:
                                    strategy_data = True
                            elif step in options.opt_step2[1:]:
                                where_opt.append("2")
                                if step2:
                                    strategy_data = True
                            else:
                                where_opt.append("0")

                        # Only sequences ending with a Done action count.
                        if strategy_data and step_names_token[-1].split(":")[-2] != "Done":
                            strategy_data = False

                        if not strategy_data:
                            continue

                        # Strip ":action:attempt" and collapse consecutive duplicates.
                        stripped = [":".join(s.split(":")[:-2]) for s in step_names_token]
                        step_names_token = []
                        for s in stripped:
                            if s != "nan" and (not step_names_token or s != step_names_token[-1]):
                                step_names_token.append(s)

                        info_line = ",".join([
                            str(school), str(class_id), str(student), str(progress),
                            str(prob), str(scenario),
                            f"{1 if means_and_extremes else 0}",
                            str(len(step_names_token)),
                            "\t".join(map(str, where_opt)),
                        ])
                        if progress == "GRADUATED":
                            train_file.write("\t".join(step_names_token))
                            train_file.write("\n")
                            train_info.write(info_line)
                            train_info.write("\n")
                        elif progress == "PROMOTED":
                            test_file.write("\t".join(step_names_token))
                            test_file.write("\n")
                            test_info.write(info_line)
                            test_info.write("\n")

    train_file.close()
    train_info.close()

    val_file.close()
    val_info.close()

    test_file.close()
    test_info.close()
def prepare_finetuning_10per_files(data_processor, options):
    """Build a balanced 10% finetuning train split and a balanced test split.

    Used for the L@S paper.  Only two strategy labels are defined:
        0: non-opt strategy
        1: opt used strategy
    All qualifying sequences are collected first; then ``sample_size``
    (half of 10% of the total) indices are sampled from each label for
    train, and ``min(#zeros, #ones)`` indices from each label for test.

    NOTE(review): train and test indices are sampled independently, so a
    test-sampled index that was also train-sampled is consumed by the train
    branch below — the written test set can be smaller than balanced.

    Args:
        data_processor: object exposing ``load_file_iterator()``.
        options: namespace with ``train/test`` ``file/info/label`` paths and
            ``opt_step1``/``opt_step2`` lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem must be completed by the student clicking Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        step_names_token = []

                        # Events < 1800 s after the previous one flag auto-completion.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                       'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if pd.isna(step):
                                continue
                            # A float etalon on an opt-step-1 row marks means-and-extremes.
                            if step in options.opt_step1:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))
                        # 4 and more distinct non-opt steps in sequence.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Label 1 when any non-initial opt step occurs
                            # (substring match against each step name).
                            label_opt = "0"
                            if options.opt_step1:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step1[1:]):
                                    label_opt = "1"
                            if options.opt_step2:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step2[1:]):
                                    label_opt = "1"

                            # progress, problem, student, auto_complete, length,
                            # outcome seq, help-level seq, opt encoding, ME flag.
                            info = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                                f"{1 if means_and_extremes else 0}",
                            ])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(label_opt)

    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])

    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len / 2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones))
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))

    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")
            train_info.write(info)
            train_info.write("\n")
            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_instances:
            test_file.write(steps_seq)
            test_file.write("\n")
            test_info.write(info)
            test_info.write("\n")
            test_label.write(label)
            test_label.write("\n")

    train_file.close()
    train_info.close()
    train_label.close()

    test_file.close()
    test_info.close()
    test_label.close()
def prepare_finetuning_IS_FS_files(data_processor, options):
    """Build finetuning splits from initial vs final problems per student.

    Used for the L@S paper.  This function gathers the first three problems
    of each student as train (IS) and the last three as test (FS).  Only two
    strategy labels are defined:
        0: non-opt strategy
        1: opt used strategy

    Students with fewer than 3 distinct problems are skipped.  When a problem
    appears in both the first and last lists it is written to train only
    (the ``if`` branch wins).

    Args:
        data_processor: object exposing ``load_file_iterator()``.
        options: namespace with ``train/test`` ``file/info/label`` paths and
            ``opt_step1``/``opt_step2`` lists.
    """
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if "ratio_proportion_change3" == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)

                    prob_list = list(pd.unique(student_groups["Problem Name"]))
                    if len(prob_list) < 3:
                        continue
                    selected = 3
                    first_prob_list = prob_list[:selected]
                    last_prob_list = prob_list[-selected:]

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        # A problem must be completed by the student clicking Done.
                        actions = list(prob_groups["Action"])
                        if "Done" not in actions:
                            continue
                        unique_steps = list(pd.unique(prob_groups["Step Name"]))
                        unique_steps_len = len(set(
                            s for s in unique_steps
                            if not pd.isna(s)
                            and s not in options.opt_step1
                            and s not in options.opt_step2
                        ))
                        if unique_steps_len < 4:
                            continue

                        step_names_token = []

                        # Events < 1800 s after the previous one flag auto-completion.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 1800:
                                time_stamps_list.add(time_stamps[i + 1])

                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False

                        for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                       'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if pd.isna(step):
                                continue
                            # A float etalon on an opt-step-1 row marks means-and-extremes.
                            if step in options.opt_step1:
                                try:
                                    etalon = int(etalon)
                                except Exception:
                                    try:
                                        etalon = float(etalon)
                                        means_and_extremes = True
                                    except Exception:
                                        pass
                            if (step in options.opt_step1 or step in options.opt_step2) \
                                    and row["Time"] in time_stamps_list:
                                auto_complete = True
                                continue

                            if not step_names_token or step != step_names_token[-1]:
                                step_names_token.append(step)
                                # Outcome in ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                outcome.append(row['Outcome'])
                                help_level.append(str(row["Help Level"]))
                            else:
                                outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        unique_steps_len = len(set(
                            s for s in step_names_token
                            if s not in options.opt_step1 and s not in options.opt_step2
                        ))
                        # 4 and more distinct non-opt steps in sequence.
                        if step_names_token and unique_steps_len > 4:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Label 1 when any non-initial opt step occurs
                            # (substring match against each step name).
                            label_opt = "0"
                            if options.opt_step1:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step1[1:]):
                                    label_opt = "1"
                            if options.opt_step2:
                                if any(any(opt in step for step in step_names_token)
                                       for opt in options.opt_step2[1:]):
                                    label_opt = "1"

                            # progress, problem, student, auto_complete, length,
                            # outcome seq, help-level seq, opt encoding, ME flag.
                            info = ",".join([
                                str(progress), str(prob), str(student), str(auto_complete),
                                str(len(step_names_token)),
                                "\t".join(map(str, outcome)),
                                "\t".join(map(str, help_level)),
                                "\t".join(map(str, where_opt)),
                                f"{1 if means_and_extremes else 0}",
                            ])
                            if prob in first_prob_list:
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_info.write(info)
                                train_info.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                            elif prob in last_prob_list:
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_info.write(info)
                                test_info.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")

    train_file.close()
    train_info.close()
    train_label.close()

    test_file.close()
    test_info.close()
    test_label.close()


def prepare_finetuning_IS_files_old(data_processor, opts):
    '''Used for L@S paper. This function gathers first three problems of each student.'''
    # NOTE: the body of this function is continued in the original source
    # (truncated in this chunk).
+ Only two strategies were defined as: + 0: non-opt strategy + 1: opt used strategy + ''' + + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/IS/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + trainr_label = open(options.trainr_label_path, "w") + train_gt_label = open(options.train_gt_label_path, "w") + + # test_file = open(options.test_file_path, "w") + # test_info = open(options.test_info_path, "w") + # test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if options.workspace_name == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + if len(prob_list) < 3: + continue + + first_prob_list = prob_list[:3] +# last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in first_prob_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + 
if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + finals = len(options.final_step) + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + if finals == 0: + totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (finals and step in options.final_step) or totals > 0: + out = out.split(":") + if any(any(ind in o for o in out) for ind in error_ind): + errors +=1 + + if finals: + totals = finals + # 4 and more in sequence + if step_names_token: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if 
options.opt_step1: + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + if any_opt1: + label_opt = "1" + + + if options.opt_step2: + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "1" + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) + + overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) + overall_labels.append(label_opt) + + overall_data.append('') + overall_labels.append('') + +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) + +# zeros_instances_size = int(1 * len(indices_of_zeros)) +# ones_instances_size = int(1 * len(indices_of_ones)) +# sample_size = min(zeros_instances_size, ones_instances_size) +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + strat_correct = all_data[1] + info = all_data[2] + me_opt = all_data[3] + + # if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + trainr_label.write(strat_correct) + trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + train_gt_label.write(me_opt) + 
train_gt_label.write("\n") + # else: + # writtenTest = True + # test_file.write(steps_seq) + # test_file.write("\n") + # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + # test_label.write(label) + # test_label.write("\n") + # # testr_label.write(str(correctness)) + # testr_label.write(strat_correct) + # testr_label.write("\n") + # test_info.write(info) + # test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + train_gt_label.write("\n") + # if writtenTest: + # writtenTest = False + # test_file.write("\n") + # test_info.write("\n") + # test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + train_gt_label.close() + + # test_file.close() + # test_info.close() + # test_label.close() + # testr_label.close() + # test_gt_label.close() + +def prepare_finetuning_FS_files_old(data_processor, opts): + ''' + Used for L@S paper. This function gathers last three problems of each student. 
+ Only two strategies were defined as: + 0: non-opt strategy + 1: opt used strategy + ''' + + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/FS/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + trainr_label = open(options.trainr_label_path, "w") + train_gt_label = open(options.train_gt_label_path, "w") + + # test_file = open(options.test_file_path, "w") + # test_info = open(options.test_info_path, "w") + # test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if options.workspace_name == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + if len(prob_list) < 3: + continue + + # first_prob_list = prob_list[:3] + last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in last_prob_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + 
if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + finals = len(options.final_step) + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + if finals == 0: + totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (finals and step in options.final_step) or totals > 0: + out = out.split(":") + if any(any(ind in o for o in out) for ind in error_ind): + errors +=1 + + if finals: + totals = finals + # 4 and more in sequence + if step_names_token: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if 
options.opt_step1: + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + if any_opt1: + label_opt = "1" + + + if options.opt_step2: + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "1" + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness)]) + + overall_data.append(["\t".join(step_names_token), strat_correct, info, f"{1 if means_and_extremes else 0}"]) + overall_labels.append(label_opt) + + overall_data.append('') + overall_labels.append('') + +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) + +# zeros_instances_size = int(0.10 * len(indices_of_zeros)) +# ones_instances_size = int(0.10 * len(indices_of_ones)) +# sample_size = min(zeros_instances_size, ones_instances_size) +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + strat_correct = all_data[1] + info = all_data[2] + me_opt = all_data[3] + + # if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + trainr_label.write(strat_correct) + trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + 
train_gt_label.write(me_opt) + train_gt_label.write("\n") + # else: + # writtenTest = True + # test_file.write(steps_seq) + # test_file.write("\n") + # # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + # test_label.write(label) + # test_label.write("\n") + # # testr_label.write(str(correctness)) + # testr_label.write(strat_correct) + # testr_label.write("\n") + # test_info.write(info) + # test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + train_gt_label.write("\n") + # if writtenTest: + # writtenTest = False + # test_file.write("\n") + # test_info.write("\n") + # test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + train_gt_label.close() + + # test_file.close() + # test_info.close() + # test_label.close() + # testr_label.close() + # test_gt_label.close() + + +def prepare_finetuning_correctness_files(data_processor, options): + ''' + Ongoing research. Student strategy learning/predicting. 
+ FinalAnswer step + Correct: 1 , correctness of final strategy > 0.75 + Incorrect: 0 , else < 0.75 + ''' + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if "ratio_proportion_change3" == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + for prob, prob_groups in student_groups.groupby("Problem Name"): + + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups["Step Name"])) + unique_steps_len = len(set([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not(s in options.opt_step2)])) + if unique_steps_len < 4: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 1800: + time_stamps_list.add(time_stamps[i+1]) + + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + final_correct = 0 + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + auto_complete = True + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + if step == "FinalAnswer": + final_correct += 1 + unique_steps_len = len(set([s for s in step_names_token if not (s in options.opt_step1) and not(s in options.opt_step2)])) + # 4 and more in sequence + if step_names_token and unique_steps_len > 4: + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") 
+ elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if final_correct == 1: + label_opt = "1" + + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), + "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), + f"{1 if means_and_extremes else 0}"]) + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(label_opt) + + # overall_data.append('') + # overall_labels.append('') + + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/2) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + # writtenTrain = False + # writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + else: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() 
+ test_info.close() + test_label.close() + +def prepare_finetuning_correctness_files_old(data_processor, opts): + ''' + Ongoing research. Student strategy learning/predicting. + Correct, 1: correctness of final strategy > 0.75 + Incorrect, 0: else < 0.75 + ''' + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test"): + if v: + f_path = v.split("/") + f_path = f_path[0]+"/"+f_path[1]+"/fa_correctness/"+f_path[2] + # f_path = f_path[0]+"/"+f_path[1]+"/check2/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + # trainr_label = open(options.trainr_label_path, "w") + # train_gt_label = open(options.train_gt_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + # testr_label = open(options.testr_label_path, "w") + # test_gt_label = open(options.test_gt_label_path, "w") + ws = "_".join(options.workspace_name.split("_")[:-1]) + print("Workspace: ", ws) + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + if ws == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + # if len(prob_list) < 3: + # continue + +# first_prob_list = prob_list[:3] + # last_prob_list = prob_list[-3:] +# # print(len(first_prob_list), len(last_prob_list)) + +# final_prob_list = first_prob_list + last_prob_list + # print(len(prob_list), 
len(final_prob_list)) #, final_prob_list) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only +# if not prob in last_prob_list: +# continue + # print(options.final_step in list(prob_groups["Step Name"])) + # if not (options.final_step in list(prob_groups["Step Name"])): + # continue + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + # finals = len(options.final_step) + + + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + # if finals == 0: + # totals += 1 + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (step in 
options.final_step):# or totals > 0: + out = out.split(":") + totals = len(out) + # print(totals) + for ind in error_ind: + if ind in out: + errors +=1 + + # if finals: + # totals = finals + # 4 and more in sequence + if step_names_token and totals>0: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + label_opt = "0" + if options.opt_step1: + all_opt1 = all(any(opt in step for step in step_names_token) for opt in options.opt_step1) + any_opt1 = any(any(opt in step for step in step_names_token) for opt in options.opt_step1[1:]) + + if any_opt1: + label_opt = "2" + if all_opt1: + label_opt = "1" + + + if options.opt_step2: + all_opt2 = all(any(opt in step for step in step_names_token) for opt in options.opt_step2) + any_opt2 = any(any(opt in step for step in step_names_token) for opt in options.opt_step2[1:]) + if any_opt2: + label_opt = "4" + if all_opt2: + label_opt = "3" + if any_opt1 and any_opt2: + label_opt = "5" + if any_opt1 and all_opt2: + label_opt = "6" + if all_opt1 and any_opt2: + label_opt = "7" + if all_opt1 and all_opt2: + label_opt = "8" + + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # if not means_and_extremes and label_opt == "2": + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)), str(correctness), f"{1 if means_and_extremes else 0}"]) + + overall_data.append(["\t".join(step_names_token), label_opt, info]) + overall_labels.append(strat_correct) + + overall_data.append('') + overall_labels.append('') + + overall_labels = np.array(overall_labels, 
dtype=str) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + per = 0.20 + zeros_instances_size = int(per * len(indices_of_zeros)) + ones_instances_size = int(per * len(indices_of_ones)) + + sample_size = min(zeros_instances_size, ones_instances_size) + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + writtenTrain = False + writtenTest = False + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + label_opt = all_data[1] + info = all_data[2] + # me_opt = all_data[3] + + if index in sampled_instances: + writtenTrain = True + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + # trainr_label.write(label_opt) + # trainr_label.write("\n") + train_info.write(info) + train_info.write("\n") + # train_gt_label.write(me_opt) + # train_gt_label.write("\n") + else: + writtenTest = True + test_file.write(steps_seq) + test_file.write("\n") + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + test_label.write(label) + test_label.write("\n") + # testr_label.write(str(correctness)) + # testr_label.write(label_opt) + # testr_label.write("\n") + test_info.write(info) + test_info.write("\n") + # test_gt_label.write(me_opt) + # test_gt_label.write("\n") + else: + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + writtenTrain = False + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + # trainr_label.write("\n") + # train_gt_label.write("\n") + if writtenTest: + writtenTest = False + test_file.write("\n") + test_info.write("\n") + test_label.write("\n") + # testr_label.write("\n") + # test_gt_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + 
# trainr_label.close() + # train_gt_label.close() + + test_file.close() + test_info.close() + test_label.close() + # testr_label.close() + # test_gt_label.close() + +def prepare_finetuning_correctness_aaai_files(data_processor, opts): + ''' + Ongoing research. Student strategy learning/predicting. + Correct, 1: correctness of final strategy > 0.75 + Incorrect, 0: else < 0.75 + ''' + options = copy.deepcopy(opts) + for k,v in vars(opts).items(): + if k.startswith("train") or k.startswith("test") or k.startswith("val"): + if v: + f_path = v.split("/") + # f_path = f_path[0]+"/"+f_path[1]+"/correctness/"+f_path[2] + f_path = f_path[0]+"/"+f_path[1]+"/aaai/"+f_path[2] + setattr(options, f"{k}", f_path) + print(f"options.{k} : {getattr(options, f'{k}')}") + + # steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb")) + chunk_iterator = data_processor.load_file_iterator() + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + + val_file = open(options.val_file_path, "w") + val_info = open(options.val_info_path, "w") + val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + high_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_high_performers.pkl", "rb")) + mid_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_mid_performers.pkl", "rb")) + low_performer = pickle.load(open(f"{options.workspace_name}/aaai/change3_low_performers.pkl", "rb")) + prob_sel_list = pickle.load(open(f"{options.workspace_name}/aaai/change3_problem_list.pkl", "rb")) + + ws = "_".join(options.workspace_name.split("_")[:-1]) + + print(ws, len(high_performer), len(mid_performer), len(low_performer), len(prob_sel_list)) + overall_data = [] + overall_labels = [] + for chunk_data in chunk_iterator: + for 
section, section_groups in chunk_data.groupby("Level (Workspace Id)"): + # if options.workspace_name == section: + if ws == section: + for student, student_groups in section_groups.groupby("Anon Student Id"): + if student in high_performer or student in mid_performer or student in low_performer: + student_groups.sort_values(by="Time") + prob_list = list(pd.unique(student_groups["Problem Name"])) + + for prob, prob_groups in student_groups.groupby("Problem Name"): + # For first 3 and last 3 only + if not prob in prob_sel_list: + continue + + step_names_token = [] + + time_stamps = list(prob_groups["Time"]) + time_stamps_list = set() + for i in range(len(time_stamps)-1): + if (time_stamps[i+1] - time_stamps[i]) < 2000: + time_stamps_list.add(time_stamps[i+1]) + + progress = "" + outcome = [] + help_level = [] + auto_complete = False + means_and_extremes = False + totals = 0 + + for index, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level','CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows(): + step = row["Step Name"] + etalon = row["CF (Etalon)"] + progress = row["CF (Workspace Progress Status)"] + if not pd.isna(step): + if step in options.opt_step1: + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + # break + except Exception as e: + pass + if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list: + # if row["Time"] in time_stamps_list: + auto_complete = True + # print(row) + continue + # if not step_names_token or step != step_names_token[-1]: + # step_names_token.append(step) + + if not step_names_token or step != step_names_token[-1]: + step_names_token.append(step) + # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + outcome.append(row['Outcome']) + help_level.append(str(row["Help Level"])) + + else: + outcome[-1] = outcome[-1]+":"+row['Outcome'] + help_level[-1] = help_level[-1]+":"+str(row['Help Level']) + + error_ind = 
['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + errors = 0 + for step, out in zip(step_names_token, outcome): + if (step in options.final_step): + out = out.split(":") + totals = len(out) + # print(totals) + for ind in error_ind: + if ind in out: + errors +=1 + + # 4 and more in sequence + if step_names_token and totals>0: # and len(step_names_token) > 3 + + where_opt = [] + for stp in step_names_token: + if stp in options.opt_step1: + where_opt.append("1") + elif stp in options.opt_step2: + where_opt.append("2") + else: + where_opt.append("0") + + + + correctness = 1 - errors/totals + strat_correct = "0" + if correctness > 0.75: + strat_correct = "1" + + # if not means_and_extremes and label_opt == "2": + # progress, problem name, student id, total steps length, outcome seq, help_level seq, encoding in steps length + info = ",".join([str(progress), str(correctness), f"{1 if means_and_extremes else 0}",str(prob), str(student), str(auto_complete), str(len(step_names_token)),"\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))]) + + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(strat_correct) + + # overall_data.append('') + # overall_labels.append('') + + overall_labels = np.array(overall_labels) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + if all_data: + steps_seq = all_data[0] + info = all_data[1] + student = info.split(",")[4] + + if student in high_performer: + train_file.write(steps_seq) + train_file.write("\n") + train_label.write(label) + train_label.write("\n") + train_info.write(info) + train_info.write("\n") + elif student in mid_performer: + val_file.write(steps_seq) + val_file.write("\n") + val_label.write(label) + val_label.write("\n") + val_info.write(info) + val_info.write("\n") + elif student in low_performer: + test_file.write(steps_seq) + test_file.write("\n") + test_label.write(label) + test_label.write("\n") + 
def prepare_finetuning_SL_files(data_processor, opts):
    """Build balanced train/test files for the strategy-learning (SL) task.

    Nine strategies are encoded as labels:
        UU;0  CU;1  PU;2  UC;3  UP;4  PP;5  PC;6  CP;7  CC;8

    Output paths are derived from ``opts`` by inserting an ``SL/`` folder
    segment into every train*/test* path.  For each (student, problem) pair a
    step sequence, a correctness flag, an info line and a strategy label are
    collected; 20% of each label class — truncated to the rarest class so the
    split is balanced — is sampled into train, the remainder goes to test.

    Parameters:
        data_processor: provides ``load_file_iterator()`` yielding DataFrame
            chunks of the interaction log.
        opts: argparse-style namespace with file paths, ``workspace_name``,
            ``opt_step1``/``opt_step2``/``final_step`` and ``dataset_folder``.
    Returns: None (writes the train/test files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0] + "/" + f_path[1] + "/SL/" + f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was opened and leaked); value is unused
    # but the load is kept so a missing file still fails fast here.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True returned a new
                    # frame that was discarded, so rows were NOT time-ordered
                    # (the pretraining variant correctly uses inplace=True).
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Rows whose timestamp follows the previous one by less
                        # than 2000 units are treated as auto-completed fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Etalon parsing as float-but-not-int marks the
                                    # Means-and-Extremes problem variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    # Repeated step: fold outcome/help level into the last entry.
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            # Strategy label: 1/2 = all/any of opt_step1 used,
                            # 3/4 = all/any of opt_step2, 5-8 = combinations.
                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"

                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete),
                                             str(len(step_names_token)), "\t".join(map(str, outcome)),
                                             "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             str(correctness)])
                            overall_data.append(["\t".join(step_names_token), strat_correct, info,
                                                 f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_opt)

                    # Empty record marks the boundary between students in the output.
                    overall_data.append('')
                    overall_labels.append('')

    overall_labels = np.array(overall_labels)
    # Index lists per strategy label '0'..'8' (empty separator records match none).
    label_indices = [list(np.where(overall_labels == str(lbl))[0]) for lbl in range(9)]

    per = 0.20
    sample_size = min(int(per * len(idx)) for idx in label_indices)
    print(f"Sample size.... {sample_size}")
    sampled_instances = []
    for idx in label_indices:
        sampled_instances.extend(random.sample(idx, sample_size))
    # FIX: membership below was an O(len) list scan per record — use a set.
    sampled_instances = set(sampled_instances)

    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq, strat_correct, info, me_opt = all_data
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq + "\n")
                train_label.write(label + "\n")
                trainr_label.write(strat_correct + "\n")
                train_info.write(info + "\n")
                train_gt_label.write(me_opt + "\n")
            else:
                writtenTest = True
                test_file.write(steps_seq + "\n")
                test_label.write(label + "\n")
                testr_label.write(strat_correct + "\n")
                test_info.write(info + "\n")
                test_gt_label.write(me_opt + "\n")
        else:
            # Separator record: emit one blank line to whichever split
            # received the preceding student's records.
            if writtenTrain:
                writtenTrain = False
                for f in (train_file, train_info, train_label, trainr_label, train_gt_label):
                    f.write("\n")
            if writtenTest:
                writtenTest = False
                for f in (test_file, test_info, test_label, testr_label, test_gt_label):
                    f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label, train_gt_label,
              test_file, test_info, test_label, testr_label, test_gt_label):
        f.close()
def prepare_finetuning_effectiveness_files(data_processor, opts):
    """Build balanced train/test files for the strategy-effectiveness task.

    Strategy notation (UU;0 CU;1 PU;2 UC;3 UP;4 PP;5 PC;6 CP;7 CC;8).
    A record is a positive instance (label "1") when:
      * strategy in {UU, CU, PU} and ground truth is ER and answer correct, or
      * strategy in {UU, UC, UP} and ground truth is ME and answer correct;
    otherwise it is negative ("0").

    Output paths are derived from ``opts`` by inserting an ``effectiveness/``
    folder segment.  Equal numbers of positive and negative records (20% of
    the rarer class) go to train; everything else goes to test.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = v.split("/")
                f_path = f_path[0] + "/" + f_path[1] + "/effectiveness/" + f_path[2]
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")
    train_gt_label = open(options.train_gt_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")
    test_gt_label = open(options.test_gt_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            strat_correct = "0"
                            if correctness > 0.75:
                                strat_correct = "1"

                            # Effective iff the strategy matches the ground-truth
                            # variant (ER vs ME) and the attempt was correct.
                            label_effectiveness = "0"
                            if label_opt in ["0", "1", "2"] and not means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"
                            elif label_opt in ["0", "3", "4"] and means_and_extremes and strat_correct == "1":
                                label_effectiveness = "1"

                            # progress, problem name, student id, auto-complete flag, steps length,
                            # outcome seq, help_level seq, opt-step encoding, correctness, strategy, GT
                            info = ",".join([str(progress), str(prob), str(student), str(auto_complete),
                                             str(len(step_names_token)), "\t".join(map(str, outcome)),
                                             "\t".join(map(str, help_level)), "\t".join(map(str, where_opt)),
                                             str(correctness), label_opt, f"{1 if means_and_extremes else 0}"])

                            overall_data.append(["\t".join(step_names_token), strat_correct, info,
                                                 f"{1 if means_and_extremes else 0}"])
                            overall_labels.append(label_effectiveness)

                    # Empty record marks the boundary between students in the output.
                    overall_data.append('')
                    overall_labels.append('')

    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])

    # Balanced train split: 20% of the rarer class from each class.
    per = 0.20
    sample_size = min(int(per * len(indices_of_zeros)), int(per * len(indices_of_ones)))
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # FIX: membership below was an O(len) list scan per record — use a set.
    sampled_instances = set(sampled_instances)

    writtenTrain = False
    writtenTest = False
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):
        if all_data:
            steps_seq, strat_correct, info, me_opt = all_data
            if index in sampled_instances:
                writtenTrain = True
                train_file.write(steps_seq + "\n")
                train_label.write(label + "\n")
                trainr_label.write(strat_correct + "\n")
                train_info.write(info + "\n")
                train_gt_label.write(me_opt + "\n")
            else:
                writtenTest = True
                test_file.write(steps_seq + "\n")
                test_label.write(label + "\n")
                testr_label.write(strat_correct + "\n")
                test_info.write(info + "\n")
                test_gt_label.write(me_opt + "\n")
        else:
            # Separator record: blank line to whichever split got the
            # preceding student's records.
            if writtenTrain:
                writtenTrain = False
                for f in (train_file, train_info, train_label, trainr_label, train_gt_label):
                    f.write("\n")
            if writtenTest:
                writtenTest = False
                for f in (test_file, test_info, test_label, testr_label, test_gt_label):
                    f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label, train_gt_label,
              test_file, test_info, test_label, testr_label, test_gt_label):
        f.close()
def prepare_attn_test_files(data_processor, opts):
    """Write attention-analysis sequences split by the ``options.code`` filter.

    A record goes to the *train* files when it passes the code filter:
      * "full"     — everything (no test files are opened/written),
      * "gt"       — ground truth is Equivalent-Ratios (not Means-and-Extremes),
      * "correct"  — correctness above 0.75,
      * "progress" — workspace progress status is "GRADUATED";
    all remaining records go to the *test* files.  Records with strategy
    label "0" are dropped from the train side.

    Output paths get a ``/<code>/`` folder segment; the folder is created
    under the workspace directory if missing.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)

    if options.code:
        new_folder = f"{options.workspace_name}/{options.code}"
        # FIX: exists-check + makedirs was racy; exist_ok makes it atomic.
        os.makedirs(new_folder, exist_ok=True)

    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = (f"/{options.code}/").join(v.split("/"))
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")

    if options.code != "full":
        test_file = open(options.test_file_path, "w")
        test_info = open(options.test_info_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        means_and_extremes = False
                        finals = len(options.final_step)
                        totals = 0

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    if finals == 0:
                                        totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        errors = 0
                        for step, out in zip(step_names_token, outcome):
                            if (finals and step in options.final_step) or totals > 0:
                                out = out.split(":")
                                if any(any(ind in o for o in out) for ind in error_ind):
                                    errors += 1

                        if finals:
                            totals = finals
                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            correctness = 1 - errors / totals
                            opt_correct = "0"
                            if correctness > 0.75:
                                opt_correct = "1"

                            # Value unused, but kept so the module-level RNG
                            # stream stays identical for subsequent callers.
                            proba = random.random()

                            if options.code == "full" or \
                               (options.code == "gt" and not means_and_extremes) or \
                               (options.code == "correct" and opt_correct == "1") or \
                               (options.code == "progress" and progress == "GRADUATED"):
                                if label_opt == "0":
                                    continue
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                # progress, problem name, student id, auto-complete flag, steps length,
                                # outcome seq, help_level seq, opt-step encoding, correctness, GT, strategy
                                train_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                           str(len(step_names_token)),
                                                           "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                           "\t".join(map(str, where_opt)),
                                                           str(correctness), f"{1 if means_and_extremes else 0}", label_opt]))
                                train_info.write("\n")
                            else:
                                # Only reachable when code != "full", so the test
                                # handles are guaranteed to exist here.
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                          str(len(step_names_token)),
                                                          "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                          "\t".join(map(str, where_opt)),
                                                          str(correctness), f"{1 if means_and_extremes else 0}", label_opt]))
                                test_info.write("\n")

    train_file.close()
    train_info.close()

    if options.code != "full":
        test_file.close()
        test_info.close()
def prepare_finetuning_future_files(data_processor, opts):
    """Split sequences by ground-truth variant for the "future" finetuning task.

    Equivalent-Ratios problems (etalon parses as int) go to the train files;
    Means-and-Extremes problems (etalon parses only as float) go to the test
    files.  Each record carries the strategy label and a quartile-binned
    correctness label; a blank line separates consecutive students.

    NOTE(review): the path rewrite inserts "effectiveness", not "future" —
    this looks copy-pasted from prepare_finetuning_effectiveness_files and
    makes both functions write into the same folder; confirm before changing.
    Returns: None (writes files as a side effect).
    """
    options = copy.deepcopy(opts)
    for k, v in vars(opts).items():
        if k.startswith("train") or k.startswith("test"):
            if v:
                f_path = ("/effectiveness/").join(v.split("/"))
                setattr(options, f"{k}", f_path)
                print(f"options.{k} : {getattr(options, f'{k}')}")

    # FIX: close the pickle handle (was leaked); value unused but load kept.
    with open(f"{options.dataset_folder}unique_steps_list.pkl", "rb") as fp:
        steps = pickle.load(fp)
    chunk_iterator = data_processor.load_file_iterator()

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")
    trainr_label = open(options.trainr_label_path, "w")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")
    testr_label = open(options.testr_label_path, "w")

    for chunk_data in chunk_iterator:
        for section, section_groups in chunk_data.groupby("Level (Workspace Id)"):
            if options.workspace_name == section:
                for student, student_groups in section_groups.groupby("Anon Student Id"):
                    writtenTrain = False
                    writtenTest = False

                    # FIX: sort_values() without inplace=True was a discarded no-op.
                    student_groups.sort_values(by="Time", inplace=True)
                    prob_list = list(pd.unique(student_groups["Problem Name"]))

                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        step_names_token = []

                        # Timestamps < 2000 units after their predecessor mark auto-fills.
                        time_stamps = list(prob_groups["Time"])
                        time_stamps_list = set()
                        for i in range(len(time_stamps) - 1):
                            if (time_stamps[i + 1] - time_stamps[i]) < 2000:
                                time_stamps_list.add(time_stamps[i + 1])

                        progress = ""
                        outcome = []
                        help_level = []
                        auto_complete = False
                        errors = 0
                        totals = 0
                        means_and_extremes = False

                        for _, row in prob_groups[['Time', 'Step Name', 'Outcome', 'Help Level',
                                                   'CF (Workspace Progress Status)', 'CF (Etalon)']].iterrows():
                            step = row["Step Name"]
                            etalon = row["CF (Etalon)"]
                            progress = row["CF (Workspace Progress Status)"]
                            if not pd.isna(step):
                                if step in options.opt_step1:
                                    # Float-but-not-int etalon => Means-and-Extremes variant.
                                    try:
                                        etalon = int(etalon)
                                    except Exception:
                                        try:
                                            etalon = float(etalon)
                                            means_and_extremes = True
                                        except Exception:
                                            pass
                                if (step in options.opt_step1 or step in options.opt_step2) and row["Time"] in time_stamps_list:
                                    auto_complete = True
                                    continue

                                if not step_names_token or step != step_names_token[-1]:
                                    step_names_token.append(step)
                                    # Outcome domain: ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                                    outcome.append(row['Outcome'])
                                    help_level.append(str(row["Help Level"]))
                                    totals += 1
                                else:
                                    outcome[-1] = outcome[-1] + ":" + row['Outcome']
                                    help_level[-1] = help_level[-1] + ":" + str(row['Help Level'])

                        # Unlike the SL/effectiveness variants, errors are
                        # counted over ALL steps, not just final steps.
                        error_ind = ['BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                        for out in outcome:
                            out = out.split(":")
                            if any(any(ind in o for o in out) for ind in error_ind):
                                errors += 1

                        if step_names_token:
                            where_opt = []
                            for stp in step_names_token:
                                if stp in options.opt_step1:
                                    where_opt.append("1")
                                elif stp in options.opt_step2:
                                    where_opt.append("2")
                                else:
                                    where_opt.append("0")

                            label_opt = "0"
                            if options.opt_step1:
                                all_opt1 = all(any(opt in s for s in step_names_token) for opt in options.opt_step1)
                                any_opt1 = any(any(opt in s for s in step_names_token) for opt in options.opt_step1[1:])
                                if any_opt1:
                                    label_opt = "2"
                                if all_opt1:
                                    label_opt = "1"
                            if options.opt_step2:
                                all_opt2 = all(any(opt in s for s in step_names_token) for opt in options.opt_step2)
                                any_opt2 = any(any(opt in s for s in step_names_token) for opt in options.opt_step2[1:])
                                if any_opt2:
                                    label_opt = "4"
                                if all_opt2:
                                    label_opt = "3"
                                if any_opt1 and any_opt2:
                                    label_opt = "5"
                                if any_opt1 and all_opt2:
                                    label_opt = "6"
                                if all_opt1 and any_opt2:
                                    label_opt = "7"
                                if all_opt1 and all_opt2:
                                    label_opt = "8"

                            # Quartile-binned correctness label "0".."3".
                            correctness = 1 - errors / totals
                            if correctness < 0.25:
                                opt_correct = "0"
                            elif correctness < 0.5:
                                opt_correct = "1"
                            elif correctness < 0.75:
                                opt_correct = "2"
                            else:
                                opt_correct = "3"

                            # Value unused, but kept so the module-level RNG
                            # stream stays identical for subsequent callers.
                            proba = random.random()

                            if not means_and_extremes:
                                writtenTrain = True
                                train_file.write("\t".join(step_names_token))
                                train_file.write("\n")
                                train_label.write(label_opt)
                                train_label.write("\n")
                                trainr_label.write(opt_correct)
                                trainr_label.write("\n")
                                # progress, problem name, student id, auto-complete flag,
                                # steps length, outcome seq, help_level seq, opt-step encoding
                                train_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                           str(len(step_names_token)),
                                                           "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                           "\t".join(map(str, where_opt))]))
                                train_info.write("\n")

                            if means_and_extremes:
                                writtenTest = True
                                test_file.write("\t".join(step_names_token))
                                test_file.write("\n")
                                test_label.write(label_opt)
                                test_label.write("\n")
                                testr_label.write(opt_correct)
                                testr_label.write("\n")
                                test_info.write(",".join([str(progress), str(prob), str(student), str(auto_complete),
                                                          str(len(step_names_token)),
                                                          "\t".join(map(str, outcome)), "\t".join(map(str, help_level)),
                                                          "\t".join(map(str, where_opt))]))
                                test_info.write("\n")

                    # Blank line marks the boundary between students in each
                    # split that received records for this student.
                    if writtenTrain:
                        for f in (train_file, train_info, train_label, trainr_label):
                            f.write("\n")
                    if writtenTest:
                        for f in (test_file, test_info, test_label, testr_label):
                            f.write("\n")

    for f in (train_file, train_info, train_label, trainr_label,
              test_file, test_info, test_label, testr_label):
        f.close()
seq, help_level seq, encoding in steps length + test_label.write(label_opt) + test_label.write("\n") + # testr_label.write(str(correctness)) + testr_label.write(opt_correct) + testr_label.write("\n") + test_info.write(",".join([str(progress),str(prob), str(student), str(auto_complete), str(len(step_names_token)), + "\t".join(map(str, outcome)), "\t".join(map(str, help_level)), "\t".join(map(str, where_opt))])) + test_info.write("\n") + # Indicates actions of next student + # Indicates next problem + if writtenTrain: + train_file.write("\n") + train_info.write("\n") + train_label.write("\n") + trainr_label.write("\n") + if writtenTest: + test_file.write("\n") + test_info.write("\n") + test_label.write("\n") + testr_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + trainr_label.close() + + test_file.close() + test_info.close() + test_label.close() + testr_label.close() + +def prepare_school_coded_finetuning_partial_seq_files(data_processor, options): + ''' + Ongoing research. 
+ FinalAnswer step correctness + Correct: 0 if attempt at step>1 + 1 if attempt at step==1 + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + for prob, prob_groups in student_groups.groupby("Problem Name"): + + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + correctness = "0" + opt_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in options.opt_step1 or step in 
options.opt_step2: + new_step = step + opt_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if step != "FinalAnswer": + step_names_token.append(new_step) + else: + step_names_token.append("FinalAnswer") + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + if step == "FinalAnswer" and opt_used: + if attempt == 1 and outcome == "OK": + correctness = "1" + else: + correctness = "0" + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + overall_data.append(["\t".join(step_names_token), info]) + overall_labels.append(correctness) +# proba = random.random() +# # if prob in first_prob_list: +# if proba <= 0.8: +# train_file.write("\t".join(step_names_token)) +# train_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# train_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, 
original_steps_actions_attempts_help_levels_outcomes))])) +# train_info.write("\n") + +# elif proba > 0.9: +# # elif prob in last_prob_list: +# test_file.write("\t".join(step_names_token)) +# test_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# test_info.write("\n") + +# else: +# val_file.write("\t".join(step_names_token)) +# val_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# val_info.write("\n") + # break + # break + # break + # break + # break + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/2) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + + indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ] + indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] + + balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) + print(f"balanced_test: {balanced_test}") + test_sampled_instances = 
random.sample(indices_of_zeros, balanced_test) + test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + elif index in test_sampled_instances: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_opts_files(data_processor, options): + ''' + Ongoing research. 
+ Labels: + 0 - Opt 1 + 1 - Opt 2 + 2 - Both Opt + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) + # prob_list = prob_list[-int(len(prob_list)/2):] + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: + # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in 
options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + if (not opt1_used) and (not opt2_used): + continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + overall_data.append(["\t".join(step_names_token), info]) + label = None + if opt1_used and opt2_used: + label = "2" + if (not opt1_used) and opt2_used: + label = "1" + if opt1_used and (not opt2_used): + label = "0" + print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") + overall_labels.append(label) +# proba = random.random() +# # if prob in first_prob_list: +# if proba <= 0.8: +# train_file.write("\t".join(step_names_token)) +# train_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# train_info.write(",".join([str(school), 
"\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# train_info.write("\n") + +# elif proba > 0.9: +# # elif prob in last_prob_list: +# test_file.write("\t".join(step_names_token)) +# test_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# test_info.write("\n") + +# else: +# val_file.write("\t".join(step_names_token)) +# val_file.write("\n") +# # school, class, student id, progress, problem name, scenario, +# # prefered ER or ME, total steps length, +# # original seq-action-attempt-help_level-outcome +# val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), +# f"{1 if means_and_extremes else 0}", str(len(step_names_token)), +# "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) +# val_info.write("\n") + # break + # break + # break + # break + # break + overall_labels = np.array(overall_labels) + indices_of_zeros = list(np.where(overall_labels == '0')[0]) + indices_of_ones = list(np.where(overall_labels == '1')[0]) + indices_of_twos = list(np.where(overall_labels == '2')[0]) + + train_len = int(len(overall_labels) * 0.10) + sample_size = int(train_len/3) + print(f"sample_size: {sample_size}") + sampled_instances = random.sample(indices_of_zeros, sample_size) + sampled_instances.extend(random.sample(indices_of_ones, sample_size)) + sampled_instances.extend(random.sample(indices_of_twos, sample_size)) + + indices_of_zeros = 
[i for i in indices_of_zeros if not i in sampled_instances ] + indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] + indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] + + balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) + print(f"balanced_test: {balanced_test}") + test_sampled_instances = random.sample(indices_of_zeros, balanced_test) + test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) + test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) + + for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + + steps_seq = all_data[0] + info = all_data[1] + + if index in sampled_instances: + train_file.write(steps_seq) + train_file.write("\n") + + train_info.write(info) + train_info.write("\n") + + train_label.write(label) + train_label.write("\n") + elif index in test_sampled_instances: + # proba = random.random() + # if proba <0.5: + test_file.write(steps_seq) + test_file.write("\n") + + test_info.write(info) + test_info.write("\n") + + test_label.write(label) + test_label.write("\n") +# else: +# val_file.write(steps_seq) +# val_file.write("\n") + +# val_info.write(info) +# val_info.write("\n") + +# val_label.write(label) +# val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + # val_file.close() + # val_info.close() + # val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_opts_intentional_files(data_processor, options): + ''' + Ongoing research. 
+ Labels: + 0 - Opt 1 + 1 - Opt 2 + 2 - Both Opt + ''' + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + + val_file = open(options.val_file_path, "w") + val_info = open(options.val_info_path, "w") + val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + # overall_data = [] + # overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + prob_list= list(pd.unique(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"])) + # prob_list = prob_list[-int(len(prob_list)/2):] + if len(prob_list) == 0: + continue + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: + # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. 
+ if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + # print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in 
options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + # if (not opt1_used) and (not opt2_used): + # continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))]) + # overall_data.append(["\t".join(step_names_token), info]) + # label = None + # if opt1_used and opt2_used: + # label = "2" + # if (not opt1_used) and opt2_used: + # label = "1" + # if opt1_used and (not opt2_used): + # label = "0" + # print(f"opt1_used: {opt1_used}, opt2_used: {opt2_used} label : {label}") + # overall_labels.append(label) + + proba = random.random() + # if prob in first_prob_list: + if proba <= 0.8: + train_file.write("\t".join(step_names_token)) + train_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + train_info.write(",".join([str(school), 
"\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + train_info.write("\n") + + elif proba > 0.9: + # elif prob in last_prob_list: + test_file.write("\t".join(step_names_token)) + test_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + test_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + test_info.write("\n") + + else: + val_file.write("\t".join(step_names_token)) + val_file.write("\n") + # school, class, student id, progress, problem name, scenario, + # prefered ER or ME, total steps length, + # original seq-action-attempt-help_level-outcome + val_info.write(",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario), + f"{1 if means_and_extremes else 0}", str(len(step_names_token)), + "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes))])) + val_info.write("\n") + # break + # break + # break + # break + # break +# overall_labels = np.array(overall_labels) +# indices_of_zeros = list(np.where(overall_labels == '0')[0]) +# indices_of_ones = list(np.where(overall_labels == '1')[0]) +# indices_of_twos = list(np.where(overall_labels == '2')[0]) + +# train_len = int(len(overall_labels) * 0.10) +# sample_size = int(train_len/3) +# print(f"sample_size: {sample_size}") +# sampled_instances = random.sample(indices_of_zeros, sample_size) +# sampled_instances.extend(random.sample(indices_of_ones, sample_size)) +# sampled_instances.extend(random.sample(indices_of_twos, sample_size)) + +# indices_of_zeros = [i for i in 
indices_of_zeros if not i in sampled_instances ] +# indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ] +# indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ] + +# balanced_test = min(len(indices_of_zeros), len(indices_of_ones), len(indices_of_twos)) +# print(f"balanced_test: {balanced_test}") +# test_sampled_instances = random.sample(indices_of_zeros, balanced_test) +# test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test)) +# test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test)) + +# for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)): + +# steps_seq = all_data[0] +# info = all_data[1] + +# if index in sampled_instances: +# train_file.write(steps_seq) +# train_file.write("\n") + +# train_info.write(info) +# train_info.write("\n") + +# train_label.write(label) +# train_label.write("\n") +# elif index in test_sampled_instances: +# # proba = random.random() +# # if proba <0.5: +# test_file.write(steps_seq) +# test_file.write("\n") + +# test_info.write(info) +# test_info.write("\n") + +# test_label.write(label) +# test_label.write("\n") +# # else: +# # val_file.write(steps_seq) +# # val_file.write("\n") + +# # val_info.write(info) +# # val_info.write("\n") + +# # val_label.write(label) +# # val_label.write("\n") + + + train_file.close() + train_info.close() + train_label.close() + + val_file.close() + val_info.close() + val_label.close() + + test_file.close() + test_info.close() + test_label.close() + +def prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options): + ''' + Ongoing research. 
+ FinalAnswer step correctness + Correctness after opts: + 0 if attempt at step>1 + 1 if attempt at step==1 + ''' + kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + kcs = [kc for kc in kcs if not pd.isna(kc)] + kcs = np.array(sorted(list(kcs))) + print(kcs, type(kcs)) + print(f"KCs: {kcs}") + chunk_iterator = data_processor.load_file_iterator(sep=",") + + train_file = open(options.train_file_path, "w") + train_info = open(options.train_info_path, "w") + train_label = open(options.train_label_path, "w") + +# val_file = open(options.val_file_path, "w") +# val_info = open(options.val_info_path, "w") +# val_label = open(options.val_label_path, "w") + + test_file = open(options.test_file_path, "w") + test_info = open(options.test_info_path, "w") + test_label = open(options.test_label_path, "w") + + overall_data = [] + overall_labels = [] + # kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb")) + # kcs = [kc if not pd.isna(kc) for kc in kcs] + for chunk_data in chunk_iterator: + for school, school_group in chunk_data.groupby('CF (Anon School Id)'): + if not options.school or school in options.school: + print(f"{school} : {school_group.shape}") + school_group = school_group[(school_group['CF (Is StepByStep)'] == False) & + (school_group['CF (Encounter)'] == 0) & + (school_group['CF (Is Review Mode)'] == -1) ] + print(f"{school} : {school_group.shape}") + # for class_id, class_group in school_groups.groupby('CF (Anon Class Id)'): + for student, student_groups in school_group.groupby("Anon Student Id"): + student_groups.sort_values(by="Time", inplace=True) + # prob_list = list(pd.unique(student_groups["Problem Name"])) + # prob_list= list(student_groups[student_groups["CF (Workspace Progress Status)"]=="GRADUATED"]["Problem Name"]) + # prob_list = prob_list[-int(len(prob_list)/2):] + for prob, prob_groups in student_groups.groupby("Problem Name"): + # if not prob in prob_list: 
+ # continue + actions = list(prob_groups["Action"]) + # A problem should be completed by a student clicking Done button. + if not "Done" in actions: + continue + unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"])) + unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)]) + if unique_steps_len < 4: + continue + unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])]) + if unique_opt_steps_len < 2: + continue + # print(unique_steps, unique_opt_steps_len) + class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"])) + step_names_token = [] + original_steps_actions_attempts_help_levels_outcomes = [] + original_steps = [] + means_and_extremes = False + opt1_used = False + opt2_used = False + final_after_opts = False + correctness = "0" + kcs_skills = [0 for i in kcs] + diff_skills = [0 for i in kcs] + finalanswer_skill = [0 for i in kcs] + for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)', + 'Outcome', 'Help Level', 'CF (Workspace Progress Status)', + 'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)', + 'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows(): + step = row["Step Name"] + action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done'] + attempt = row["Attempt At Step"] # number + outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE'] + help_level = row["Help Level"] # number + progress = row["CF (Workspace Progress Status)"] + scenario = row['CF (Problem Scenario Tags)'] + kc = row['KC Model(MATHia)'] + prev_skill = row['CF (Skill Previous p-Known)'] + curr_skill = row['CF (Skill New p-Known)'] + # print(kc, prev_skill) + if not pd.isna(step): + if step in options.opt_step1 and not means_and_extremes: + etalon = row["CF (Etalon)"] + 
if not pd.isna(etalon): + etalon = etalon.strip('{}') + key, value = etalon.split('=') + etalon = value + try: + etalon = int(etalon) + except Exception as e: + try: + etalon = float(etalon) + means_and_extremes = True + except Exception as e: + pass + if row['CF (Is Autofilled)'] == True: + continue + prev = step_names_token[-1] if step_names_token else "" + prev_step = step_names_token[-1].split("-")[0] if step_names_token else "" + + if not step_names_token or step != prev_step: + if step in options.opt_step1 or step in options.opt_step2: + new_step = step + if step in options.opt_step1[1:]: + opt1_used = True + elif step in options.opt_step2[2:]: + opt2_used = True + else: + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts: + final_after_opts = True + if outcome == "OK": + correctness = "1" + step_names_token.append(new_step) + + else: + if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"): + if action == "Attempt" and outcome != "OK": + new_step = step+"-2" + elif "Hint" in action: + new_step = step+"-1" + else: + new_step = step+"-0" + + if prev < new_step: + step_names_token[-1] = new_step + if not pd.isna(kc): + index = np.argwhere(kcs==kc).flatten()[0] + # print(index, type(index)) + kcs_skills[index] = prev_skill + diff_skills[index] = prev_skill - curr_skill + if step == "FinalAnswer": + finalanswer_skill[index] = prev_skill + + original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}") + original_steps.append(step) + if (not opt1_used) and (not opt2_used): + continue + unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)]) + if step_names_token and unique_steps_len > 4: + label = None + if opt1_used and opt2_used: + label = "2" 
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # Info record: school, class, student id, progress, problem name,
                            # scenario, preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome sequence, opt label,
                            # then per-KC skill, skill-delta and FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
# NOTE: a commented-out random 80/10/10 train/test/val writer (superseded by the
# label-balanced sampling below) was removed here for readability.
    # Label-balanced sampling over the collected instances.
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # indices_of_twos = list(np.where(overall_labels == '2')[0])

    # Train = 10% of all instances, half from each correctness class.
    # NOTE(review): random.sample raises ValueError if a class has fewer
    # instances than sample_size — assumes both classes are large enough.
    train_len = int(len(overall_labels) * 0.10)
    sample_size = int(train_len/2)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # sampled_instances.extend(random.sample(indices_of_twos, sample_size))

    # Remove the training picks, then build a balanced test set from the rest.
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
    # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

    # Write each sampled instance to its split's seq/info/label files.
    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")

            train_info.write(info)
            train_info.write("\n")

            train_label.write(label)
            train_label.write("\n")
        elif index in test_sampled_instances:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")

            test_info.write(info)
            test_info.write("\n")

            test_label.write(label)
            test_label.write("\n")
# NOTE: a commented-out "else" branch writing unsampled instances to the
# val_* files was removed here for readability.

    train_file.close()
    train_info.close()
    train_label.close()

    # val_file.close()
    # val_info.close()
    # val_label.close()

    test_file.close()
    test_info.close()
    test_label.close()

def prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
    Correctness after opts:
    0 if attempt at step>1
    1 if attempt at step==1

    Variant notes (vs. *_per_files): the train/test split is decided per
    student by a 50/50 coin flip, and per-KC skill deltas are computed
    ACROSS problems (this problem's prior p-Known minus the previous
    problem's) instead of within a single row.
    '''
    # Knowledge components for this workspace.
    # NOTE(review): hard-coded path — presumably should track
    # options.dataset_folder; confirm before reusing for other workspaces.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    # val_* outputs are not produced by this variant.

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    train_data = []
    train_labels = []

    test_data = []
    test_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            # Empty options.school means "all schools".
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    # Per-student 50/50 train/test assignment.
                    train = True
                    proba = random.random()
                    if proba < 0.5:
                        train = False
                    # Per-KC skill estimate carried over from the previous problem.
                    prev_kcs_skills = [0 for i in kcs]
                    for pi, (prob, prob_groups) in enumerate(student_groups.groupby("Problem Name")):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require >= 4 distinct non-optional, non-autofilled steps ...
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        # ... and >= 2 optional steps beyond each list's head entry.
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False
                        correctness = "0"
                        kcs_skills = [0 for i in kcs]
                        diff_skills = [0 for i in kcs]
                        finalanswer_skill = [0 for i in kcs]
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"] # number
                            outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"] # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            # print(kc, prev_skill)
                            if not pd.isna(step):
                                # Fractional etalon on an opt_step1 row flags the
                                # "means and extremes" (ME) strategy.
                                if step in options.opt_step1 and not means_and_extremes:
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                # Autofilled rows are tutor-generated: skip.
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim (no suffix).
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): [2:] here vs [1:] above — confirm intentional.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Suffix: -2 failed attempt, -1 hint, -0 otherwise.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                    # Label: first FinalAnswer after any optional step;
                                    # "1" only when that first outcome is OK.
                                    if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                        final_after_opts = True
                                        if outcome == "OK":
                                            correctness = "1"
                                    step_names_token.append(new_step)

                                else:
                                    # Repeat of the same step: upgrade the stored suffix
                                    # if the new interaction is "worse" (lexicographic
                                    # "X-0" < "X-1" < "X-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    index = np.argwhere(kcs==kc).flatten()[0]
                                    # print(index, type(index))
                                    kcs_skills[index] = prev_skill
                                    # Across-problem delta: this problem's prior p-Known
                                    # minus the previous problem's (0 for first problem).
                                    if pi != 0:
                                        diff_skills[index] = prev_skill - prev_kcs_skills[index]
                                    prev_kcs_skills[index] = prev_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[index] = prev_skill

                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Only keep problems where some optional step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # Opt-usage label: "0" = opt1 only, "1" = opt2 only, "2" = both.
                            # (At least one opt was used, so label is never left None.)
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # school, class, student id, progress, problem name, scenario,
                            # preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome, opt label,
                            # per-KC skill / skill-delta / FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            if train:
                                train_data.append(["\t".join(step_names_token), info])
                                train_labels.append(correctness)
                            else:
                                test_data.append(["\t".join(step_names_token), info])
                                test_labels.append(correctness)
# NOTE: commented-out random 80/10/10 writer and label-balanced sampling blocks
# (used by sibling variants) were removed here for readability.

    # Flush the per-student-split instances to the output files.
    for index, (all_data, label) in enumerate(zip(train_data, train_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        train_file.write(steps_seq)
        train_file.write("\n")

        train_info.write(info)
        train_info.write("\n")

        train_label.write(label)
        train_label.write("\n")
    train_file.close()
    train_info.close()
    train_label.close()

    for index, (all_data, label) in enumerate(zip(test_data, test_labels)):
        steps_seq = all_data[0]
        info = all_data[1]

        test_file.write(steps_seq)
        test_file.write("\n")

        test_info.write(info)
        test_info.write("\n")

        test_label.write(label)
        test_label.write("\n")
    test_file.close()
    test_info.close()
    test_label.close()

def prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options):
    '''
    Ongoing research.
    FinalAnswer step correctness
    Correctness after opts:
    0 if attempt at step>1
    1 if attempt at step==1

    Variant notes: takes options.per as the train fraction (per == 1 means
    "largest balanced train set"; per > 1 means an absolute per-class count),
    and appends the chosen split to the val_* files (opened in append mode).
    '''
    # Knowledge components for this workspace.
    # NOTE(review): hard-coded path — presumably should track
    # options.dataset_folder; confirm before reusing for other workspaces.
    kcs = pickle.load(open("dataset/CL_2223/ratio_proportion_change3_2223/unique_kcs_list.pkl", "rb"))
    kcs = [kc for kc in kcs if not pd.isna(kc)]
    kcs = np.array(sorted(list(kcs)))
    print(kcs, type(kcs))
    print(f"KCs: {kcs}")
    chunk_iterator = data_processor.load_file_iterator(sep=",")

    train_file = open(options.train_file_path, "w")
    train_info = open(options.train_info_path, "w")
    train_label = open(options.train_label_path, "w")

    # Append mode: val files accumulate across runs/percentage settings.
    val_file = open(options.val_file_path, "a")
    val_info = open(options.val_info_path, "a")
    val_label = open(options.val_label_path, "a")

    test_file = open(options.test_file_path, "w")
    test_info = open(options.test_info_path, "w")
    test_label = open(options.test_label_path, "w")

    overall_data = []
    overall_labels = []
    for chunk_data in chunk_iterator:
        for school, school_group in chunk_data.groupby('CF (Anon School Id)'):
            # Empty options.school means "all schools".
            if not options.school or school in options.school:
                print(f"{school} : {school_group.shape}")
                # Keep only first-encounter, non-step-by-step, non-review rows.
                school_group = school_group[(school_group['CF (Is StepByStep)'] == False) &
                                            (school_group['CF (Encounter)'] == 0) &
                                            (school_group['CF (Is Review Mode)'] == -1) ]
                print(f"{school} : {school_group.shape}")
                for student, student_groups in school_group.groupby("Anon Student Id"):
                    student_groups.sort_values(by="Time", inplace=True)
                    for prob, prob_groups in student_groups.groupby("Problem Name"):
                        actions = list(prob_groups["Action"])
                        # A problem should be completed by a student clicking Done button.
                        if not "Done" in actions:
                            continue
                        # Require >= 4 distinct non-optional, non-autofilled steps ...
                        unique_steps = list(pd.unique(prob_groups[prob_groups["CF (Is Autofilled)"] == False]["Step Name"]))
                        unique_steps_len = len([s for s in unique_steps if not pd.isna(s) and not (s in options.opt_step1) and not (s in options.opt_step2)])
                        if unique_steps_len < 4:
                            continue
                        # ... and >= 2 optional steps beyond each list's head entry.
                        unique_opt_steps_len = len([s for s in unique_steps if not pd.isna(s) and (s in options.opt_step1[1:] or s in options.opt_step2[1:])])
                        if unique_opt_steps_len < 2:
                            continue
                        class_id = list(pd.unique(prob_groups["CF (Anon Class Id)"]))
                        step_names_token = []
                        original_steps_actions_attempts_help_levels_outcomes = []
                        original_steps = []
                        means_and_extremes = False
                        opt1_used = False
                        opt2_used = False
                        final_after_opts = False
                        correctness = "0"
                        kcs_skills = [0 for i in kcs]
                        diff_skills = [0 for i in kcs]
                        finalanswer_skill = [0 for i in kcs]
                        for index, row in prob_groups[['Step Name', 'Action', 'Attempt At Step', 'CF (Is Autofilled)',
                                                       'Outcome', 'Help Level', 'CF (Workspace Progress Status)',
                                                       'CF (Skill Previous p-Known)', 'CF (Skill New p-Known)', 'KC Model(MATHia)',
                                                       'CF (Etalon)', 'CF (Problem Scenario Tags)']].iterrows():
                            step = row["Step Name"]
                            action = row["Action"] # ['Attempt', 'Hint Request', 'Hint Level Change', 'Done']
                            attempt = row["Attempt At Step"] # number
                            outcome = row["Outcome"] # ['OK', 'BUG', 'ERROR', 'INITIAL_HINT', 'HINT_LEVEL_CHANGE']
                            help_level = row["Help Level"] # number
                            progress = row["CF (Workspace Progress Status)"]
                            scenario = row['CF (Problem Scenario Tags)']
                            kc = row['KC Model(MATHia)']
                            prev_skill = row['CF (Skill Previous p-Known)']
                            curr_skill = row['CF (Skill New p-Known)']
                            # print(kc, prev_skill)
                            if not pd.isna(step):
                                # Fractional etalon on an opt_step1 row flags the
                                # "means and extremes" (ME) strategy.
                                if step in options.opt_step1 and not means_and_extremes:
                                    etalon = row["CF (Etalon)"]
                                    if not pd.isna(etalon):
                                        etalon = etalon.strip('{}')
                                        key, value = etalon.split('=')
                                        etalon = value
                                        try:
                                            etalon = int(etalon)
                                        except Exception as e:
                                            try:
                                                etalon = float(etalon)
                                                means_and_extremes = True
                                            except Exception as e:
                                                pass
                                # Autofilled rows are tutor-generated: skip.
                                if row['CF (Is Autofilled)'] == True:
                                    continue
                                prev = step_names_token[-1] if step_names_token else ""
                                prev_step = step_names_token[-1].split("-")[0] if step_names_token else ""

                                if not step_names_token or step != prev_step:
                                    if step in options.opt_step1 or step in options.opt_step2:
                                        # Optional steps are kept verbatim (no suffix).
                                        new_step = step
                                        if step in options.opt_step1[1:]:
                                            opt1_used = True
                                        # NOTE(review): [2:] here vs [1:] above — confirm intentional.
                                        elif step in options.opt_step2[2:]:
                                            opt2_used = True
                                    else:
                                        # Suffix: -2 failed attempt, -1 hint, -0 otherwise.
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                    # Label: first FinalAnswer after any optional step;
                                    # "1" only when that first outcome is OK.
                                    if step == "FinalAnswer" and (opt1_used or opt2_used) and not final_after_opts:
                                        final_after_opts = True
                                        if outcome == "OK":
                                            correctness = "1"
                                    step_names_token.append(new_step)

                                else:
                                    # Repeat of the same step: upgrade the stored suffix
                                    # if worse (lexicographic "X-0" < "X-1" < "X-2").
                                    if not (step in options.opt_step1 or step in options.opt_step2 or step == "FinalAnswer"):
                                        if action == "Attempt" and outcome != "OK":
                                            new_step = step+"-2"
                                        elif "Hint" in action:
                                            new_step = step+"-1"
                                        else:
                                            new_step = step+"-0"

                                        if prev < new_step:
                                            step_names_token[-1] = new_step
                                if not pd.isna(kc):
                                    index = np.argwhere(kcs==kc).flatten()[0]
                                    # print(index, type(index))
                                    kcs_skills[index] = prev_skill
                                    # Within-row delta (prior p-Known minus new p-Known).
                                    diff_skills[index] = prev_skill - curr_skill
                                    if step == "FinalAnswer":
                                        finalanswer_skill[index] = prev_skill

                                original_steps_actions_attempts_help_levels_outcomes.append(f"{step}-{action}-{attempt}-{help_level}-{outcome}")
                                original_steps.append(step)
                        # Only keep problems where some optional step was used.
                        if (not opt1_used) and (not opt2_used):
                            continue
                        unique_steps_len = len([s for s in original_steps if not (s in options.opt_step1) and not(s in options.opt_step2)])
                        if step_names_token and unique_steps_len > 4:
                            # Opt-usage label: "0" = opt1 only, "1" = opt2 only, "2" = both.
                            label = None
                            if opt1_used and opt2_used:
                                label = "2"
                            if (not opt1_used) and opt2_used:
                                label = "1"
                            if opt1_used and (not opt2_used):
                                label = "0"
                            # school, class, student id, progress, problem name, scenario,
                            # preferred ER(0)/ME(1), token count, original
                            # step-action-attempt-help_level-outcome, opt label,
                            # per-KC skill / skill-delta / FinalAnswer-skill vectors.
                            info = ",".join([str(school), "\t".join(class_id), str(student), str(progress), str(prob), str(scenario),
                                            f"{1 if means_and_extremes else 0}", str(len(step_names_token)),
                                            "\t".join(map(str, original_steps_actions_attempts_help_levels_outcomes)), label,
                                            "\t".join(map(str, kcs_skills)), "\t".join(map(str, diff_skills)),
                                            "\t".join(map(str, finalanswer_skill))])#str(finalanswer_skill)])
                            overall_data.append(["\t".join(step_names_token), info])
                            overall_labels.append(correctness)
# NOTE: a commented-out random 80/10/10 train/test/val writer was removed here
# for readability.
    # Label-balanced sampling: train fraction is options.per of all instances.
    overall_labels = np.array(overall_labels)
    indices_of_zeros = list(np.where(overall_labels == '0')[0])
    indices_of_ones = list(np.where(overall_labels == '1')[0])
    # indices_of_twos = list(np.where(overall_labels == '2')[0])

    # train_len = int(len(overall_labels) * 0.10)
    train_len = int(len(overall_labels) * float(options.per))

    # per == 1  -> largest balanced train set;
    # per > 1   -> absolute per-class sample count;
    # otherwise -> half of the per-fraction from each class.
    sample_size = int(train_len/2)
    if float(options.per) == 1:
        sample_size = min(len(indices_of_zeros), len(indices_of_ones))
    elif float(options.per) > 1:
        sample_size = int(options.per)
    print(f"sample_size: {sample_size}")
    sampled_instances = random.sample(indices_of_zeros, sample_size)
    sampled_instances.extend(random.sample(indices_of_ones, sample_size))
    # sampled_instances.extend(random.sample(indices_of_twos, sample_size))

    # Remove train picks, then build a balanced test set from the remainder.
    indices_of_zeros = [i for i in indices_of_zeros if not i in sampled_instances ]
    indices_of_ones = [i for i in indices_of_ones if not i in sampled_instances ]
    # indices_of_twos = [i for i in indices_of_twos if not i in sampled_instances ]

    balanced_test = min(len(indices_of_zeros), len(indices_of_ones)) #, len(indices_of_twos))
    print(f"balanced_test: {balanced_test}")
    test_sampled_instances = random.sample(indices_of_zeros, balanced_test)
    test_sampled_instances.extend(random.sample(indices_of_ones, balanced_test))
    # test_sampled_instances.extend(random.sample(indices_of_twos, balanced_test))

    for index, (all_data, label) in enumerate(zip(overall_data, overall_labels)):

        steps_seq = all_data[0]
        info = all_data[1]

        if index in sampled_instances:
            train_file.write(steps_seq)
            train_file.write("\n")

            train_info.write(info)
            train_info.write("\n")

            train_label.write(label)
            train_label.write("\n")
            # When per == 1 the train picks are mirrored into val; otherwise
            # the test picks are (see below).
            if float(options.per) == 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")

                val_info.write(info)
                val_info.write("\n")

                val_label.write(label)
                val_label.write("\n")

        elif index in test_sampled_instances:
            # proba = random.random()
            # if proba <0.5:
            test_file.write(steps_seq)
            test_file.write("\n")

            test_info.write(info)
            test_info.write("\n")

            test_label.write(label)
            test_label.write("\n")

            if float(options.per) != 1.0:
                val_file.write(steps_seq)
                val_file.write("\n")

                val_info.write(info)
                val_info.write("\n")

                val_label.write(label)
                val_label.write("\n")


    train_file.close()
    train_info.close()
    train_label.close()

    val_file.close()
    val_info.close()
    val_label.close()

    test_file.close()
    test_info.close()
    test_label.close()



def prepare_pretraining_vocab_file(options):
    '''
    Write the BERT-style vocab file: special tokens first, then every step
    name in sorted order — optional steps verbatim, all other steps expanded
    into their three outcome-suffixed variants (step-0, step-1, step-2).
    '''
    # NOTE: earlier commented-out variants (KC tokens, step-token dicts,
    # action/attempt step list) were removed here for readability.
    steps = pickle.load(open(f"{options.dataset_folder}unique_steps_list.pkl","rb"))

    print("No of unique steps ", len(steps))

    ordered_steps = sorted(list(steps))

    with (open(options.vocab_file_path,"w")) as vb_file:
        vb_file.write("[PAD]\n")
        vb_file.write("[UNK]\n")
        vb_file.write("[MASK]\n")
        vb_file.write("[CLS]\n")
        vb_file.write("[SEP]\n")
        for step in ordered_steps:
            if step in options.opt_step1 or step in options.opt_step2:
                vb_file.write(f"{step}\n")
            else:
                # Matches the -0/-1/-2 outcome suffixes used when tokenizing.
                for i in range(3):
                    vb_file.write(f"{step}-{i}\n")
        vb_file.close()  # redundant: the with-block closes the file
    # Read back and echo the vocab for a quick sanity check.
    with open(options.vocab_file_path,"r") as f:
        l = f.readlines()
        print(l, len(l))
        f.close()  # redundant: the with-block closes the file


def main(opt):
    '''
    Entry point: optionally analyze the raw dataset, rewrite all *path
    options to live under the workspace/school/task folder layout, then
    dispatch to the pretraining or fine-tuning preparation routine.
    '''
    # Work on a copy so path rewriting does not mutate the parsed args.
    options = copy.deepcopy(opt)
    if opt.workspace_name:
        options.dataset_folder = opt.dataset_folder+opt.workspace_name+"/"

    data_processor = DataPreprocessor(input_file_path=opt.dataset)

    if opt.analyze_dataset_by_section:
        print(f"Analyzing dataset by section for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_section(opt.workspace_name)

        # Persist the unique-value inventories gathered during analysis.
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_prob_hierarchy, open(f"{options.dataset_folder}unique_hierarchy_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))

    if opt.analyze_dataset_by_school:
        print(f"Analyzing dataset of all school for workspace: {opt.workspace_name}")
        data_processor.analyze_dataset_by_school(opt.workspace_name)

        if not os.path.exists(options.dataset_folder):
            os.makedirs(options.dataset_folder)
        pickle.dump(data_processor.unique_schools, open(f"{options.dataset_folder}unique_schools_list.pkl", "wb"))
        pickle.dump(data_processor.unique_class, open(f"{options.dataset_folder}unique_class_list.pkl", "wb"))
        pickle.dump(data_processor.unique_students, open(f"{options.dataset_folder}unique_students_list.pkl", "wb"))
        pickle.dump(data_processor.unique_problems, open(f"{options.dataset_folder}unique_problems_list.pkl", "wb"))
        pickle.dump(data_processor.unique_kcs, open(f"{options.dataset_folder}unique_kcs_list.pkl", "wb"))
        pickle.dump(data_processor.unique_steps, open(f"{options.dataset_folder}unique_steps_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_list.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_action_attempt_kcs, open(f"{options.dataset_folder}unique_new_steps_w_action_attempt_kcs.pkl", "wb"))
        pickle.dump(data_processor.unique_new_steps_w_kcs, open(f"{options.dataset_folder}unique_new_steps_w_kcs_list.pkl", "wb"))

    # Rewrite every "*path" option so outputs land under
    # <workspace>/[sch_largest_N-coded/[school_folder/]]<pretraining|code|finetuning/task>/.
    if opt.workspace_name:
        for k,v in vars(opt).items():
            if 'path' in k:
                if v:
                    redirect_path = opt.workspace_name+"/"
                    if opt.school and opt.pretrain:
                        sch = f"sch_largest_{len(opt.school)}-coded" #f"sch_largest_655"
                        redirect_path = redirect_path + sch+"/"
                        # NOTE(review): nesting of the school_folder segment under
                        # the school-and-pretrain case inferred — confirm.
                        if opt.school_folder:
                            redirect_path = redirect_path + opt.school_folder+"/"
                    # else:
                    #     sch = "sch_largest_655"
                    if k != "vocab_file_path":
                        if opt.pretrain:
                            redirect_path = redirect_path + "pretraining/"
                        else:
                            if opt.code:
                                redirect_path = redirect_path + f"{opt.code}/"
                            elif opt.finetune_task:
                                # With -diff_val_folder, val files stay directly
                                # under finetuning/ instead of the task folder.
                                if opt.diff_val_folder and "val" in v:
                                    redirect_path = redirect_path + f"finetuning/"
                                else:
                                    redirect_path = redirect_path + f"finetuning/{opt.finetune_task}/"
                        if not os.path.exists(redirect_path):
                            os.makedirs(redirect_path)
                    else:
                        # vocab_file_path's default already contains "pretraining/",
                        # so only the directory needs to exist.
                        if not os.path.exists(redirect_path+"/pretraining/"):
                            os.makedirs(redirect_path+"/pretraining/")
                    setattr(options, f"{k}", redirect_path+v)
                    # setattr(options, f"{k}", opt.workspace_name+"/check/"+v)
                    print(f"options.{k} : {getattr(options, f'{k}')}")



    if options.pretrain:
        print("Preparing vocab...")
        prepare_pretraining_vocab_file(options)
        print("Preparing pre-training dataset...")
        # old non-repeated steps
        # prepare_pretraining_files(data_processor, options)
        # coded
        # prepare_school_coded_pretraining_files(data_processor, options)
        prepare_school_coded_finetuning_opts_intentional_files(data_processor, options)
        # prepare_pretraining_files(data_processor, options)
        # prepare_school_pretraining_files(data_processor, options)
    # else:
    #     print("Preparing attention dataset...")
    #     prepare_school_attention_files(data_processor, options)
    else:
        print("Preparing fine-tuning dataset...")
        # _1920
        # prepare_finetuning_10per_files(data_processor, options)
        # prepare_finetuning_IS_FS_files(data_processor, options)
        # prepare_finetuning_correctness_files(data_processor, options)

        # _2223
        # prepare_school_coded_finetuning_partial_seq_files(data_processor, options)
        # prepare_school_coded_finetuning_opts_files(data_processor, options)
        prepare_school_coded_finetuning_correctness_after_opts_per_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_files(data_processor, options)
        # prepare_school_coded_finetuning_correctness_after_opts_over_prob_files(data_processor, options)
        # prepare_finetuning_IS_files(data_processor, options)
        # # prepare_finetuning_FS_files(data_processor, options)
        # prepare_finetuning_correctness_aaai_files(data_processor, options)
        # # prepare_finetuning_SL_files(data_processor, options)
        # # prepare_finetuning_effectiveness_files(data_processor, options)
        # prepare_attn_test_files(data_processor, options)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-dataset_folder', type=str, default="dataset/CL4999_1920/")

    # NOTE(review): argparse type=bool treats ANY non-empty string as True,
    # so "-analyze_dataset_by_section False" still enables the flag; only
    # omitting the option yields False. Same caveat applies to the other
    # type=bool options below.
    parser.add_argument('-analyze_dataset_by_section', type=bool, default=False)
    parser.add_argument('-analyze_dataset_by_school', type=bool, default=False)
    parser.add_argument('-workspace_name', type=str, default=None)
    parser.add_argument('-school', nargs='+', type=str, default=None)
    parser.add_argument('-school_folder', type=str, default=None)

    # parser.add_argument('-highGRschool', nargs='+', type=str, default=None)
    # parser.add_argument('-lowGRschool', nargs='+', type=str, default=None)

    parser.add_argument('-code', type=str, default=None)
    parser.add_argument('-finetune_task', type=str, default=None)

    # Train fraction (<1), balanced max (==1) or absolute per-class count (>1).
    parser.add_argument('-per', type=float, default=None)
    parser.add_argument("-diff_val_folder", type=bool, default=False, help="use for different val folder")

    parser.add_argument('-opt_step1', nargs='+', type=str, help='List of optional steps 1')
    parser.add_argument('-opt_step2', nargs='+', type=str, help='List of optional steps 2')
    parser.add_argument('-final_step', nargs='+', type=str, help='List of final step')

    parser.add_argument('-dataset', type=str, default="dataset/CL4999_1920/course2_1920_4999_students_datashop.txt")

    parser.add_argument('-pretrain', type=bool, default=False)
    parser.add_argument('-vocab_file_path', type=str, default="pretraining/vocab.txt") #pretraining/vocab.txt

    # Prepare for pretraining
    parser.add_argument('-train_file_path', type=str, default="train.txt") #pretraining/pretrain.txt
    parser.add_argument('-train_info_path', type=str, default="train_info.txt") #pretraining/pretrain_info.txt
    parser.add_argument('-train_label_path', type=str, default="train_label.txt") #finetuning/train_label.txt

    parser.add_argument('-val_file_path', type=str, default="val.txt") #pretraining/val.txt
    parser.add_argument('-val_info_path', type=str, default="val_info.txt") #pretraining/val_info.txt
    parser.add_argument('-val_label_path', type=str, default="val_label.txt") #finetuning/val_label.txt

    parser.add_argument('-test_file_path', type=str, default="test.txt") #pretraining/test.txt
    parser.add_argument('-test_info_path', type=str, default="test_info.txt") #pretraining/test_info.txt
    parser.add_argument('-test_label_path', type=str, default="test_label.txt") #finetuning/test_label.txt


#     parser.add_argument('-train_gt_label_path', type=str, default="finetuning/train_gt_label.txt")
#     parser.add_argument('-test_gt_label_path', type=str, default="finetuning/test_gt_label.txt")


    options = parser.parse_args()
    # Normalize the optional-step lists to [] so "in" checks work downstream.
    if not options.opt_step1:
        setattr(options, "opt_step1", [])
    print("Optional steps 1: ", options.opt_step1)

    if not options.opt_step2:
        setattr(options, "opt_step2", [])
    print("Optional steps 2: ", options.opt_step2)

    if not options.final_step:
        setattr(options, "final_step", [])
    print("Final steps: ", options.final_step)

    main(options)