Upload 2 files
- app.py +65 -27
- new_test_saved_finetuned_model.py +62 -5
app.py
CHANGED
@@ -29,25 +29,37 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
     # shutil.copyfile(label.name, saved_test_label)
     # shutil.copyfile(info.name, saved_train_info)
     parent_location="ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/"
-    test_info_location=parent_location+"test_info.txt"
-    test_location=parent_location+"test.txt"
-    label_location=parent_location+"test_label.txt"
-    if(model_name=="ASTRA-FT-HGR"):
+    test_info_location=parent_location+"overallTestData/test_info.txt"
+    test_location=parent_location+"overallTestData/test.txt"
+    label_location=parent_location+"overallTestData/test_label.txt"
+    # "ASTRA-FT-HGR-RANDOM10", "ASTRA-FT-FIRST10-WSKILLS", "ASTRA-FT-FIRST10-WTIME", "ASTRA-FT-FIRST10-WSKILLS_WTIME"
+    checkpoint = "ratio_proportion_change3_2223/sch_largest_100-coded/output/"
+    if(model_name=="ASTRA-FT-HGR-RANDOM10"):
         finetune_task="highGRschool10"
-
-        # test_location=parent_location+"fullTest/test.txt"
+        checkpoint += "highGRschool10/bert_fine_tuned.model.ep42"
     elif(model_name== "ASTRA-FT-LGR" ):
         finetune_task="lowGRschoolAll"
-        # test_info_location=parent_location+"lowGRschoolAll/test_info.txt"
-        # test_location=parent_location+"lowGRschoolAll/test.txt"
     elif(model_name=="ASTRA-FT-FULL"):
-        # test_info_location=parent_location+"fullTest/test_info.txt"
-        # test_location=parent_location+"fullTest/test.txt"
         finetune_task="fullTest"
+    elif(model_name in ["ASTRA-FT-FIRST10-WSKILLS", "ASTRA-FT-FIRST10-WTIME", "ASTRA-FT-FIRST10-WSKILLS_WTIME"]):
+        finetune_task="first10"
+        if model_name == "ASTRA-FT-FIRST10-WSKILLS":
+            checkpoint += "first10/bert_fine_tuned.model.first10%.wskills.ep24"
+        elif model_name == "ASTRA-FT-FIRST10-WTIME":
+            checkpoint += "first10/bert_fine_tuned.model.first10%.wfaopttime.wttime.wttopttime.wttnoopttime.ep23"
+        elif model_name == "ASTRA-FT-FIRST10-WSKILLS_WTIME":
+            checkpoint += "first10/bert_fine_tuned.model.first10%.wskills.wfaopttime.wttime.wttopttime.wttnoopttime.ep40"
     else:
         finetune_task=None
     # Load the test_info file and the graduation rate file
     test_info = pd.read_csv(test_info_location, sep=',', header=None, engine='python')
+    def convert_etalon(x):
+        means_and_extremes = 1
+        if x.is_integer():
+            means_and_extremes = 0
+        return means_and_extremes
+
+    test_info[8] = test_info[7].apply(convert_etalon) # 7th column contains etalon of factor which decides the ER/ME problem type
     grad_rate_data = pd.DataFrame(pd.read_pickle('assests/school_grduation_rate.pkl'),columns=['school_number','grad_rate']) # Load the grad_rate data

     # Step 1: Extract unique school numbers from test_info
@@ -57,7 +69,7 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
     schools = grad_rate_data[grad_rate_data['school_number'].isin(unique_schools)]

     # Define a threshold for high and low graduation rates (adjust as needed)
-    grad_rate_threshold = 0.9
+    grad_rate_threshold = 0.8 #0.9

     # Step 4: Divide schools into high and low graduation rate groups
     high_grad_schools = schools[schools['grad_rate'] >= grad_rate_threshold]['school_number'].unique()
@@ -113,17 +125,19 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
         'high' if idx in high_indices else 'low' for idx in selected_rows_df2.index
     ]
     # Group data by opt_task1 and opt_task2 based on test_info[6]
-    opt_task_groups = ['opt_task1' if test_info.loc[idx, 6] == 0 else 'opt_task2' for idx in selected_rows_df2.index]
+
+    opt_task_groups = ['opt_task1' if test_info.loc[idx, 8] == 0 else 'opt_task2' for idx in selected_rows_df2.index]
     progress(0.2, desc="Running fine-tuned models...")
     print("finetuned task: ",finetune_task)
     subprocess.run([
         "python", "new_test_saved_finetuned_model.py",
         "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
+        "-model_name", model_name,
         "-finetune_task", finetune_task,
         "-test_dataset_path","../../../../fileHandler/selected_rows.txt",
         # "-test_label_path","../../../../train_label.txt",
-        "-finetuned_bert_classifier_checkpoint",
-        "
+        "-finetuned_bert_classifier_checkpoint", checkpoint,
+        "-s",str(128),
         "-e",str(1),
         "-b",str(1000)
     ])
@@ -132,6 +146,8 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
     # Load tlb and plb
     with open("fileHandler/tlabels_plabels.pkl", "rb") as f:
         tlb, plb = pickle.load(f)
+    print("t==p = 0: ", sum([t==p for t,p in zip(tlb, plb) if t==0]))
+    print("t==p = 1: ", sum([t==p for t,p in zip(tlb, plb) if t==1]))

     # Define function to filter and write CSV
     def process_and_write_csv(filtered_data, filename):
@@ -152,20 +168,40 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):

         row_num = 1
         for _, row in filtered_data.iterrows():
-            school, class_id, student_id, status, problem, _, time_zone, duration, attempts = row[:9]
-            steps_data = row[9]
+            # school, class_id, student_id, status, problem, _, time_zone, duration, attempts = row[:9]
+
+            # sch_NPHBD11809,17,stu_CRJBA61379,GRADUATED,ratio_proportion_change3-134,[strategygame],1,4.0,4.0,10,
+            # PercentChange-Attempt-1-0-OK-1667479255281 NumeratorQuantity1-Attempt-1-0-JIT-1667479268893 NumeratorQuantity1-Attempt-2-0-ERROR-1667479284199 NumeratorQuantity1-Attempt-3-0-OK-1667479294890 DenominatorQuantity1-Attempt-1-0-OK-1667479298749 NumeratorQuantity2-Attempt-1-0-OK-1667479301999 OptionalTask_1-Attempt-1-0-OK-1667479304886 DenominatorFactor-Attempt-1-0-OK-1667479314566 NumeratorFactor-Attempt-1-0-OK-1667479315579 EquationAnswer-Attempt-1-0-OK-1667479323750 FinalAnswerDirection-Attempt-1-0-OK-1667479333439 FinalAnswer-Attempt-1-0-OK-1667479338185,
+            # 1,
+            # 0 0.999767840033168 0 0 0.999996274310286 0 0.321529253998353 0.999722748307354 0.999840947031115,
+            # 0 -0.0002057730279919623 0 0 -3.302306839980673e-06 0 -0.41429892410820995 -0.00022392554103201068 -0.00012846367037400164,
+            # 0 0.999767840033168 0 0 0 0 0 0 0,
+            # 1667479255281 1667479294890 1667479298749 1667479301999 1667479304886 1667479314566 1667479315579 1667479323750 1667479333439 1667479338185,
+            # 0 39609 3859 3250 2887 9680 1013 8171 9689 4746,
+            # 2887 9680 1013 8171,0 39609 3859 3250 9689 4746,
+            # 14435,
+            # 82904
+            school, prob_solved, student_id, status, problem, prob_type, opt_type, _, _ = row[:9]
+            steps_data = row[10]
+
+            # if row_num == 1:
+            #     print(row)

             for step in steps_data.split('\t'):
                 step_parts = step.split('-')
+
                 step_name = step_parts[0]
-                action = step_parts[1]
-                attempt = step_parts[2]
-
+                action = step_parts[1]
+                attempt = step_parts[2]
+                help_level = step_parts[3]
+                outcome = step_parts[4]
+                curr_time = step_parts[5]

                 row_data = [
-                    row_num, "", "", student_id, "",
-
-
+                    row_num, "", "", student_id, "", curr_time, "", "", "", "", "",
+                    "ratio_proportion_change3", problem, "", "", step_name, attempt, "", outcome, "", action, "",
+                    "", "", help_level, "", "", "", "", "", "", "",
+                    school, "", "", "", "", "", status, ""
                 ]
                 writer.writerow(row_data)
                 row_num += 1
@@ -179,7 +215,8 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):

         # Filter the data
         filtered_data = selected_test_info.iloc[matching_indices]
-        filtered_data = filtered_data[filtered_data[6] == task_type] # Ensure test_info[6] matches
+        # new data contains etalon instead of 0/1 for ER/ME
+        filtered_data = filtered_data[filtered_data[8] == task_type] # Ensure test_info[6] matches

         # Define filename dynamically
         task_type_map = {0: "ER", 1: "ME"}
@@ -291,8 +328,7 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
         data = file.readlines()
     selected_data = [data[i] for i in indices if i < len(data)]
     # Assuming test_info[7] is a list with ideal tasks for each instance
-    ideal_tasks = test_info[
-
+    ideal_tasks = test_info[8] # A list where each element is either 1 or 2
     # Initialize counters
     task_counts = {
         1: {"ER": 0, "ME": 0, "both": 0,"none":0},
@@ -665,7 +701,7 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
    Model: {model_name}
    ---------------------------\n
    Time Taken: {result['time_taken_from_start']:.2f} seconds
-    Number of schools sampled: {len(
+    Number of schools sampled: {len(random_schools)}
    Total number of instances from HGR schools : {len(high_indices)}
    Total number of instances from LGR schools: {len(low_indices)}

@@ -741,7 +777,9 @@ def process_file(model_name,inc_slider,progress=Progress(track_tqdm=True)):
 # List of models for the dropdown menu

 # models = ["ASTRA-FT-HGR", "ASTRA-FT-LGR", "ASTRA-FT-FULL"]
-models = ["ASTRA-FT-HGR", "ASTRA-FT-FULL"]
+# models = ["ASTRA-FT-HGR", "ASTRA-FT-FULL"]
+models = ["ASTRA-FT-HGR-RANDOM10", "ASTRA-FT-FIRST10-WSKILLS", "ASTRA-FT-FIRST10-WTIME", "ASTRA-FT-FIRST10-WSKILLS_WTIME"]
+
 content = """
 <h1 style="color: black;">A S T R A</h1>
 <h2 style="color: black;">An AI Model for Analyzing Math Strategies</h2>
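Note on the ER/ME split introduced above: the new convert_etalon helper derives column 8 of test_info from the etalon value in column 7, and task_type_map later labels 0 as "ER" and 1 as "ME". A minimal standalone sketch of that mapping; the sample etalon values below are hypothetical, not taken from the dataset:

import pandas as pd

# Same helper as in the commit: an integer-valued etalon maps to 0
# ("ER" in task_type_map), a fractional one to 1 ("ME").
def convert_etalon(x):
    means_and_extremes = 1
    if x.is_integer():
        means_and_extremes = 0
    return means_and_extremes

# Hypothetical test_info with the etalon in unnamed column 7,
# mirroring pd.read_csv(..., header=None) in app.py.
test_info = pd.DataFrame({7: [2.0, 1.5, 3.0, 0.75]})
test_info[8] = test_info[7].apply(convert_etalon)
print(test_info[8].tolist())  # -> [0, 1, 0, 1]

So whole-number etalons land in the ER bucket and fractional ones in ME, which is what the downstream filtered_data[filtered_data[8] == task_type] selection relies on.
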
new_test_saved_finetuned_model.py
CHANGED
@@ -6,7 +6,7 @@ from torch.optim import Adam
 from torch.utils.data import DataLoader
 import pickle
 print("here1",os.getcwd())
-from src.dataset import TokenizerDataset,
+from src.dataset import TokenizerDataset, TokenizerwSkillsDataset, TokenizerwTimeDataset, TokenizerwSkillsTimeDataset
 from src.vocab import Vocab
 print("here3",os.getcwd())
 from src.bert import BERT
@@ -19,6 +19,7 @@ import tqdm
 import sys
 import time
 import numpy as np
+from sklearn.preprocessing import QuantileTransformer

 from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score
 import matplotlib.pyplot as plt
@@ -466,12 +467,59 @@ class BERTFineTuneCalibratedTrainer:
         sys.stdout = sys.__stdout__
         sys.stdout = sys.__stdout__

-
+def prepare_normalized_time_df():
+    faopt_time = []
+    total_time = []
+    nonopt_time = []
+    opt_time = []
+    school = []
+    student = []
+    progress = []
+    prob_id = []
+
+    with open("ratio_proportion_change3_2223/sch_largest_100-coded/finetuning/fullData/train_info.txt", "r") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                line = line.split(",")
+                sch = line[0]
+                school.append(sch)
+                stu = line[2]
+                student.append(stu)
+                status = line[3]
+                progress.append(status)
+                pid = line[4]
+                prob_id.append(pid)
+                total = float(line[-1])#/60000
+                faopt = float(line[-2])#/60000
+                nonopt = sum([float(i) for i in line[-3].split("\t")])
+                opt = sum([float(i) for i in line[-4].split("\t")])
+                faopt_time.append(faopt)
+                total_time.append(total)
+                nonopt_time.append(nonopt)
+                opt_time.append(opt)
+
+    df = pd.DataFrame({"school": school, "student": student, "progress": progress, "prob_id": prob_id,
+                       "faopt_time": faopt_time, "total_time": total_time,
+                       "nonopt_time": nonopt_time, "opt_time": opt_time})
+    for col in df.columns:
+        print(col, col.endswith('time'))
+        if col.endswith('time'): #col == "faopt_time" or col =="total_time":
+            num_df = df[col]
+            col_values = num_df.values.reshape(-1, 1)
+            nt = QuantileTransformer(output_distribution='normal')
+            col_values_norm = nt.fit_transform(col_values)
+            df[col] = col_values_norm
+            print(df[col].describe())
+    df.set_index(["school", "student", "progress", "prob_id"], inplace=True)
+    df.to_pickle("ratio_proportion_change3_2223/sch_largest_100-coded/time_info/full_data_normalized_time.pkl")
+

 def train():
     parser = argparse.ArgumentParser()

     parser.add_argument('-workspace_name', type=str, default=None)
+    parser.add_argument('-model_name', type=str, default=None)
     parser.add_argument('-code', type=str, default=None, help="folder for pretraining outputs and logs")
     parser.add_argument('-finetune_task', type=str, default=None, help="folder inside finetuning")
     parser.add_argument("-attention", type=bool, default=False, help="analyse attention scores")
@@ -559,10 +607,19 @@ def train():
     vocab_obj.load_vocab()
     print("Vocab Size: ", len(vocab_obj.vocab))

-
+    prepare_normalized_time_df()
     print("Testing using finetuned model......")
-    print("Loading Test Dataset", args.test_dataset_path)
-    test_dataset = TokenizerDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
+    print("Loading Test Dataset", args.test_dataset_path)
+    # "ASTRA-FT-HGR-RANDOM10", "ASTRA-FT-FIRST10-WSKILLS", "ASTRA-FT-FIRST10-WTIME", "ASTRA-FT-FIRST10-WSKILLS_WTIME"
+    # test_dataset = TokenizerDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
+    if args.model_name == "ASTRA-FT-HGR-RANDOM10":
+        test_dataset = TokenizerwSkillsDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
+    elif args.model_name == "ASTRA-FT-FIRST10-WSKILLS":
+        test_dataset = TokenizerwSkillsDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
+    elif args.model_name == "ASTRA-FT-FIRST10-WTIME":
+        test_dataset = TokenizerwTimeDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
+    elif args.model_name == "ASTRA-FT-FIRST10-WSKILLS_WTIME":
+        test_dataset = TokenizerwSkillsTimeDataset(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)
     # test_dataset = TokenizerDatasetForCalibration(args.test_dataset_path, args.test_label_path, vocab_obj, seq_len=args.seq_len)

     print("Creating Dataloader...")
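Note on the time normalization added above: prepare_normalized_time_df uses scikit-learn's QuantileTransformer with output_distribution='normal' to replace each raw *_time column with rank-based normal scores before pickling the frame. A minimal sketch of that transform on synthetic durations; the column name and values here are illustrative only, not from the dataset:

import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

# Heavily right-skewed fake durations (milliseconds), stand-ins for
# the real total_time values read from train_info.txt.
rng = np.random.default_rng(0)
df = pd.DataFrame({"total_time": rng.lognormal(mean=10, sigma=1.0, size=1000)})

# Same pattern as the commit: fit on a 2-D (n, 1) view of the column,
# then write the normal-scored values back; ravel() flattens to 1-D.
nt = QuantileTransformer(output_distribution='normal')
df["total_time"] = nt.fit_transform(df[["total_time"]].values).ravel()

print(df["total_time"].describe())  # roughly zero-centred, unit-ish spread

Because the transform is rank-based, it tames the heavy right tail of raw durations without hand-tuned clipping, which is presumably why it is applied before the *_time features reach the fine-tuned models.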