suryadev1 committed on
Commit
c343cc3
·
1 Parent(s): ee40bd7

Remove large file train_info.txt

Browse files
app.py CHANGED
@@ -7,9 +7,10 @@ import subprocess
7
  import shutil
8
  import matplotlib.pyplot as plt
9
  from sklearn.metrics import roc_curve, auc
 
10
  # Define the function to process the input file and model selection
11
 
12
- def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
13
  # progress = gr.Progress(track_tqdm=True)
14
  progress(0, desc="Starting the processing")
15
  with open(file.name, 'r') as f:
@@ -21,27 +22,66 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
21
  shutil.copyfile(file.name, saved_test_dataset)
22
  shutil.copyfile(label.name, saved_test_label)
23
  shutil.copyfile(info.name, saved_train_info)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # For demonstration purposes, we'll just return the content with the selected model name
25
- # if(model_name=="highGRschool10"):
26
- # checkpoint="ratio_proportion_change3/output/FS/bert_fine_tuned.model.ep32"
27
- # elif(model_name=="lowGRschoolAll"):
28
- # checkpoint="ratio_proportion_change3/output/IS/bert_fine_tuned.model.ep14"
29
- # elif(model_name=="fullTest"):
30
- # checkpoint="ratio_proportion_change3/output/correctness/bert_fine_tuned.model.ep48"
31
- # else:
32
- # checkpoint=None
33
- # print(checkpoint)
34
- if (inc_val<5):
35
- model_name="highGRschool10"
36
- elif(inc_val>=5 & inc_val<10):
37
- model_name="highGRschool10"
38
  else:
39
- model_name="highGRschool10"
 
 
 
 
 
 
 
 
 
40
  subprocess.run([
41
  "python", "new_test_saved_finetuned_model.py",
42
  "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
43
- "-finetune_task", model_name,
44
- # "-test_dataset_path","../../../../train.txt",
45
  # "-test_label_path","../../../../train_label.txt",
46
  "-finetuned_bert_classifier_checkpoint",
47
  "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
@@ -77,12 +117,26 @@ def process_file(file,label,info,inc_val,progress=Progress(track_tqdm=True)):
77
  progress(1.0)
78
  # Prepare text output
79
  text_output = f"Model: {model_name}\nResult:\n{result}"
80
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  return text_output,plot_path
82
 
83
  # List of models for the dropdown menu
84
 
85
- models = ["highGRschool10", "lowGRschoolAll", "fullTest"]
86
 
87
  # Create the Gradio interface
88
  with gr.Blocks(css="""
@@ -275,10 +329,10 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
275
 
276
  info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
277
 
278
- # model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
279
 
280
 
281
- increment_slider = gr.Slider(minimum=1, maximum=50, step=5, label="Schools number", value=1)
282
 
283
  with gr.Row():
284
  output_text = gr.Textbox(label="Output Text")
@@ -286,7 +340,7 @@ tbody.svelte-18wv37q>tr.svelte-18wv37q:nth-child(odd) {
286
 
287
  btn = gr.Button("Submit")
288
 
289
- btn.click(fn=process_file, inputs=[file_input,label_input,info_input,increment_slider], outputs=[output_text,output_image])
290
 
291
 
292
  # Launch the app
 
7
  import shutil
8
  import matplotlib.pyplot as plt
9
  from sklearn.metrics import roc_curve, auc
10
+ import pandas as pd
11
  # Define the function to process the input file and model selection
12
 
13
+ def process_file(file,label,info,model_name,inc_slider,progress=Progress(track_tqdm=True)):
14
  # progress = gr.Progress(track_tqdm=True)
15
  progress(0, desc="Starting the processing")
16
  with open(file.name, 'r') as f:
 
22
  shutil.copyfile(file.name, saved_test_dataset)
23
  shutil.copyfile(label.name, saved_test_label)
24
  shutil.copyfile(info.name, saved_train_info)
25
+
26
+
27
+ # Load the test_info file and the graduation rate file
28
+ test_info = pd.read_csv('train_info.txt', sep=',', header=None, engine='python')
29
+ grad_rate_data = pd.DataFrame(pd.read_pickle('school_grduation_rate.pkl'),columns=['school_number','grad_rate']) # Load the grad_rate data
30
+
31
+ # Step 1: Extract unique school numbers from test_info
32
+ unique_schools = test_info[0].unique()
33
+
34
+ # Step 2: Filter the grad_rate_data using the unique school numbers
35
+ schools = grad_rate_data[grad_rate_data['school_number'].isin(unique_schools)]
36
+
37
+ # Define a threshold for high and low graduation rates (adjust as needed)
38
+ grad_rate_threshold = 0.9
39
+
40
+ # Step 4: Divide schools into high and low graduation rate groups
41
+ high_grad_schools = schools[schools['grad_rate'] >= grad_rate_threshold]['school_number'].unique()
42
+ low_grad_schools = schools[schools['grad_rate'] < grad_rate_threshold]['school_number'].unique()
43
+
44
+ # Step 5: Sample percentage of schools from each group
45
+ high_sample = pd.Series(high_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
46
+ low_sample = pd.Series(low_grad_schools).sample(frac=inc_slider/100, random_state=1).tolist()
47
+
48
+ # Step 6: Combine the sampled schools
49
+ random_schools = high_sample + low_sample
50
+
51
+ # Step 7: Get indices for the sampled schools
52
+ indices = test_info[test_info[0].isin(random_schools)].index.tolist()
53
+
54
+ # Load the test file and select rows based on indices
55
+ test = pd.read_csv('train.txt', sep=',', header=None, engine='python')
56
+ selected_rows_df2 = test.loc[indices]
57
+
58
+ # Save the selected rows to a file
59
+ selected_rows_df2.to_csv('selected_rows.txt', sep='\t', index=False, header=False, quoting=3, escapechar=' ')
60
+
61
+
62
  # For demonstration purposes, we'll just return the content with the selected model name
63
+ if(model_name=="High Graduated Schools"):
64
+ finetune_task="highGRschool10"
65
+ elif(model_name== "Low Graduated Schools" ):
66
+ finetune_task="highGRschool10"
67
+ elif(model_name=="Full Set"):
68
+ finetune_task="highGRschool10"
 
 
 
 
 
 
 
69
  else:
70
+ finetune_task=None
71
+ # print(checkpoint)
72
+ progress(0.1, desc="Files created and saved")
73
+ # if (inc_val<5):
74
+ # model_name="highGRschool10"
75
+ # elif(inc_val>=5 & inc_val<10):
76
+ # model_name="highGRschool10"
77
+ # else:
78
+ # model_name="highGRschool10"
79
+ progress(0.2, desc="Executing models")
80
  subprocess.run([
81
  "python", "new_test_saved_finetuned_model.py",
82
  "-workspace_name", "ratio_proportion_change3_2223/sch_largest_100-coded",
83
+ "-finetune_task", "highGRschool10",
84
+ "-test_dataset_path","../../../../selected_rows.txt",
85
  # "-test_label_path","../../../../train_label.txt",
86
  "-finetuned_bert_classifier_checkpoint",
87
  "ratio_proportion_change3_2223/sch_largest_100-coded/output/highGRschool10/bert_fine_tuned.model.ep42",
 
117
  progress(1.0)
118
  # Prepare text output
119
  text_output = f"Model: {model_name}\nResult:\n{result}"
120
+ # Prepare text output with HTML formatting
121
+ text_output = f"""
122
+ Model: {model_name}\n
123
+ Result Summary:\n
124
+ -----------------\n
125
+ Average Loss: {result['avg_loss']:.4f}\n
126
+ Total Accuracy: {result['total_acc']:.2f}%\n
127
+ Precision: {result['precisions']:.2f}\n
128
+ Recall: {result['recalls']:.2f}\n
129
+ F1-Score: {result['f1_scores']:.2f}\n
130
+ Time Taken: {result['time_taken_from_start']:.2f} seconds\n
131
+ AUC Score: {result['auc_score']:.4f}\n
132
+ -----------------\n
133
+ Note: The ROC Curve is also displayed for the evaluation.
134
+ """
135
  return text_output,plot_path
136
 
137
  # List of models for the dropdown menu
138
 
139
+ models = ["High Graduated Schools", "Low Graduated Schools", "Full Set"]
140
 
141
  # Create the Gradio interface
142
  with gr.Blocks(css="""
 
329
 
330
  info_input = gr.File(label="Upload test info", file_types=['.txt'], elem_classes="file-box")
331
 
332
+ model_dropdown = gr.Dropdown(choices=models, label="Select Finetune Task", elem_classes="dropdown-menu")
333
 
334
 
335
+ increment_slider = gr.Slider(minimum=5, maximum=100, step=5, label="Schools Percentage", value=5)
336
 
337
  with gr.Row():
338
  output_text = gr.Textbox(label="Output Text")
 
340
 
341
  btn = gr.Button("Submit")
342
 
343
+ btn.click(fn=process_file, inputs=[file_input,label_input,info_input,model_dropdown,increment_slider], outputs=[output_text,output_image])
344
 
345
 
346
  # Launch the app
new_test_saved_finetuned_model.py CHANGED
@@ -495,7 +495,7 @@ def train():
495
  parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
496
  parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
497
  parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
498
- parser.add_argument("-s", "--seq_len", type=int, default=5, help="maximum sequence length")
499
 
500
  parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
501
  parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
@@ -508,7 +508,7 @@ def train():
508
  # parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
509
  parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
510
  # parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
511
-
512
  parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
513
  parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
514
  parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
 
495
  parser.add_argument("-hs", "--hidden", type=int, default=64, help="hidden size of transformer model") #64
496
  parser.add_argument("-l", "--layers", type=int, default=4, help="number of layers") #4
497
  parser.add_argument("-a", "--attn_heads", type=int, default=4, help="number of attention heads") #8
498
+ parser.add_argument("-s", "--seq_len", type=int, default=128, help="maximum sequence length")
499
 
500
  parser.add_argument("-b", "--batch_size", type=int, default=500, help="number of batch_size") #64
501
  parser.add_argument("-e", "--epochs", type=int, default=1)#1501, help="number of epochs") #501
 
508
  # parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
509
  parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
510
  # parser.add_argument("--on_memory", type=bool, default=False, help="Loading on memory: true or false")
511
+
512
  parser.add_argument("--dropout", type=float, default=0.1, help="dropout of network")
513
  parser.add_argument("--lr", type=float, default=1e-05, help="learning rate of adam") #1e-3
514
  parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
plot.png CHANGED
result.txt CHANGED
@@ -1,7 +1,7 @@
1
- avg_loss: 0.8249401861713046
2
- total_acc: 50.0
3
- precisions: 0.25
4
- recalls: 0.5
5
- f1_scores: 0.3333333333333333
6
- time_taken_from_start: 30.98168659210205
7
- auc_score: 0.7724651292107545
 
1
+ avg_loss: 0.5631513595581055
2
+ total_acc: 69.7320542507443
3
+ precisions: 0.7236992960620143
4
+ recalls: 0.6973205425074429
5
+ f1_scores: 0.6879225873063946
6
+ time_taken_from_start: 73.04951095581055
7
+ auc_score: 0.7452296224317393
roc_data.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2d449987338fb8aa00d855ae60967ad80c2f3dd2867e052ad9cf4621cd1dae0
3
- size 358640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4beb5de79dfb3592402832ced8db0c87f3264e46c0813553c40728c7ddafed5
3
+ size 29285
school_grduation_rate.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c0c99dd8fc601de1fc8f4af5880bf71b7198c09bf0d016a880b02043e0b3d03
3
+ size 18356
selected_rows.txt ADDED
The diff for this file is too large to render. See raw diff
 
train.txt CHANGED
The diff for this file is too large to render. See raw diff
 
train_info.txt DELETED
@@ -1 +0,0 @@
1
- test
 
 
train_label.txt CHANGED
The diff for this file is too large to render. See raw diff