Niko.Koutsoubis committed on
Commit
f369cf3
·
1 Parent(s): a091733

Add site-specific extraction scripts for federated learning

Browse files
Files changed (2) hide show
  1. extract_site1.sh +77 -0
  2. extract_site2.sh +77 -0
extract_site1.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Extract embeddings for Site 1 (Split 1) and stage them for federated learning.
#
# Usage: bash extract_site1.sh
#
# Run from the directory that contains extract-embeddings.py; OUTPUT_BASE and
# SITE_SPLITS below are relative paths and resolve against the current
# working directory.
#
# Environment:
#   NLST_ROOT_DIR  Optional. Overrides the default NLST data path.

# -e: exit on error, -u: unset variables are errors,
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

# Configuration
ROOT_DIR="${NLST_ROOT_DIR:-/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/}" # Update this (or set NLST_ROOT_DIR) to your NLST data path
readonly ROOT_DIR
readonly OUTPUT_BASE="site1_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly TRAIN_CSV="${SITE_SPLITS}/train_pid_labelsT0T7_split_1.csv"
readonly TEST_CSV="${SITE_SPLITS}/test_pid_labelsT0T7_split_1.csv"
readonly FL_READY="${OUTPUT_BASE}/fl_ready"

# Run the embedding extractor for one split.
# Arguments: $1 - PID/label CSV, $2 - output directory
extract_split() {
  local -r pid_csv=$1
  local -r out_dir=$2
  python extract-embeddings.py \
    --root-dir "${ROOT_DIR}" \
    --pid-csv "${pid_csv}" \
    --output-dir "${out_dir}" \
    --num-workers 8 \
    --checkpoint-interval 500
}

# Write the pid and label columns of a split CSV (header row included).
# Arguments: $1 - source CSV, $2 - destination CSV
# NOTE(review): fields 1 and 14-20 are taken by position; presumably these are
# the pid and label columns - confirm against the CSV header.
write_labels() {
  local -r src_csv=$1
  local -r dest_csv=$2
  cut -d, -f1,14-20 < "${src_csv}" > "${dest_csv}"
}

# Fail fast: both CSVs are needed, and the test CSV would otherwise only be
# read after the training extraction has already finished.
for csv in "${TRAIN_CSV}" "${TEST_CSV}"; do
  if [[ ! -f "${csv}" ]]; then
    printf 'error: missing PID CSV: %s\n' "${csv}" >&2
    exit 1
  fi
done

echo "============================================"
echo "SITE 1 - Embedding Extraction"
echo "============================================"
echo ""

# Create output directories
mkdir -p "${OUTPUT_BASE}/train"
mkdir -p "${OUTPUT_BASE}/test"

# Extract training embeddings
echo "📦 Extracting TRAINING embeddings..."
extract_split "${TRAIN_CSV}" "${OUTPUT_BASE}/train"

echo ""
echo "✓ Training embeddings complete!"
echo ""

# Extract test embeddings
echo "📦 Extracting TEST embeddings..."
extract_split "${TEST_CSV}" "${OUTPUT_BASE}/test"

echo ""
echo "✓ Test embeddings complete!"
echo ""

# Prepare files for federated learning
echo "📋 Preparing files for federated learning..."
mkdir -p "${FL_READY}"

# Copy and rename embeddings
cp "${OUTPUT_BASE}/train/all_embeddings.parquet" "${FL_READY}/site1_embeddings_train.parquet"
cp "${OUTPUT_BASE}/test/all_embeddings.parquet" "${FL_READY}/site1_embeddings_test.parquet"

# Extract just pid and label columns from the CSV files
echo "Creating site1_labels-train.csv..."
write_labels "${TRAIN_CSV}" "${FL_READY}/site1_labels-train.csv"

echo "Creating site1_labels-test.csv..."
write_labels "${TEST_CSV}" "${FL_READY}/site1_labels-test.csv"

echo ""
echo "============================================"
echo "SITE 1 - COMPLETE! ✅"
echo "============================================"
echo ""
echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
ls -lh "${FL_READY}/"
echo ""
echo "Files ready for federated learning:"
echo " ✓ site1_embeddings_train.parquet"
echo " ✓ site1_embeddings_test.parquet"
echo " ✓ site1_labels-train.csv"
echo " ✓ site1_labels-test.csv"
echo ""
extract_site2.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# Extract embeddings for Site 2 (Split 2) and stage them for federated learning.
#
# Usage: bash extract_site2.sh
#
# Run from the directory that contains extract-embeddings.py; OUTPUT_BASE and
# SITE_SPLITS below are relative paths and resolve against the current
# working directory.
#
# Environment:
#   NLST_ROOT_DIR  Optional. Overrides the default NLST data path.

# -e: exit on error, -u: unset variables are errors,
# pipefail: a pipeline fails if any stage fails.
set -euo pipefail

# Configuration
ROOT_DIR="${NLST_ROOT_DIR:-/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/}" # Update this (or set NLST_ROOT_DIR) to your NLST data path
readonly ROOT_DIR
readonly OUTPUT_BASE="site2_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly TRAIN_CSV="${SITE_SPLITS}/train_pid_labelsT0T7_split_2.csv"
readonly TEST_CSV="${SITE_SPLITS}/test_pid_labelsT0T7_split_2.csv"
readonly FL_READY="${OUTPUT_BASE}/fl_ready"

# Run the embedding extractor for one split.
# Arguments: $1 - PID/label CSV, $2 - output directory
extract_split() {
  local -r pid_csv=$1
  local -r out_dir=$2
  python extract-embeddings.py \
    --root-dir "${ROOT_DIR}" \
    --pid-csv "${pid_csv}" \
    --output-dir "${out_dir}" \
    --num-workers 8 \
    --checkpoint-interval 500
}

# Write the pid and label columns of a split CSV (header row included).
# Arguments: $1 - source CSV, $2 - destination CSV
# NOTE(review): fields 1 and 14-20 are taken by position; presumably these are
# the pid and label columns - confirm against the CSV header.
write_labels() {
  local -r src_csv=$1
  local -r dest_csv=$2
  cut -d, -f1,14-20 < "${src_csv}" > "${dest_csv}"
}

# Fail fast: both CSVs are needed, and the test CSV would otherwise only be
# read after the training extraction has already finished.
for csv in "${TRAIN_CSV}" "${TEST_CSV}"; do
  if [[ ! -f "${csv}" ]]; then
    printf 'error: missing PID CSV: %s\n' "${csv}" >&2
    exit 1
  fi
done

echo "============================================"
echo "SITE 2 - Embedding Extraction"
echo "============================================"
echo ""

# Create output directories
mkdir -p "${OUTPUT_BASE}/train"
mkdir -p "${OUTPUT_BASE}/test"

# Extract training embeddings
echo "📦 Extracting TRAINING embeddings..."
extract_split "${TRAIN_CSV}" "${OUTPUT_BASE}/train"

echo ""
echo "✓ Training embeddings complete!"
echo ""

# Extract test embeddings
echo "📦 Extracting TEST embeddings..."
extract_split "${TEST_CSV}" "${OUTPUT_BASE}/test"

echo ""
echo "✓ Test embeddings complete!"
echo ""

# Prepare files for federated learning
echo "📋 Preparing files for federated learning..."
mkdir -p "${FL_READY}"

# Copy and rename embeddings
cp "${OUTPUT_BASE}/train/all_embeddings.parquet" "${FL_READY}/site2_embeddings_train.parquet"
cp "${OUTPUT_BASE}/test/all_embeddings.parquet" "${FL_READY}/site2_embeddings_test.parquet"

# Extract just pid and label columns from the CSV files
echo "Creating site2_labels-train.csv..."
write_labels "${TRAIN_CSV}" "${FL_READY}/site2_labels-train.csv"

echo "Creating site2_labels-test.csv..."
write_labels "${TEST_CSV}" "${FL_READY}/site2_labels-test.csv"

echo ""
echo "============================================"
echo "SITE 2 - COMPLETE! ✅"
echo "============================================"
echo ""
echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
ls -lh "${FL_READY}/"
echo ""
echo "Files ready for federated learning:"
echo " ✓ site2_embeddings_train.parquet"
echo " ✓ site2_embeddings_test.parquet"
echo " ✓ site2_labels-train.csv"
echo " ✓ site2_labels-test.csv"
echo ""