Niko.Koutsoubis
committed on
Commit
·
f369cf3
1
Parent(s):
a091733
Add site-specific extraction scripts for federated learning
Browse files
- extract_site1.sh +77 -0
- extract_site2.sh +77 -0
extract_site1.sh
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Extract embeddings for Site 1 (Split 1) and stage them for federated learning.
# Usage: bash extract_site1.sh
#
# Steps:
#   1. Run extract-embeddings.py on the split-1 train and test PID CSVs.
#   2. Copy each run's all_embeddings.parquet into ${OUTPUT_BASE}/fl_ready/.
#   3. Write label CSVs holding column 1 and columns 14-20 of the split CSVs.
#
# All relative paths (extract-embeddings.py, SITE_SPLITS, OUTPUT_BASE) are
# resolved against the current working directory.
#
# Environment:
#   NLST_ROOT_DIR  optional; overrides the default NLST data path below.
#
# NOTE(review): the status glyphs in the messages were reconstructed from
# mis-decoded UTF-8; confirm them against the original commit.

# Exit on error, on unset variables, and when any pipeline stage fails.
set -euo pipefail

# Configuration
readonly ROOT_DIR="${NLST_ROOT_DIR:-/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/}"
readonly OUTPUT_BASE="site1_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly TRAIN_CSV="${SITE_SPLITS}/train_pid_labelsT0T7_split_1.csv"
readonly TEST_CSV="${SITE_SPLITS}/test_pid_labelsT0T7_split_1.csv"
readonly FL_DIR="${OUTPUT_BASE}/fl_ready"

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

echo "============================================"
echo "SITE 1 - Embedding Extraction"
echo "============================================"
echo ""

# Fail before the long-running extraction if an input CSV is missing.
for csv in "${TRAIN_CSV}" "${TEST_CSV}"; do
  [[ -r "${csv}" ]] || die "Missing or unreadable PID CSV: ${csv}"
done

# Create output directories
mkdir -p "${OUTPUT_BASE}/train"
mkdir -p "${OUTPUT_BASE}/test"

# Extract training embeddings
echo "📦 Extracting TRAINING embeddings..."
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${TRAIN_CSV}" \
  --output-dir "${OUTPUT_BASE}/train" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Training embeddings complete!"
echo ""

# Extract test embeddings
echo "📦 Extracting TEST embeddings..."
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${TEST_CSV}" \
  --output-dir "${OUTPUT_BASE}/test" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Test embeddings complete!"
echo ""

# Prepare files for federated learning
echo "📁 Preparing files for federated learning..."
mkdir -p "${FL_DIR}"

# Copy and rename embeddings. This expects each extraction run to have left
# all_embeddings.parquet in its --output-dir; cp fails the script otherwise.
cp -- "${OUTPUT_BASE}/train/all_embeddings.parquet" "${FL_DIR}/site1_embeddings_train.parquet"
cp -- "${OUTPUT_BASE}/test/all_embeddings.parquet" "${FL_DIR}/site1_embeddings_test.parquet"

# Keep column 1 and columns 14-20 (header row included). Per the original
# comment these are the pid and label columns -- confirm against the CSV header.
# A single cut over the whole file replaces the former head/tail pipelines,
# whose exit status hid a missing input file.
echo "Creating site1_labels-train.csv..."
cut -d, -f1,14-20 -- "${TRAIN_CSV}" > "${FL_DIR}/site1_labels-train.csv"

echo "Creating site1_labels-test.csv..."
cut -d, -f1,14-20 -- "${TEST_CSV}" > "${FL_DIR}/site1_labels-test.csv"

echo ""
echo "============================================"
echo "SITE 1 - COMPLETE! ✅"
echo "============================================"
echo ""
echo "FL-ready files in: ${FL_DIR}/"
ls -lh "${FL_DIR}/"
echo ""
echo "Files ready for federated learning:"
echo "  ✓ site1_embeddings_train.parquet"
echo "  ✓ site1_embeddings_test.parquet"
echo "  ✓ site1_labels-train.csv"
echo "  ✓ site1_labels-test.csv"
echo ""
|
extract_site2.sh
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Extract embeddings for Site 2 (Split 2) and stage them for federated learning.
# Usage: bash extract_site2.sh
#
# Steps:
#   1. Run extract-embeddings.py on the split-2 train and test PID CSVs.
#   2. Copy each run's all_embeddings.parquet into ${OUTPUT_BASE}/fl_ready/.
#   3. Write label CSVs holding column 1 and columns 14-20 of the split CSVs.
#
# All relative paths (extract-embeddings.py, SITE_SPLITS, OUTPUT_BASE) are
# resolved against the current working directory.
#
# Environment:
#   NLST_ROOT_DIR  optional; overrides the default NLST data path below.
#
# NOTE(review): the status glyphs in the messages were reconstructed from
# mis-decoded UTF-8; confirm them against the original commit.

# Exit on error, on unset variables, and when any pipeline stage fails.
set -euo pipefail

# Configuration
readonly ROOT_DIR="${NLST_ROOT_DIR:-/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/}"
readonly OUTPUT_BASE="site2_data"
readonly SITE_SPLITS="../subsets/site_splits"
readonly TRAIN_CSV="${SITE_SPLITS}/train_pid_labelsT0T7_split_2.csv"
readonly TEST_CSV="${SITE_SPLITS}/test_pid_labelsT0T7_split_2.csv"
readonly FL_DIR="${OUTPUT_BASE}/fl_ready"

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

echo "============================================"
echo "SITE 2 - Embedding Extraction"
echo "============================================"
echo ""

# Fail before the long-running extraction if an input CSV is missing.
for csv in "${TRAIN_CSV}" "${TEST_CSV}"; do
  [[ -r "${csv}" ]] || die "Missing or unreadable PID CSV: ${csv}"
done

# Create output directories
mkdir -p "${OUTPUT_BASE}/train"
mkdir -p "${OUTPUT_BASE}/test"

# Extract training embeddings
echo "📦 Extracting TRAINING embeddings..."
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${TRAIN_CSV}" \
  --output-dir "${OUTPUT_BASE}/train" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Training embeddings complete!"
echo ""

# Extract test embeddings
echo "📦 Extracting TEST embeddings..."
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${TEST_CSV}" \
  --output-dir "${OUTPUT_BASE}/test" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Test embeddings complete!"
echo ""

# Prepare files for federated learning
echo "📁 Preparing files for federated learning..."
mkdir -p "${FL_DIR}"

# Copy and rename embeddings. This expects each extraction run to have left
# all_embeddings.parquet in its --output-dir; cp fails the script otherwise.
cp -- "${OUTPUT_BASE}/train/all_embeddings.parquet" "${FL_DIR}/site2_embeddings_train.parquet"
cp -- "${OUTPUT_BASE}/test/all_embeddings.parquet" "${FL_DIR}/site2_embeddings_test.parquet"

# Keep column 1 and columns 14-20 (header row included). Per the original
# comment these are the pid and label columns -- confirm against the CSV header.
# A single cut over the whole file replaces the former head/tail pipelines,
# whose exit status hid a missing input file.
echo "Creating site2_labels-train.csv..."
cut -d, -f1,14-20 -- "${TRAIN_CSV}" > "${FL_DIR}/site2_labels-train.csv"

echo "Creating site2_labels-test.csv..."
cut -d, -f1,14-20 -- "${TEST_CSV}" > "${FL_DIR}/site2_labels-test.csv"

echo ""
echo "============================================"
echo "SITE 2 - COMPLETE! ✅"
echo "============================================"
echo ""
echo "FL-ready files in: ${FL_DIR}/"
ls -lh "${FL_DIR}/"
echo ""
echo "Files ready for federated learning:"
echo "  ✓ site2_embeddings_train.parquet"
echo "  ✓ site2_embeddings_test.parquet"
echo "  ✓ site2_labels-train.csv"
echo "  ✓ site2_labels-test.csv"
echo ""
|