VatsalPatel18 committed on
Commit 70884da · verified · 1 Parent(s): 9a4b2a2

Upload 19 files

Dockerfile ADDED
@@ -0,0 +1,26 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.8-slim-buster
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && \
+ apt-get install -y build-essential openslide-tools libgl1-mesa-glx && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ # Copy the entire genomic_plip_model directory contents into the container at /app
+ COPY ./ /app/
+
+ # Install Python dependencies (as root, before dropping privileges)
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Create a non-root user and switch to it for security
+ RUN adduser --disabled-password --gecos '' myuser
+ USER myuser
+
+ EXPOSE 8888
+
+ # Set the entrypoint to a shell command
+ ENTRYPOINT ["/bin/bash"]
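
Note: with this layout the image can be built and started with the usual Docker commands, e.g. `docker build -t genomic-plip .` followed by `docker run -it --rm -p 8888:8888 genomic-plip` (illustrative tag and port mapping; port 8888 simply mirrors the EXPOSE line above, for example for a JupyterLab session launched from the bash entrypoint).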
ReadME.pdf ADDED
Binary file (68.2 kB).
 
benchmark_train_inception.py ADDED
@@ -0,0 +1,112 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import json
5
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator as IDG
6
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
7
+ import argparse
8
+ import pandas as pd
9
+
10
+ # Function to compute additional metrics like AUC, Precision, Recall, and F1 Score
11
+ def compute_additional_metrics(generator, model):
12
+ y_true = generator.classes
13
+ y_pred_prob = model.predict(generator)
14
+ y_pred = np.argmax(y_pred_prob, axis=1)
15
+ auc = roc_auc_score(y_true, y_pred_prob[:, 1])
16
+ precision = precision_score(y_true, y_pred, average='macro')
17
+ recall = recall_score(y_true, y_pred, average='macro')
18
+ f1 = f1_score(y_true, y_pred, average='macro')
19
+ accuracy = accuracy_score(y_true, y_pred)
20
+ return auc, precision, recall, f1, accuracy, y_pred_prob
21
+
22
+ # Function to save evaluation metrics
23
+ def save_evaluation_metrics(generator, model, dataset_name, save_dir):
24
+ auc, precision, recall, f1, accuracy, y_pred_prob = compute_additional_metrics(generator, model)
25
+ metrics = {
26
+ 'auc': auc,
27
+ 'precision': precision,
28
+ 'recall': recall,
29
+ 'f1_score': f1,
30
+ 'accuracy': accuracy
31
+ }
32
+ # Save predictions
33
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=y_pred_prob, labels=generator.classes)
34
+ return metrics
35
+
36
+ if __name__ == "__main__":
37
+ parser = argparse.ArgumentParser(description='Train and evaluate InceptionV3 on benchmark datasets.')
38
+ parser.add_argument('--dataset_dir', type=str, required=True, help='Directory containing train, validate, test, and test2 directories.')
39
+ parser.add_argument('--save_dir', type=str, default='./results/', help='Directory to save the model and evaluation results.')
40
+ parser.add_argument('--epochs', type=int, default=5, help='Number of training epochs.')
41
+
42
+ args = parser.parse_args()
43
+
44
+ train_dir = os.path.join(args.dataset_dir, 'train')
45
+ validate_dir = os.path.join(args.dataset_dir, 'validate')
46
+ test_dir = os.path.join(args.dataset_dir, 'test')
47
+ test2_dir = os.path.join(args.dataset_dir, 'test2')
48
+
49
+ os.makedirs(args.save_dir, exist_ok=True)
50
+
51
+ # Set up InceptionV3 model
52
+ with tf.device('GPU:0'):
53
+ inception = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet', input_shape=(299, 299, 3))
54
+ last_layer = inception.get_layer('mixed10')
55
+ last_output = last_layer.output
56
+ x = tf.keras.layers.GlobalAveragePooling2D()(last_output)
57
+ x = tf.keras.layers.Dense(2, activation='softmax')(x) # Assuming binary classification
58
+ model = tf.keras.Model(inputs=inception.input, outputs=x)
59
+ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'Recall', 'Precision'])
60
+
61
+ # Image data generators
62
+ train_datagen = IDG(rescale=1/255.0, horizontal_flip=True)
63
+ validate_datagen = IDG(rescale=1/255.0)
64
+ test_datagen = IDG(rescale=1/255.0)
65
+
66
+ train_generator = train_datagen.flow_from_directory(train_dir, target_size=(299, 299),
67
+ class_mode='categorical', batch_size=64)
68
+ validate_generator = validate_datagen.flow_from_directory(validate_dir, target_size=(299, 299),
69
+ class_mode='categorical', batch_size=64)
70
+ test_generator = test_datagen.flow_from_directory(test_dir, target_size=(299, 299),
71
+ class_mode='categorical', batch_size=64)
72
+ test2_generator = test_datagen.flow_from_directory(test2_dir, target_size=(299, 299),
73
+ class_mode='categorical', batch_size=64)
74
+
75
+ # Training the model
76
+ hist = model.fit(train_generator, epochs=args.epochs, validation_data=validate_generator, verbose=1, shuffle=True)
77
+
78
+ # Save the trained model
79
+ model.save(os.path.join(args.save_dir, 'inception_model.hdf5'))
80
+
81
+ # Save training history separately
82
+ training_log = {
83
+ 'loss': hist.history['loss'],
84
+ 'val_loss': hist.history['val_loss'],
85
+ 'accuracy': hist.history['accuracy'],
86
+ 'val_accuracy': hist.history['val_accuracy'],
87
+ 'recall': hist.history['recall'],
88
+ 'val_recall': hist.history['val_recall'],
89
+ 'precision': hist.history['precision'],
90
+ 'val_precision': hist.history['val_precision']
91
+ }
92
+ with open(os.path.join(args.save_dir, 'training_log.json'), 'w') as f:
93
+ json.dump(training_log, f)
94
+
95
+ # Evaluate the model on each dataset and save metrics
96
+ train_metrics = save_evaluation_metrics(train_generator, model, "train", args.save_dir)
97
+ validate_metrics = save_evaluation_metrics(validate_generator, model, "validate", args.save_dir)
98
+ test_metrics = save_evaluation_metrics(test_generator, model, "test", args.save_dir)
99
+ test2_metrics = save_evaluation_metrics(test2_generator, model, "test2", args.save_dir)
100
+
101
+ # Save the evaluation metrics in a JSON file
102
+ evaluation_metrics = {
103
+ 'train_metrics': train_metrics,
104
+ 'validate_metrics': validate_metrics,
105
+ 'test_metrics': test_metrics,
106
+ 'test2_metrics': test2_metrics
107
+ }
108
+
109
+ with open(os.path.join(args.save_dir, 'evaluation_metrics.json'), 'w') as f:
110
+ json.dump(evaluation_metrics, f)
111
+
112
+ print("Training and evaluation metrics saved.")
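
One caveat with the evaluation above: `compute_additional_metrics` compares `generator.classes` (stored in directory order) against `model.predict(generator)`, yet the generators are created with Keras' default `shuffle=True`, so the prediction order may not line up with the label order. A minimal sketch of a workaround, assuming the same ImageDataGenerator setup as in the script (the `eval`-prefixed names are illustrative, not part of the upload):

# Build an evaluation-only generator with shuffle=False so that the order of
# model.predict(generator) matches generator.classes.
eval_datagen = IDG(rescale=1/255.0)
test_eval_generator = eval_datagen.flow_from_directory(
    test_dir, target_size=(299, 299), class_mode='categorical',
    batch_size=64, shuffle=False)
test_metrics = save_evaluation_metrics(test_eval_generator, model, "test", args.save_dir)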
benchmark_train_resnet50.py ADDED
@@ -0,0 +1,114 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ import json
5
+ from tensorflow.keras.preprocessing.image import ImageDataGenerator as IDG
6
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
7
+ import argparse
8
+ import pandas as pd
9
+
10
+ # Function to compute additional metrics like AUC, Precision, Recall, and F1 Score
11
+ def compute_additional_metrics(generator, model):
12
+ y_true = generator.classes
13
+ y_pred_prob = model.predict(generator)
14
+ y_pred = np.argmax(y_pred_prob, axis=1)
15
+ auc = roc_auc_score(y_true, y_pred_prob[:, 1])
16
+ precision = precision_score(y_true, y_pred, average='macro')
17
+ recall = recall_score(y_true, y_pred, average='macro')
18
+ f1 = f1_score(y_true, y_pred, average='macro')
19
+ accuracy = accuracy_score(y_true, y_pred)
20
+ return auc, precision, recall, f1, accuracy, y_pred_prob
21
+
22
+ # Function to save evaluation metrics
23
+ def save_evaluation_metrics(generator, model, dataset_name, save_dir):
24
+ auc, precision, recall, f1, accuracy, y_pred_prob = compute_additional_metrics(generator, model)
25
+ metrics = {
26
+ 'auc': auc,
27
+ 'precision': precision,
28
+ 'recall': recall,
29
+ 'f1_score': f1,
30
+ 'accuracy': accuracy
31
+ }
32
+ # Save predictions
33
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=y_pred_prob, labels=generator.classes)
34
+ return metrics
35
+
36
+ if __name__ == "__main__":
37
+ parser = argparse.ArgumentParser(description='Train and evaluate ResNet50 on benchmark datasets.')
38
+ parser.add_argument('--dataset_dir', type=str, required=True, help='Directory containing train, validate, test, and test2 directories.')
39
+ parser.add_argument('--save_dir', type=str, default='./results/', help='Directory to save the model and evaluation results.')
40
+ parser.add_argument('--epochs', type=int, default=10, help='Number of training epochs.')
41
+
42
+ args = parser.parse_args()
43
+
44
+ train_dir = os.path.join(args.dataset_dir, 'train')
45
+ validate_dir = os.path.join(args.dataset_dir, 'validate')
46
+ test_dir = os.path.join(args.dataset_dir, 'test')
47
+ test2_dir = os.path.join(args.dataset_dir, 'test2')
48
+
49
+ os.makedirs(args.save_dir, exist_ok=True)
50
+
51
+ # Set up ResNet50 model
52
+ with tf.device('GPU:0'):
53
+ resnet = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))
54
+ last_layer = resnet.get_layer('conv5_block3_out')
55
+ last_output = last_layer.output
56
+ x = tf.keras.layers.GlobalAveragePooling2D()(last_output)
57
+ x = tf.keras.layers.Dense(2, activation='softmax')(x) # Assuming binary classification
58
+ model = tf.keras.Model(inputs=resnet.input, outputs=x)
59
+ model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'Recall', 'Precision'])
60
+
61
+ # Image data generators
62
+ train_datagen = IDG(rescale=1/255.0, horizontal_flip=True)
63
+ validate_datagen = IDG(rescale=1/255.0)
64
+ test_datagen = IDG(rescale=1/255.0)
65
+
66
+ batch_size = 64
67
+
68
+ train_generator = train_datagen.flow_from_directory(train_dir, target_size=(224, 224),
69
+ class_mode='categorical', batch_size=batch_size)
70
+ validate_generator = validate_datagen.flow_from_directory(validate_dir, target_size=(224, 224),
71
+ class_mode='categorical', batch_size=batch_size)
72
+ test_generator = test_datagen.flow_from_directory(test_dir, target_size=(224, 224),
73
+ class_mode='categorical', batch_size=batch_size)
74
+ test2_generator = test_datagen.flow_from_directory(test2_dir, target_size=(224, 224),
75
+ class_mode='categorical', batch_size=batch_size)
76
+
77
+ # Training the model
78
+ hist = model.fit(train_generator, epochs=args.epochs, validation_data=validate_generator, verbose=1, shuffle=True)
79
+
80
+ # Save the trained model
81
+ model.save(os.path.join(args.save_dir, 'risk_classifier_resnet_model.hdf5'))
82
+
83
+ # Save training history separately
84
+ training_log = {
85
+ 'loss': hist.history['loss'],
86
+ 'val_loss': hist.history['val_loss'],
87
+ 'accuracy': hist.history['accuracy'],
88
+ 'val_accuracy': hist.history['val_accuracy'],
89
+ 'recall': hist.history['recall'],
90
+ 'val_recall': hist.history['val_recall'],
91
+ 'precision': hist.history['precision'],
92
+ 'val_precision': hist.history['val_precision']
93
+ }
94
+ with open(os.path.join(args.save_dir, 'resnet_training_log.json'), 'w') as f:
95
+ json.dump(training_log, f)
96
+
97
+ # Evaluate the model on each dataset and save metrics
98
+ train_metrics = save_evaluation_metrics(train_generator, model, "train", args.save_dir)
99
+ validate_metrics = save_evaluation_metrics(validate_generator, model, "validate", args.save_dir)
100
+ test_metrics = save_evaluation_metrics(test_generator, model, "test", args.save_dir)
101
+ test2_metrics = save_evaluation_metrics(test2_generator, model, "test2", args.save_dir)
102
+
103
+ # Save the evaluation metrics in a JSON file
104
+ evaluation_metrics = {
105
+ 'train_metrics': train_metrics,
106
+ 'validate_metrics': validate_metrics,
107
+ 'test_metrics': test_metrics,
108
+ 'test2_metrics': test2_metrics
109
+ }
110
+
111
+ with open(os.path.join(args.save_dir, 'resnet_evaluation_metrics.json'), 'w') as f:
112
+ json.dump(evaluation_metrics, f)
113
+
114
+ print("Training and evaluation metrics saved.")
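
This script mirrors the InceptionV3 benchmark above with a 224x224 input size and the `conv5_block3_out` feature map, and the same ordering caveat about `generator.classes` versus shuffled predictions applies here. An illustrative invocation with placeholder paths: `python benchmark_train_resnet50.py --dataset_dir Datasets/benchmark --save_dir ./results/ --epochs 10`.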
evaluate_on_test_cohort2.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ import json
6
+ import tensorflow as tf
7
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
8
+ import argparse
9
+
10
+ # Function to load and preprocess the dataset
11
+ def load_and_preprocess_data(metadata_file, data_dir):
12
+ dff = pd.read_csv(metadata_file, skiprows=0)
13
+ if 'Unnamed: 0' in dff.columns:
14
+ del dff['Unnamed: 0']
15
+
16
+ # Filter and map classes to 0 and 1
17
+ classified_df = dff[dff['Class'].isin([1, 3])]
18
+ classified_df['Class'] = classified_df['Class'].map({1: 1, 3: 0})
19
+ df = classified_df.set_index('PatientID')
20
+
21
+ # Filter for patients that have corresponding WSI data
22
+ available_patients = set(os.listdir(data_dir))
23
+ df = df.loc[df.index.intersection(available_patients)]
24
+ df = df.sample(frac=1)
25
+
26
+ return df
27
+
28
+ # Function to create bags of tiles
29
+ def create_bags(df, data_dir):
30
+ data = {'test2': {'X': [], 'Y': []}}
31
+ for pID, row in df.iterrows():
32
+ fol_p = os.path.join(data_dir, pID)
33
+ tiles = os.listdir(fol_p)
34
+ tile_data = []
35
+ for tile in tiles:
36
+ tile_p = os.path.join(fol_p, tile)
37
+ np1 = torch.load(tile_p).numpy()
38
+ tile_data.append(np1)
39
+ bag = np.squeeze(tile_data, axis=1)
40
+ bag_label = row['Class']
41
+ data['test2']['X'].append(bag)
42
+ data['test2']['Y'].append(np.array([bag_label]))
43
+ data['test2']['X'] = np.array(data['test2']['X'])
44
+ data['test2']['Y'] = np.array(data['test2']['Y'])
45
+ print(f"Data[test2]['X'] shape: {data['test2']['X'].shape}, dtype: {data['test2']['X'].dtype}")
46
+ return data
47
+
48
+ # Function to pad the data to ensure uniform bag length
49
+ def prepare_data_with_padding(data, max_length=2000):
50
+ padded_data = []
51
+ for bag in data:
52
+ if len(bag) < max_length:
53
+ padding = np.zeros((max_length - len(bag), bag.shape[1]))
54
+ padded_bag = np.vstack((bag, padding))
55
+ else:
56
+ padded_bag = bag[:max_length]  # truncate bags longer than max_length so the stacked array stays uniform
57
+ padded_data.append(padded_bag)
58
+ return np.array(padded_data)
59
+
60
+ # Function to compute additional metrics using sklearn
61
+ def compute_additional_metrics(X, Y, model):
62
+ predictions = model.predict(X).flatten()
63
+ predictions_binary = (predictions > 0.5).astype(int) # Convert probabilities to class labels (0 or 1)
64
+ auc = roc_auc_score(Y, predictions)
65
+ precision = precision_score(Y, predictions_binary)
66
+ recall = recall_score(Y, predictions_binary)
67
+ f1 = f1_score(Y, predictions_binary)
68
+ return auc, precision, recall, f1, predictions
69
+
70
+ # Function to evaluate the model on a given dataset using sklearn metrics
71
+ def evaluate_dataset(model, X, Y, dataset_name, save_dir):
72
+ # Evaluate using TensorFlow's model.evaluate() for loss and accuracy
73
+ eval_metrics = model.evaluate(X, Y, verbose=0)
74
+
75
+ # Compute additional metrics using sklearn
76
+ auc, precision, recall, f1, predictions = compute_additional_metrics(X, Y, model)
77
+ metrics = {
78
+ 'loss': eval_metrics[0],
79
+ 'accuracy': eval_metrics[1],
80
+ 'auc': auc,
81
+ 'precision': precision,
82
+ 'recall': recall,
83
+ 'f1_score': f1
84
+ }
85
+
86
+ # Save the predictions for each sample
87
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=predictions, labels=Y)
88
+
89
+ return metrics
90
+
91
+ if __name__ == "__main__":
92
+ # Command line arguments
93
+ parser = argparse.ArgumentParser(description='Evaluate a trained model on a secondary test dataset (test2).')
94
+ parser.add_argument('--metadata_file', type=str, required=True, help='Path to the metadata CSV file for test2.')
95
+ parser.add_argument('--data_dir', type=str, required=True, help='Directory containing the extracted tissue features.')
96
+ parser.add_argument('--model_path', type=str, required=True, help='Path to the saved model file.')
97
+ parser.add_argument('--save_dir', type=str, default='./evaluation_results_test2/', help='Directory to save evaluation results.')
98
+
99
+ args = parser.parse_args()
100
+
101
+ if not os.path.exists(args.save_dir):
102
+ os.makedirs(args.save_dir)
103
+
104
+ # Load and preprocess the test2 data
105
+ df_test2 = load_and_preprocess_data(args.metadata_file, args.data_dir)
106
+ data_test2 = create_bags(df_test2, args.data_dir)
107
+
108
+ # Prepare the test2 data with padding
109
+ test2_X = prepare_data_with_padding(data_test2['test2']['X'], max_length=2000)
110
+ test2_Y = np.array(data_test2['test2']['Y']).flatten()
111
+
112
+ # Load the saved model
113
+ model = tf.keras.models.load_model(args.model_path)
114
+
115
+ # Evaluate the model on the test2 dataset
116
+ test2_metrics = evaluate_dataset(model, test2_X, test2_Y, "test2", args.save_dir)
117
+
118
+ # Save the metrics to a JSON file
119
+ with open(os.path.join(args.save_dir, 'evaluation_metrics_test2.json'), 'w') as f:
120
+ json.dump(test2_metrics, f, indent=4)
121
+
122
+ print("Evaluation metrics saved to evaluation_metrics_test2.json")
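
For reference, `--data_dir` is expected to hold one sub-directory of saved `.pt` feature tensors per patient (the layout produced by extract_omics_aligned_tiles_features.py below), and `--model_path` should point at a Keras classifier trained on the padded 2000-tile bags. An illustrative call with placeholder file names: `python evaluate_on_test_cohort2.py --metadata_file data/cohort2_metadata.csv --data_dir omics_align_features/ --model_path ./model_save/risk_classifier.h5`.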
evaluate_risk_classifier.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
5
+ import argparse
6
+ import json
7
+ import pandas as pd
8
+
9
+ # Function to compute additional metrics like AUC, Precision, Recall, and F1 Score
10
+ def compute_additional_metrics(X, Y, model):
11
+ predictions = model.predict(X).flatten()
12
+ predictions_binary = (predictions > 0.5).astype(int) # Convert probabilities to class labels (0 or 1)
13
+ auc = roc_auc_score(Y, predictions)
14
+ precision = precision_score(Y, predictions_binary)
15
+ recall = recall_score(Y, predictions_binary)
16
+ f1 = f1_score(Y, predictions_binary)
17
+ return auc, precision, recall, f1, predictions
18
+
19
+ # Function to evaluate the model on a given dataset
20
+ def evaluate_dataset(model, X, Y, dataset_name, save_dir):
21
+ eval_metrics = model.evaluate(X, Y, verbose=0)
22
+ auc, precision, recall, f1, predictions = compute_additional_metrics(X, Y, model)
23
+ metrics = {
24
+ 'loss': eval_metrics[0],
25
+ 'accuracy': eval_metrics[1],
26
+ 'auc': auc,
27
+ 'precision': precision,
28
+ 'recall': recall,
29
+ 'f1_score': f1
30
+ }
31
+
32
+ # Save the predictions for each sample
33
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=predictions, labels=Y)
34
+
35
+ return metrics
36
+
37
+ # Function to evaluate the model on train, validate, and test datasets
38
+ def evaluate_all_datasets(model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, save_dir):
39
+ train_metrics = evaluate_dataset(model, train_X, train_Y, "train", save_dir)
40
+ validate_metrics = evaluate_dataset(model, validate_X, validate_Y, "validate", save_dir)
41
+ test_metrics = evaluate_dataset(model, test_X, test_Y, "test", save_dir)
42
+
43
+ metrics = {
44
+ 'train': train_metrics,
45
+ 'validate': validate_metrics,
46
+ 'test': test_metrics
47
+ }
48
+
49
+ # Display the metrics in a tabular format
50
+ metrics_df = pd.DataFrame(metrics).T
51
+ print(metrics_df.to_string())
52
+
53
+ # Save metrics to a JSON file
54
+ with open(os.path.join(save_dir, 'evaluation_metrics.json'), 'w') as f:
55
+ json.dump(metrics, f, indent=4)
56
+
57
+ print("Evaluation metrics saved to evaluation_metrics.json")
58
+
59
+ return metrics
60
+
61
+ if __name__ == "__main__":
62
+ # Command line arguments
63
+ parser = argparse.ArgumentParser(description='Evaluate a trained multiple instance learning classifier on risk data.')
64
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training, validation, and test data.')
65
+ parser.add_argument('--model_path', type=str, required=True, help='Path to the saved model file.')
66
+ parser.add_argument('--save_dir', type=str, default='./evaluation_results/', help='Directory to save the evaluation results.')
67
+
68
+ args = parser.parse_args()
69
+
70
+ if not os.path.exists(args.save_dir):
71
+ os.makedirs(args.save_dir)
72
+
73
+ # Load the preprocessed data
74
+ data = np.load(args.data_file)
75
+ train_X, train_Y = data['train_X'], data['train_Y']
76
+ validate_X, validate_Y = data['validate_X'], data['validate_Y']
77
+ test_X, test_Y = data['test_X'], data['test_Y']
78
+
79
+ # Load the saved model
80
+ model = tf.keras.models.load_model(args.model_path)
81
+
82
+ # Evaluate the model
83
+ metrics = evaluate_all_datasets(model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, args.save_dir)
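
The `--data_file` argument here is the compressed archive written by make_train_data_for_risk_classification.py (keys `train_X`, `train_Y`, `validate_X`, `validate_Y`, `test_X`, `test_Y`), and `--model_path` is any Keras model saved after training on those padded bags.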
extract_omics_aligned_tiles_features.py ADDED
@@ -0,0 +1,55 @@
1
+ import os
2
+ import torch
3
+ from torch.utils.data import Dataset
4
+ from pathlib import Path
5
+ import argparse
6
+ from scripts.genomic_plip_model import GenomicPLIPModel
7
+ from transformers import CLIPVisionModel
8
+
9
+ class PatientTileDataset(Dataset):
10
+ def __init__(self, data_dir, model, save_dir):
11
+ super().__init__()
12
+ self.data_dir = data_dir
13
+ self.model = model
14
+ self.save_dir = Path(save_dir)
15
+ self.files = []
16
+ for patient_id in os.listdir(data_dir):
17
+ patient_dir = os.path.join(data_dir, patient_id)
18
+ if os.path.isdir(patient_dir):
19
+ for f in os.listdir(patient_dir):
20
+ if f.endswith('.pt'):
21
+ self.files.append((os.path.join(patient_dir, f), patient_id))
22
+
23
+ def __len__(self):
24
+ return len(self.files)
25
+
26
+ def __getitem__(self, idx):
27
+ file_path, patient_id = self.files[idx]
28
+ data = torch.load(file_path)
29
+ tile_data = torch.from_numpy(data['tile_data'][0]).unsqueeze(0) # Add batch dimension
30
+ with torch.no_grad():
31
+ vision_features, _ = self.model(pixel_values=tile_data, score_vector=torch.zeros(1, 4))
32
+ feature_path = self.save_dir / patient_id / os.path.basename(file_path)
33
+ feature_path.parent.mkdir(parents=True, exist_ok=True)
34
+ torch.save(vision_features, feature_path)
35
+ return feature_path
36
+
37
+ def extract_features(data_dir, save_dir, model_path):
38
+ original_model = CLIPVisionModel.from_pretrained("./plip/")
39
+ custom_model = GenomicPLIPModel(original_model)
40
+ custom_model.load_state_dict(torch.load(model_path))
41
+ custom_model.eval()
42
+
43
+ dataset = PatientTileDataset(data_dir=data_dir, model=custom_model, save_dir=save_dir)
44
+ for _ in dataset:
45
+ pass
46
+
47
+ if __name__ == "__main__":
48
+ parser = argparse.ArgumentParser(description="Extract features from genomic aligned tiles.")
49
+ parser.add_argument('--data_dir', type=str, default='plip_preprocess/', help='Directory containing the pre processed patient data.')
50
+ parser.add_argument('--save_dir', type=str, default='omics_align_features/', help='Directory to save the extracted features.')
51
+ parser.add_argument('--model_path', type=str, default='./save_model/omics_plip.pth', help='Path to the trained model file.')
52
+
53
+ args = parser.parse_args()
54
+
55
+ extract_features(args.data_dir, args.save_dir, args.model_path)
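
The tensors written here (one `.pt` file per tile, grouped per patient under `--save_dir`) are what the bag-building steps below load back with `torch.load`. Note that the script also expects a local `./plip/` checkpoint for `CLIPVisionModel.from_pretrained`, in addition to the fine-tuned weights passed via `--model_path`.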
extract_tiles_from_wsi.py ADDED
@@ -0,0 +1,37 @@
1
+ import argparse
2
+ from scripts.slide_processor_parallel import SlideProcessor
3
+
4
+ def main():
5
+ parser = argparse.ArgumentParser(description='Process whole slide images from a directory.')
6
+
7
+ # Required arguments
8
+ parser.add_argument('-d', '--directory', type=str, required=True,
9
+ help='Directory containing whole slide image files.')
10
+ parser.add_argument('-o', '--output_dir', type=str, required=True,
11
+ help='Directory to save the processed tiles.')
12
+
13
+ # Optional arguments with defaults
14
+ parser.add_argument('-t', '--tile_size', type=int, default=1024,
15
+ help='Size of the tile in pixels (default: 1024).')
16
+ parser.add_argument('-v', '--overlap', type=int, default=0,
17
+ help='Overlap of tiles in pixels (default: 0).')
18
+ parser.add_argument('-th', '--tissue_threshold', type=float, default=0.65,
19
+ help='Threshold for tissue detection as a float (default: 0.65).')
20
+ parser.add_argument('-w', '--max_workers', type=int, default=30,
21
+ help='Maximum number of worker threads/processes (default: 30).')
22
+
23
+ args = parser.parse_args()
24
+
25
+ # Initialize the SlideProcessor with the parsed arguments
26
+ processor = SlideProcessor(
27
+ tile_size=args.tile_size,
28
+ overlap=args.overlap,
29
+ tissue_threshold=args.tissue_threshold,
30
+ max_workers=args.max_workers
31
+ )
32
+
33
+ # Start the processing
34
+ processor.parallel_process(base_dir=args.directory, output_dir=args.output_dir)
35
+
36
+ if __name__ == '__main__':
37
+ main()
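
An illustrative invocation with placeholder paths: `python extract_tiles_from_wsi.py -d /path/to/wsi_slides -o ./tiles/ -t 1024 -th 0.65 -w 8`. The defaults assume OpenSlide-readable whole slide images, which is why the Dockerfile installs `openslide-tools`.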
make_dataset_for_benchmark_models.py ADDED
@@ -0,0 +1,101 @@
1
+ import os
2
+ import random
3
+ import shutil
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split
6
+ import argparse
7
+
8
+ def load_and_preprocess_data(metadata_file, data_dir):
9
+ df = pd.read_csv(metadata_file, skiprows=0)
10
+ if 'Unnamed: 0' in df.columns:
11
+ del df['Unnamed: 0']
12
+
13
+ # Filter and map classes to 0 and 1
14
+ classified_df = df[df['Class'].isin([1, 3])]
15
+ classified_df['Class'] = classified_df['Class'].map({1: 1, 3: 0})
16
+ df = classified_df.set_index('PatientID')
17
+
18
+ # Filter for patients that have corresponding WSI data
19
+ available_patients = set(os.listdir(data_dir))
20
+ df = df.loc[df.index.intersection(available_patients)]
21
+ df = df.sample(frac=1)
22
+
23
+ return df
24
+
25
+ def create_data_splits(df):
26
+ class1 = list(df[df['Class'] == 1].index)
27
+ class0 = list(df[df['Class'] == 0].index)
28
+
29
+ C1_X_train, C1_X_test = train_test_split(class1, test_size=0.3)
30
+ C0_X_train, C0_X_test = train_test_split(class0, test_size=0.2)
31
+ C1_X_validate, C1_X_test = train_test_split(C1_X_test, test_size=0.6)
32
+ C0_X_validate, C0_X_test = train_test_split(C0_X_test, test_size=0.5)
33
+
34
+ X_train = []; X_train.extend(C1_X_train); X_train.extend(C0_X_train)
35
+ X_test = []; X_test.extend(C1_X_test); X_test.extend(C0_X_test)
36
+ X_validate = []; X_validate.extend(C1_X_validate); X_validate.extend(C0_X_validate)
37
+
38
+ random.shuffle(X_train)
39
+ random.shuffle(X_test)
40
+ random.shuffle(X_validate)
41
+
42
+ data_info = {'train': X_train, 'test': X_test, 'validate': X_validate}
43
+
44
+ print(" C0 - Train : {} , Validate : {} , Test : {} ".format(len(C0_X_train), len(C0_X_test), len(C0_X_validate)))
45
+ print(" C1 - Train : {} , Validate : {} , Test : {} ".format(len(C1_X_train), len(C1_X_test), len(C1_X_validate)))
46
+
47
+ return data_info
48
+
49
+ def copy_tiles(patient_ids, dest_folder, source_dir, num_tiles_per_patient):
50
+ for pID in patient_ids:
51
+ flp = os.path.join(source_dir, pID)
52
+ if os.path.exists(flp):
53
+ tiles = os.listdir(flp)
54
+ selected_tiles = random.sample(tiles, min(num_tiles_per_patient, len(tiles)))
55
+ for tile in selected_tiles:
56
+ tile_p = os.path.join(flp, tile)
57
+ new_p = os.path.join(dest_folder, tile)
58
+ shutil.copy(tile_p, new_p)
59
+ else:
60
+ print(f"Folder not found for patient {pID}")
61
+
62
+ def process_cohorts(primary_metadata, secondary_metadata, source_dir, dataset_dir, num_tiles_per_patient):
63
+ # Create necessary directories if they don't exist
64
+ os.makedirs(os.path.join(dataset_dir, 'train/class1/'), exist_ok=True)
65
+ os.makedirs(os.path.join(dataset_dir, 'train/class0/'), exist_ok=True)
66
+ os.makedirs(os.path.join(dataset_dir, 'test/class1/'), exist_ok=True)
67
+ os.makedirs(os.path.join(dataset_dir, 'test/class0/'), exist_ok=True)
68
+ os.makedirs(os.path.join(dataset_dir, 'validate/class1/'), exist_ok=True)
69
+ os.makedirs(os.path.join(dataset_dir, 'validate/class0/'), exist_ok=True)
70
+ os.makedirs(os.path.join(dataset_dir, 'test2/class1/'), exist_ok=True)
71
+ os.makedirs(os.path.join(dataset_dir, 'test2/class0/'), exist_ok=True)
72
+
73
+ # Load and preprocess primary cohort
74
+ primary_df = load_and_preprocess_data(primary_metadata, source_dir)
75
+ primary_data_info = create_data_splits(primary_df)
76
+
77
+ # Load and preprocess secondary cohort
78
+ secondary_df = load_and_preprocess_data(secondary_metadata, source_dir)
79
+ secondary_data_info = {'test2': secondary_df.index.tolist()}
80
+
81
+ # Copy tiles for the primary cohort
82
+ copy_tiles(primary_data_info['train'], os.path.join(dataset_dir, 'train/class1/'), source_dir, num_tiles_per_patient)
83
+ copy_tiles(primary_data_info['test'], os.path.join(dataset_dir, 'test/class1/'), source_dir, num_tiles_per_patient)
84
+ copy_tiles(primary_data_info['validate'], os.path.join(dataset_dir, 'validate/class1/'), source_dir, num_tiles_per_patient)
85
+
86
+ # Copy tiles for the secondary cohort
87
+ copy_tiles(secondary_data_info['test2'], os.path.join(dataset_dir, 'test2/class1/'), source_dir, num_tiles_per_patient)
88
+
89
+ print("Tiles copying completed for both cohorts.")
90
+
91
+ if __name__ == "__main__":
92
+ parser = argparse.ArgumentParser(description='Create dataset for benchmark models from primary and secondary cohorts.')
93
+ parser.add_argument('--primary_metadata', type=str, required=True, help='Path to the primary cohort metadata CSV file.')
94
+ parser.add_argument('--secondary_metadata', type=str, required=True, help='Path to the secondary cohort metadata CSV file.')
95
+ parser.add_argument('--source_dir', type=str, required=True, help='Directory containing raw tissue tiles.')
96
+ parser.add_argument('--dataset_dir', type=str, required=True, help='Directory to save the processed dataset.')
97
+ parser.add_argument('--num_tiles_per_patient', type=int, default=595, help='Number of tiles to select per patient.')
98
+
99
+ args = parser.parse_args()
100
+
101
+ process_cohorts(args.primary_metadata, args.secondary_metadata, args.source_dir, args.dataset_dir, args.num_tiles_per_patient)
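
One thing to be aware of: `process_cohorts` creates `class0/` and `class1/` folders for every split but then copies all selected patients, both labels, into the `class1/` folders only, so the `class0/` directories stay empty and `flow_from_directory` in the benchmark scripts would see a single class. A minimal class-aware variant is sketched below under the assumption that the patient label lives in the `Class` column of the frame returned by `load_and_preprocess_data`; `route_tiles_by_class` is an illustrative helper, not part of this upload:

def route_tiles_by_class(patient_ids, split_name, df, source_dir, dataset_dir, num_tiles_per_patient):
    # Send each patient's tiles to class1/ or class0/ according to the 0/1 label in df
    # (df must be indexed by PatientID, as in load_and_preprocess_data).
    for pID in patient_ids:
        label = int(df.loc[pID, 'Class'])
        dest = os.path.join(dataset_dir, split_name, f'class{label}/')
        copy_tiles([pID], dest, source_dir, num_tiles_per_patient)

# e.g. route_tiles_by_class(primary_data_info['train'], 'train', primary_df,
#                           source_dir, dataset_dir, num_tiles_per_patient)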
make_train_data_for_omics_plip.py ADDED
@@ -0,0 +1,62 @@
1
+ import os
2
+ import random
3
+ import shutil
4
+ from sklearn.model_selection import train_test_split
5
+ import argparse
+ import json
6
+
7
+ def main(num_tiles_per_patient, source_dir, dataset_dir, test_val_size, val_size):
8
+ # Create necessary directories if they don't exist
9
+ os.makedirs(os.path.join(dataset_dir, 'train'), exist_ok=True)
10
+ os.makedirs(os.path.join(dataset_dir, 'test'), exist_ok=True)
11
+ os.makedirs(os.path.join(dataset_dir, 'validate'), exist_ok=True)
12
+
13
+
14
+ with open('./data/tcga-hnscc-patients.json', 'r') as f:
15
+ patient_data = json.load(f)
16
+
17
+ # Separate the patients into study and restricted groups
18
+ study_patients = patient_data['study']
19
+ restricted_patients = patient_data['restricted']
20
+
21
+ # List all patient directories in the source directory
22
+ files = os.listdir(source_dir)
23
+
24
+ # Filter files based on study patients
25
+ study_files = [file for file in files if file in study_patients]
26
+
27
+ # Split the data into train, test, and validation sets
28
+ train, test_val = train_test_split(study_files, test_size=test_val_size)  # split only the study-cohort patients
29
+ test, val = train_test_split(test_val, test_size=val_size)
30
+
31
+ # Function to process and copy files
32
+ def process_and_copy(file_list, type):
33
+ for file in file_list:
34
+ fol_p = os.path.join(source_dir, file)
35
+ tiles = os.listdir(fol_p)
36
+ selected_tiles = random.sample(tiles, min(num_tiles_per_patient, len(tiles)))
37
+ for tile in selected_tiles:
38
+ tile_p = os.path.join(fol_p, tile)
39
+ new_p = os.path.join(dataset_dir, type, tile)
40
+ shutil.copy(tile_p, new_p)
41
+
42
+ # Process and copy files for each dataset
43
+ process_and_copy(train, 'train')
44
+ process_and_copy(test, 'test')
45
+ process_and_copy(val, 'validate')
46
+
47
+ if __name__ == "__main__":
48
+ parser = argparse.ArgumentParser(description='Split data into train, test, and validation sets.')
49
+ parser.add_argument('--num_tiles_per_patient', type=int, default=595,
50
+ help='Number of tiles to select per patient.')
51
+ parser.add_argument('--source_dir', type=str, default='plip_preprocess',
52
+ help='Directory containing patient folders.')
53
+ parser.add_argument('--dataset_dir', type=str, default='Datasets/train_03',
54
+ help='Root directory for the train, test, and validate directories.')
55
+ parser.add_argument('--test_val_size', type=float, default=0.4,
56
+ help='Size of the test and validation sets combined.')
57
+ parser.add_argument('--val_size', type=float, default=0.5,
58
+ help='Proportion of validation set in the test-validation split.')
59
+
60
+ args = parser.parse_args()
61
+
62
+ main(args.num_tiles_per_patient, args.source_dir, args.dataset_dir, args.test_val_size, args.val_size)
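
This script reads `./data/tcga-hnscc-patients.json`, which is expected to map the keys `study` and `restricted` to lists of patient IDs; only the study-cohort folders found under `--source_dir` are split and copied into `train/`, `test/` and `validate/`.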
make_train_data_for_risk_classification.py ADDED
@@ -0,0 +1,107 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ import random
6
+ from sklearn.model_selection import train_test_split
7
+ import argparse
8
+
9
+ def prepare_data_with_padding(data, max_length=None):
10
+ if max_length is None:
11
+ max_length = max(len(bag) for bag in data)
12
+ padded_data = []
13
+ for bag in data:
14
+ if len(bag) < max_length:
15
+ padding = np.zeros((max_length - len(bag), bag.shape[1]))
16
+ padded_bag = np.vstack((bag, padding))
17
+ else:
18
+ padded_bag = bag
19
+ padded_data.append(padded_bag)
20
+ return np.array(padded_data), max_length
21
+
22
+ def create_bags(data_info, df13, data_dir):
23
+ data = {'train': {'X': [], 'Y': []}, 'test': {'X': [], 'Y': []}, 'validate': {'X': [], 'Y': []}}
24
+ for split in ['train', 'test', 'validate']:
25
+ for pID in data_info[split]:
26
+ fol_p = os.path.join(data_dir, pID)
27
+ tiles = os.listdir(fol_p)
28
+ tile_data = []
29
+ for tile in tiles:
30
+ tile_p = os.path.join(fol_p, tile)
31
+ np1 = torch.load(tile_p).numpy()
32
+ tile_data.append(np1)
33
+ patient_label = df13.loc[pID, 'Class']
34
+ bag = np.squeeze(tile_data, axis=1)
35
+ bag_label = 1 if patient_label == 1 else 0
36
+ data[split]['X'].append(bag)
37
+ data[split]['Y'].append(np.array([bag_label]))
38
+ data[split]['X'] = np.array(data[split]['X'])
39
+ data[split]['Y'] = np.array(data[split]['Y'])
40
+ print(f"Data[{split}]['X'] shape: {data[split]['X'].shape}, dtype: {data[split]['X'].dtype}")
41
+ return data
42
+
43
+ def process_and_save(data_dir, metadata_file, save_dir):
44
+ # Load and preprocess metadata
45
+ dff = pd.read_csv(metadata_file, skiprows=0)
46
+ del dff['Unnamed: 0']
47
+
48
+ classified_df = dff[dff['Class'].isin([1, 3])]
49
+ classified_df['Class'] = classified_df['Class'].map({1: 1, 3: 0})
50
+ df13 = classified_df.set_index('PatientID')
51
+
52
+ there = set(list(df13.index))
53
+ wsi_there = os.listdir(data_dir)
54
+ use = list(there.intersection(wsi_there))
55
+ df13 = df13.loc[use]
56
+ df13 = df13.sample(frac=1)
57
+
58
+ class1 = list(df13[df13['Class'] == 1].index)
59
+ class0 = list(df13[df13['Class'] == 0].index)
60
+
61
+ C1_X_train, C1_X_test = train_test_split(class1, test_size=0.3)
62
+ C0_X_train, C0_X_test = train_test_split(class0, test_size=0.2)
63
+ C1_X_validate, C1_X_test = train_test_split(C1_X_test, test_size=0.6)
64
+ C0_X_validate, C0_X_test = train_test_split(C0_X_test, test_size=0.5)
65
+
66
+ X_train = C1_X_train + C0_X_train
67
+ X_test = C1_X_test + C0_X_test
68
+ X_validate = C1_X_validate + C0_X_validate
69
+
70
+ random.shuffle(X_train)
71
+ random.shuffle(X_test)
72
+ random.shuffle(X_validate)
73
+
74
+ data_info = {'train': X_train, 'test': X_test, 'validate': X_validate}
75
+
76
+ print(" C0 - Train : {} , Validate : {} , Test : {} ".format(len(C0_X_train), len(C0_X_test), len(C0_X_validate)))
77
+ print(" C1 - Train : {} , Validate : {} , Test : {} ".format(len(C1_X_train), len(C1_X_test), len(C1_X_validate)))
78
+
79
+ # Create bags and prepare data with padding
80
+ data = create_bags(data_info, df13, data_dir)
81
+ train_X, _ = prepare_data_with_padding(data['train']['X'], 2000)
82
+ train_Y = np.array(data['train']['Y']).flatten()
83
+ validate_X, _ = prepare_data_with_padding(data['validate']['X'], 2000)
84
+ validate_Y = np.array(data['validate']['Y']).flatten()
85
+ test_X, _ = prepare_data_with_padding(data['test']['X'], 2000)
86
+ test_Y = np.array(data['test']['Y']).flatten()
87
+
88
+ # Save the processed arrays to a single file
89
+ np.savez_compressed(os.path.join(save_dir, 'training_risk_classifier_data.npz'),
90
+ train_X=train_X, train_Y=train_Y,
91
+ validate_X=validate_X, validate_Y=validate_Y,
92
+ test_X=test_X, test_Y=test_Y)
93
+
94
+ print("Data saved successfully in:", os.path.join(save_dir, 'training_risk_classifier_data.npz'))
95
+
96
+ if __name__ == "__main__":
97
+ parser = argparse.ArgumentParser(description='Process, split, and save the data with padding.')
98
+ parser.add_argument('--data_dir', type=str, help='Directory containing the extracted features.')
99
+ parser.add_argument('--metadata_file', type=str, default='data/data1.hnsc.p3.csv', help='CSV file containing the metadata for the samples.')
100
+ parser.add_argument('--save_dir', type=str, default='Datasets', help='Directory to save the processed data.')
101
+
102
+ args = parser.parse_args()
103
+
104
+ if not os.path.exists(args.save_dir):
105
+ os.makedirs(args.save_dir)
106
+
107
+ process_and_save(args.data_dir, args.metadata_file, args.save_dir)
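
The archive saved here is the `--data_file` consumed by train_GWSIF_classifier.py and evaluate_risk_classifier.py. A quick, illustrative sanity check of its contents (the path is the default produced above):

import numpy as np

data = np.load("Datasets/training_risk_classifier_data.npz")
for split in ("train", "validate", "test"):
    X, Y = data[f"{split}_X"], data[f"{split}_Y"]
    # X: (n_patients, 2000, feature_dim) padded bags; Y: (n_patients,) 0/1 labels
    print(split, X.shape, Y.shape)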
pre_process_tiles.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ import pandas as pd
3
+ from scripts.PlipDataProcess import PlipDataProcess # Updated folder name
4
+ from transformers import CLIPImageProcessor
5
+ import argparse
6
+
7
+ def main(csv_file, root_dir, save_dir):
8
+ # Load the CSV file and set 'PatientID' as the index
9
+ df4 = pd.read_csv(csv_file).set_index('PatientID')
10
+
11
+ # List directories in the root directory (assuming each directory corresponds to a patient)
12
+ files = [file for file in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, file))]
13
+
14
+ # Initialize the image processor
15
+ img_processor = CLIPImageProcessor.from_pretrained("./plip/")
16
+
17
+ # Initialize the dataset processing object
18
+ dataset = PlipDataProcess(
19
+ root_dir=root_dir,
20
+ files=files,
21
+ df=df4,
22
+ img_processor=img_processor,
23
+ num_tiles_per_patient=2000,
24
+ max_workers=64,
25
+ save_dir=save_dir
26
+ )
27
+
28
+ # Process each item in the dataset
29
+ for i in range(len(dataset)):
30
+ _ = dataset[i] # Trigger processing of the i-th item
31
+
32
+ if __name__ == '__main__':
33
+ parser = argparse.ArgumentParser(description="Process WSI images and generate tiles")
34
+
35
+ # Define arguments
36
+ parser.add_argument('--csv_file', type=str, required=True, help='Path to the CSV file with patient scores')
37
+ parser.add_argument('--root_dir', type=str, required=True, help='Root directory for WSI tiles')
38
+ parser.add_argument('--save_dir', type=str, required=True, help='Directory to save the processed tile data')
39
+
40
+ # Parse arguments
41
+ args = parser.parse_args()
42
+
43
+ # Call the main function with the parsed arguments
44
+ main(csv_file=args.csv_file, root_dir=args.root_dir, save_dir=args.save_dir)
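
As with the feature-extraction step, this script expects a local `./plip/` directory holding the PLIP image-processor files for `CLIPImageProcessor.from_pretrained`. An illustrative call with placeholder file names: `python pre_process_tiles.py --csv_file data/patient_scores.csv --root_dir ./tiles/ --save_dir plip_preprocess/` (the `plip_preprocess/` output matches the default input of the downstream scripts).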
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ numpy==1.19.2
+ pandas==1.3.4
+ matplotlib==3.5.2
+ openslide-python==1.1.2
+ scikit-image==0.18.1
+ scikit-learn==1.2.1
+ tqdm==4.62.3
+ Pillow==9.4.0
+ transformers==4.33.2
+ torch==2.0.1
+ jupyterlab==3.2.1
+ tensorflow==2.6.1
requirementsT.txt ADDED
@@ -0,0 +1,217 @@
1
+ absl-py==2.1.0
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ anndata==0.9.1
5
+ anyio @ file:///home/conda/feedstock_root/build_artifacts/anyio_1666191106763/work/dist
6
+ appdirs==1.4.4
7
+ argon2-cffi @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi_1640817743617/work
8
+ argon2-cffi-bindings @ file:///home/conda/feedstock_root/build_artifacts/argon2-cffi-bindings_1649500328244/work
9
+ astor==0.8.1
10
+ asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1670263926556/work
11
+ astunparse==1.6.3
12
+ async-timeout==4.0.3
13
+ attrs @ file:///home/conda/feedstock_root/build_artifacts/attrs_1671632566681/work
14
+ autograd==1.5
15
+ autograd-gamma==0.5.0
16
+ Babel @ file:///home/conda/feedstock_root/build_artifacts/babel_1677767029043/work
17
+ backcall @ file:///home/conda/feedstock_root/build_artifacts/backcall_1592338393461/work
18
+ backports.functools-lru-cache @ file:///home/conda/feedstock_root/build_artifacts/backports.functools_lru_cache_1618230623929/work
19
+ beautifulsoup4 @ file:///home/conda/feedstock_root/build_artifacts/beautifulsoup4_1680888073205/work
20
+ bleach @ file:///home/conda/feedstock_root/build_artifacts/bleach_1674535352125/work
21
+ Bottleneck @ file:///opt/conda/conda-bld/bottleneck_1657175564434/work
22
+ brotlipy==0.7.0
23
+ certifi @ file:///home/conda/feedstock_root/build_artifacts/certifi_1720457958366/work/certifi
24
+ cffi @ file:///croot/cffi_1670423208954/work
25
+ charset-normalizer @ file:///tmp/build/80754af9/charset-normalizer_1630003229654/work
26
+ cmake==3.27.2
27
+ contourpy @ file:///opt/conda/conda-bld/contourpy_1663827406301/work
28
+ cryptography @ file:///croot/cryptography_1677533068310/work
29
+ cycler @ file:///tmp/build/80754af9/cycler_1637851556182/work
30
+ datasets==2.19.1
31
+ debugpy @ file:///home/builder/ci_310/debugpy_1640789504635/work
32
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1641555617451/work
33
+ defusedxml @ file:///home/conda/feedstock_root/build_artifacts/defusedxml_1615232257335/work
34
+ dill==0.3.8
35
+ entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1643888246732/work
36
+ executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1667317341051/work
37
+ fastjsonschema @ file:///home/conda/feedstock_root/build_artifacts/python-fastjsonschema_1677336799617/work/dist
38
+ filelock @ file:///croot/filelock_1672387128942/work
39
+ flatbuffers==24.3.25
40
+ flit_core @ file:///croot/flit-core_1679397103445/work/source/flit_core
41
+ fonttools==4.25.0
42
+ formulaic==0.5.2
43
+ frozenlist==1.4.1
44
+ fsspec==2023.6.0
45
+ future==0.18.3
46
+ gast==0.5.4
47
+ git-filter-repo==2.38.0
48
+ gmpy2 @ file:///tmp/build/80754af9/gmpy2_1645455533097/work
49
+ google-pasta==0.2.0
50
+ graphviz==0.20.1
51
+ grpcio==1.64.0
52
+ h5py==3.11.0
53
+ huggingface-hub==0.23.2
54
+ idna @ file:///croot/idna_1666125576474/work
55
+ imageio==2.34.2
56
+ imbalanced-learn==0.11.0
57
+ importlib-metadata @ file:///home/conda/feedstock_root/build_artifacts/importlib-metadata_1680895625127/work
58
+ importlib-resources @ file:///home/conda/feedstock_root/build_artifacts/importlib_resources_1676919000169/work
59
+ interface-meta==1.3.0
60
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1655369107642/work
61
+ ipython @ file:///home/conda/feedstock_root/build_artifacts/ipython_1680185408135/work
62
+ ipython-genutils==0.2.0
63
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1669134318875/work
64
+ Jinja2 @ file:///croot/jinja2_1666908132255/work
65
+ joblib==1.4.2
66
+ json5 @ file:///home/conda/feedstock_root/build_artifacts/json5_1600692310011/work
67
+ jsonpickle==3.0.1
68
+ jsonschema @ file:///home/conda/feedstock_root/build_artifacts/jsonschema-meta_1669810440410/work
69
+ jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
70
+ jupyter-server @ file:///home/conda/feedstock_root/build_artifacts/jupyter_server_1676473377907/work
71
+ jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1669775088561/work
72
+ jupyterlab @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_1674494302491/work
73
+ jupyterlab-pygments @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_pygments_1649936611996/work
74
+ jupyterlab_server @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_server_1680275157923/work
75
+ keras==3.3.3
76
+ keras-tuner==1.4.7
77
+ kiwisolver @ file:///croot/kiwisolver_1672387140495/work
78
+ kt-legacy==1.0.5
79
+ lazy_loader==0.4
80
+ libclang==18.1.1
81
+ lifelines==0.27.4
82
+ lit==16.0.6
83
+ llvmlite==0.40.0
84
+ Markdown==3.6
85
+ markdown-it-py==3.0.0
86
+ MarkupSafe @ file:///opt/conda/conda-bld/markupsafe_1654597864307/work
87
+ matplotlib @ file:///croot/matplotlib-suite_1679593461707/work
88
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1660814786464/work
89
+ matplotlib-venn==0.11.9
90
+ mdurl==0.1.2
91
+ mil==1.0.5
92
+ mistune @ file:///home/conda/feedstock_root/build_artifacts/mistune_1675771498296/work
93
+ mkl-fft==1.3.1
94
+ mkl-random @ file:///home/builder/ci_310/mkl_random_1641843545607/work
95
+ mkl-service==2.4.0
96
+ ml-dtypes==0.3.2
97
+ mpmath==1.2.1
98
+ multidict==6.0.5
99
+ multiprocess==0.70.16
100
+ munkres==1.1.4
101
+ namex==0.0.8
102
+ natsort==8.3.1
103
+ nbclassic @ file:///home/conda/feedstock_root/build_artifacts/nbclassic_1680699279518/work
104
+ nbclient @ file:///home/conda/feedstock_root/build_artifacts/nbclient_1680676954923/work
105
+ nbconvert @ file:///home/conda/feedstock_root/build_artifacts/nbconvert-meta_1680629662454/work
106
+ nbformat @ file:///home/conda/feedstock_root/build_artifacts/nbformat_1679336765223/work
107
+ nest-asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1664684991461/work
108
+ networkx @ file:///croot/networkx_1678964333703/work
109
+ notebook @ file:///home/conda/feedstock_root/build_artifacts/notebook_1680870634737/work
110
+ notebook_shim @ file:///home/conda/feedstock_root/build_artifacts/notebook-shim_1667478401171/work
111
+ numba==0.57.0
112
+ numexpr @ file:///croot/numexpr_1668713893690/work
113
+ numpy @ file:///croot/numpy_and_numpy_base_1672336185480/work
114
+ nvidia-cublas-cu11==11.10.3.66
115
+ nvidia-cuda-cupti-cu11==11.7.101
116
+ nvidia-cuda-nvrtc-cu11==11.7.99
117
+ nvidia-cuda-runtime-cu11==11.7.99
118
+ nvidia-cudnn-cu11==8.5.0.96
119
+ nvidia-cufft-cu11==10.9.0.58
120
+ nvidia-curand-cu11==10.2.10.91
121
+ nvidia-cusolver-cu11==11.4.0.1
122
+ nvidia-cusparse-cu11==11.7.4.91
123
+ nvidia-nccl-cu11==2.14.3
124
+ nvidia-nvtx-cu11==11.7.91
125
+ opencv-python==4.8.0.76
126
+ openslide-python==1.1.2
127
+ opt-einsum==3.3.0
128
+ optree==0.11.0
129
+ packaging @ file:///croot/packaging_1678965309396/work
130
+ pandas @ file:///croot/pandas_1692289311655/work
131
+ pandocfilters @ file:///home/conda/feedstock_root/build_artifacts/pandocfilters_1631603243851/work
132
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1638334955874/work
133
+ patsy==0.5.3
134
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1667297516076/work
135
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1602536217715/work
136
+ Pillow==9.4.0
137
+ pkgutil_resolve_name @ file:///home/conda/feedstock_root/build_artifacts/pkgutil-resolve-name_1633981968097/work
138
+ plotly==5.14.1
139
+ ply==3.11
140
+ pooch @ file:///tmp/build/80754af9/pooch_1623324770023/work
141
+ prometheus-client @ file:///home/conda/feedstock_root/build_artifacts/prometheus_client_1674535637125/work
142
+ prompt-toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1677600924538/work
143
+ protobuf==4.25.3
144
+ psutil @ file:///opt/conda/conda-bld/psutil_1656431268089/work
145
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1609419310487/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl
146
+ pure-eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1642875951954/work
147
+ pyarrow==16.1.0
148
+ pyarrow-hotfix==0.6
149
+ pycparser @ file:///tmp/build/80754af9/pycparser_1636541352034/work
150
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1672682006896/work
151
+ pykan==0.0.2
152
+ pynndescent==0.5.10
153
+ pyOpenSSL @ file:///croot/pyopenssl_1677607685877/work
154
+ pyparsing @ file:///opt/conda/conda-bld/pyparsing_1661452539315/work
155
+ PyQt5-sip==12.11.0
156
+ pyrsistent @ file:///home/builder/ci_310/pyrsistent_1640807196327/work
157
+ PySocks @ file:///home/builder/ci_310/pysocks_1640793678128/work
158
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work
159
+ pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1680088766131/work
160
+ pyvis==0.3.2
161
+ PyYAML==6.0.1
162
+ pyzmq @ file:///opt/conda/conda-bld/pyzmq_1657724186960/work
163
+ regex==2023.8.8
164
+ requests @ file:///croot/requests_1678709721434/work
165
+ rich==13.7.1
166
+ rpy2==3.5.11
167
+ safetensors==0.3.3
168
+ scanpy==1.9.3
169
+ scikit-image==0.24.0
170
+ scikit-learn==1.5.1
171
+ scipy==1.14.0
172
+ seaborn @ file:///croot/seaborn_1673479180098/work
173
+ Send2Trash @ file:///home/conda/feedstock_root/build_artifacts/send2trash_1628511208346/work
174
+ session-info==1.0.0
175
+ sip @ file:///tmp/abs_44cd77b_pu/croots/recipe/sip_1659012365470/work
176
+ six @ file:///tmp/build/80754af9/six_1644875935023/work
177
+ sniffio @ file:///home/conda/feedstock_root/build_artifacts/sniffio_1662051266223/work
178
+ soupsieve @ file:///home/conda/feedstock_root/build_artifacts/soupsieve_1658207591808/work
179
+ stack-data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1669632077133/work
180
+ statsmodels==0.14.0
181
+ stdlib-list==0.8.0
182
+ sympy @ file:///croot/sympy_1668202399572/work
183
+ tenacity==8.2.2
184
+ tensorboard==2.16.2
185
+ tensorboard-data-server==0.7.2
186
+ tensorflow==2.16.1
187
+ tensorflow-io-gcs-filesystem==0.37.0
188
+ termcolor==2.4.0
189
+ terminado @ file:///home/conda/feedstock_root/build_artifacts/terminado_1670253674810/work
190
+ threadpoolctl==3.5.0
191
+ tifffile==2024.7.24
192
+ tinycss2 @ file:///home/conda/feedstock_root/build_artifacts/tinycss2_1666100256010/work
193
+ tokenizers==0.13.3
194
+ toml @ file:///tmp/build/80754af9/toml_1616166611790/work
195
+ tomli @ file:///home/conda/feedstock_root/build_artifacts/tomli_1644342247877/work
196
+ torch==2.0.0
197
+ torch-geometric @ file:///usr/share/miniconda/envs/test/conda-bld/pyg_1679554663466/work
198
+ torchvision==0.15.2
199
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
200
+ tqdm @ file:///croot/tqdm_1679561862951/work
201
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1675110562325/work
202
+ transformers==4.32.1
203
+ triton==2.0.0
204
+ typing_extensions @ file:///croot/typing_extensions_1669924550328/work
205
+ tzdata @ file:///home/conda/feedstock_root/build_artifacts/python-tzdata_1707747584337/work
206
+ tzlocal==5.0.1
207
+ umap-learn==0.5.3
208
+ urllib3 @ file:///croot/urllib3_1680254681959/work
209
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1673864653149/work
210
+ webencodings==0.5.1
211
+ websocket-client @ file:///home/conda/feedstock_root/build_artifacts/websocket-client_1675567828044/work
212
+ Werkzeug==3.0.3
213
+ wrapt==1.15.0
214
+ xxhash==3.4.1
215
+ yarl==1.9.4
216
+ yellowbrick==1.5
217
+ zipp @ file:///home/conda/feedstock_root/build_artifacts/zipp_1677313463193/work
train_GWSIF_classifier.py ADDED
@@ -0,0 +1,134 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras import layers, Model
5
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
6
+ import argparse
7
+ import json
8
+
9
+ # Define the function to create the multiple instance learning (MIL) model
10
+ def create_simple_model(instance_shape, max_length):
11
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
12
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
13
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
14
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
15
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
16
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
17
+ aggregated = layers.GlobalAveragePooling1D()(dropout_2)
18
+ norm_1 = layers.LayerNormalization()(aggregated)
19
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
20
+ return Model(inputs, output)
21
+
22
+ # Function to compute class weights
23
+ def compute_class_weights(labels):
24
+ negative_count = len(np.where(labels == 0)[0])
25
+ positive_count = len(np.where(labels == 1)[0])
26
+ total_count = negative_count + positive_count
27
+ return {0: (1 / negative_count) * (total_count / 2), 1: (1 / positive_count) * (total_count / 2)}
28
+
29
+ # Function to generate batches of data
30
+ def data_generator(data, labels, batch_size=1):
31
+ class_weights = compute_class_weights(labels)
32
+ while True:
33
+ for i in range(0, len(data), batch_size):
34
+ batch_data = np.array(data[i:i + batch_size], dtype=np.float32)
35
+ batch_labels = np.array(labels[i:i + batch_size], dtype=np.float32)
36
+ batch_weights = np.array([class_weights[int(label)] for label in batch_labels], dtype=np.float32)
37
+ yield batch_data, batch_labels, batch_weights
38
+
39
+ # Learning rate scheduler
40
+ def lr_scheduler(epoch, lr):
41
+ decay_rate = 0.1
42
+ decay_step = 10
43
+ if epoch % decay_step == 0 and epoch:
44
+ return lr * decay_rate
45
+ return lr
46
+
47
+ # Function to train the model
48
+ def train(train_data, train_labels, val_data, val_labels, model):
49
+ file_path = "/tmp/best_model.weights.h5"
50
+ model_checkpoint = tf.keras.callbacks.ModelCheckpoint(file_path, monitor="val_loss", verbose=0, mode="min", save_best_only=True, save_weights_only=True)
51
+ early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, mode="min")
52
+ lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
53
+ model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", "AUC"])
54
+ train_gen = data_generator(train_data, train_labels)
55
+ val_gen = data_generator(val_data, val_labels)
56
+ model.fit(train_gen, steps_per_epoch=len(train_data), validation_data=val_gen, validation_steps=len(val_data), epochs=50, batch_size=1, callbacks=[early_stopping, model_checkpoint, lr_callback], verbose=1)
57
+ model.load_weights(file_path)
58
+ return model
59
+
60
+ # Function to compute additional metrics like AUC, Precision, Recall, and F1 Score
61
+ def compute_additional_metrics(X, Y, model):
62
+ predictions = model.predict(X).flatten()
63
+ predictions_binary = (predictions > 0.5).astype(int) # Convert probabilities to class labels (0 or 1)
64
+ auc = roc_auc_score(Y, predictions)
65
+ precision = precision_score(Y, predictions_binary)
66
+ recall = recall_score(Y, predictions_binary)
67
+ f1 = f1_score(Y, predictions_binary)
68
+ return auc, precision, recall, f1, predictions
69
+
70
+ # Function to evaluate the model on a given dataset
71
+ def evaluate_dataset(model, X, Y, dataset_name, save_dir):
72
+ eval_metrics = model.evaluate(X, Y, verbose=0)
73
+ auc, precision, recall, f1, predictions = compute_additional_metrics(X, Y, model)
74
+ metrics = {
75
+ 'loss': eval_metrics[0],
76
+ 'accuracy': eval_metrics[1],
77
+ 'auc': auc,
78
+ 'precision': precision,
79
+ 'recall': recall,
80
+ 'f1_score': f1
81
+ }
82
+
83
+ # Save the predictions for each sample
84
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=predictions, labels=Y)
85
+
86
+ return metrics
87
+
88
+ # Function to evaluate the model on train, validate, and test datasets
89
+ def evaluate_all_datasets(model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, save_dir):
90
+ train_metrics = evaluate_dataset(model, train_X, train_Y, "train", save_dir)
91
+ validate_metrics = evaluate_dataset(model, validate_X, validate_Y, "validate", save_dir)
92
+ test_metrics = evaluate_dataset(model, test_X, test_Y, "test", save_dir)
93
+
94
+ metrics = {
95
+ 'train': train_metrics,
96
+ 'validate': validate_metrics,
97
+ 'test': test_metrics
98
+ }
99
+
100
+ with open(os.path.join(save_dir, 'evaluation_metrics.json'), 'w') as f:
101
+ json.dump(metrics, f, indent=4)
102
+
103
+ print("Evaluation metrics saved to evaluation_metrics.json")
104
+
105
+ return metrics
106
+
107
+ if __name__ == "__main__":
108
+ # Command line arguments
109
+ parser = argparse.ArgumentParser(description='Train a multiple instance learning classifier on risk data.')
110
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training and validation data.')
111
+ parser.add_argument('--save_dir', type=str, default='./model_save/', help='Directory to save the model and evaluation metrics.')
112
+ parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs.')
113
+
114
+ args = parser.parse_args()
115
+
116
+ if not os.path.exists(args.save_dir):
117
+ os.makedirs(args.save_dir)
118
+
119
+ # Load the preprocessed data
120
+ data = np.load(args.data_file)
121
+ train_X, train_Y = data['train_X'], data['train_Y']
122
+ validate_X, validate_Y = data['validate_X'], data['validate_Y']
123
+ test_X, test_Y = data['test_X'], data['test_Y']
124
+
125
+ # Create the model
126
+ instance_shape = (train_X.shape[-1],)
127
+ max_length = train_X.shape[1]
128
+ model = create_simple_model(instance_shape, max_length)
129
+
130
+ # Train the model
131
+ trained_model = train(train_X, train_Y, validate_X, validate_Y, model, epochs=args.epochs)
132
+
133
+ # Evaluate the model
134
+ metrics = evaluate_all_datasets(trained_model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, args.save_dir)
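A quick way to sanity-check the artifacts this script writes (a minimal sketch; the paths assume the default --save_dir of ./model_save/ and may need adjusting):

import json
import numpy as np

# Per-split metrics written by evaluate_all_datasets()
with open('./model_save/evaluation_metrics.json') as f:
    print(json.dumps(json.load(f), indent=2))

# Sigmoid scores and ground-truth labels saved by evaluate_dataset() for the test split
test = np.load('./model_save/test_predictions.npz')
print(test['predictions'][:5], test['labels'][:5])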
train_and_evaluate_risk_classifier.py ADDED
@@ -0,0 +1,144 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras import layers, Model
5
+ from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
6
+ import argparse
7
+ import json
8
+ import pandas as pd
9
+
10
+ # Define the function to create the multiple instance learning (MIL) model
11
+ def create_simple_model(instance_shape, max_length):
12
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
13
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
14
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
15
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
16
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
17
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
18
+ aggregated = layers.GlobalAveragePooling1D()(dropout_2)
19
+ norm_1 = layers.LayerNormalization()(aggregated)
20
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
21
+ return Model(inputs, output)
22
+
23
+ # Function to compute class weights
24
+ def compute_class_weights(labels):
25
+ negative_count = len(np.where(labels == 0)[0])
26
+ positive_count = len(np.where(labels == 1)[0])
27
+ total_count = negative_count + positive_count
28
+ return {0: (1 / negative_count) * (total_count / 2), 1: (1 / positive_count) * (total_count / 2)}
29
+
30
+ # Function to generate batches of data
31
+ def data_generator(data, labels, batch_size=1):
32
+ class_weights = compute_class_weights(labels)
33
+ while True:
34
+ for i in range(0, len(data), batch_size):
35
+ batch_data = np.array(data[i:i + batch_size], dtype=np.float32)
36
+ batch_labels = np.array(labels[i:i + batch_size], dtype=np.float32)
37
+ batch_weights = np.array([class_weights[int(label)] for label in batch_labels], dtype=np.float32)
38
+ yield batch_data, batch_labels, batch_weights
39
+
40
+ # Learning rate scheduler
41
+ def lr_scheduler(epoch, lr):
42
+ decay_rate = 0.1
43
+ decay_step = 10
44
+ if epoch % decay_step == 0 and epoch:
45
+ return lr * decay_rate
46
+ return lr
47
+
48
+ # Function to train the model
49
+ def train(train_data, train_labels, val_data, val_labels, model, save_dir, epochs=50):
50
+ model_path = os.path.join(save_dir, "best_model.h5")
51
+ model_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path, monitor="val_loss", verbose=1, mode="min", save_best_only=True, save_weights_only=False)
52
+ early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, mode="min")
53
+ lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
54
+ model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", "AUC"])
55
+ train_gen = data_generator(train_data, train_labels)
56
+ val_gen = data_generator(val_data, val_labels)
57
+ model.fit(train_gen, steps_per_epoch=len(train_data), validation_data=val_gen, validation_steps=len(val_data), epochs=epochs, batch_size=1, callbacks=[early_stopping, model_checkpoint, lr_callback], verbose=1)
58
+ return model
59
+
60
+ # Function to compute additional metrics like AUC, Precision, Recall, and F1 Score
61
+ def compute_additional_metrics(X, Y, model):
62
+ predictions = model.predict(X).flatten()
63
+ predictions_binary = (predictions > 0.5).astype(int) # Convert probabilities to class labels (0 or 1)
64
+ auc = roc_auc_score(Y, predictions)
65
+ precision = precision_score(Y, predictions_binary)
66
+ recall = recall_score(Y, predictions_binary)
67
+ f1 = f1_score(Y, predictions_binary)
68
+ return auc, precision, recall, f1, predictions
69
+
70
+ # Function to evaluate the model on a given dataset
71
+ def evaluate_dataset(model, X, Y, dataset_name, save_dir):
72
+ eval_metrics = model.evaluate(X, Y, verbose=0)
73
+ auc, precision, recall, f1, predictions = compute_additional_metrics(X, Y, model)
74
+ metrics = {
75
+ 'loss': eval_metrics[0],
76
+ 'accuracy': eval_metrics[1],
77
+ 'auc': auc,
78
+ 'precision': precision,
79
+ 'recall': recall,
80
+ 'f1_score': f1
81
+ }
82
+
83
+ # Save the predictions for each sample
84
+ np.savez_compressed(os.path.join(save_dir, f'{dataset_name}_predictions.npz'), predictions=predictions, labels=Y)
85
+
86
+ return metrics
87
+
88
+ # Function to evaluate the model on train, validate, and test datasets
89
+ def evaluate_all_datasets(model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, save_dir):
90
+ train_metrics = evaluate_dataset(model, train_X, train_Y, "train", save_dir)
91
+ validate_metrics = evaluate_dataset(model, validate_X, validate_Y, "validate", save_dir)
92
+ test_metrics = evaluate_dataset(model, test_X, test_Y, "test", save_dir)
93
+
94
+ metrics = {
95
+ 'train': train_metrics,
96
+ 'validate': validate_metrics,
97
+ 'test': test_metrics
98
+ }
99
+
100
+ # Display the metrics in a tabular format
101
+ metrics_df = pd.DataFrame(metrics).T
102
+ print(metrics_df.to_string())
103
+
104
+ # Save metrics to a JSON file
105
+ with open(os.path.join(save_dir, 'evaluation_metrics.json'), 'w') as f:
106
+ json.dump(metrics, f, indent=4)
107
+
108
+ print("Evaluation metrics saved to evaluation_metrics.json")
109
+
110
+ return metrics
111
+
112
+ if __name__ == "__main__":
113
+ # Command line arguments
114
+ parser = argparse.ArgumentParser(description='Train a multiple instance learning classifier on risk data.')
115
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training and validation data.')
116
+ parser.add_argument('--save_dir', type=str, default='./model_save/', help='Directory to save the model and evaluation metrics.')
117
+ parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs.')
118
+
119
+ args = parser.parse_args()
120
+
121
+ if not os.path.exists(args.save_dir):
122
+ os.makedirs(args.save_dir)
123
+
124
+ # Load the preprocessed data
125
+ data = np.load(args.data_file)
126
+ train_X, train_Y = data['train_X'], data['train_Y']
127
+ validate_X, validate_Y = data['validate_X'], data['validate_Y']
128
+ test_X, test_Y = data['test_X'], data['test_Y']
129
+
130
+ # Create the model
131
+ instance_shape = (train_X.shape[-1],)
132
+ max_length = train_X.shape[1]
133
+ model = create_simple_model(instance_shape, max_length)
134
+
135
+ # Train the model
136
+ trained_model = train(train_X, train_Y, validate_X, validate_Y, model, args.save_dir, epochs=args.epochs)
137
+
138
+ # Save the final model after training
139
+ final_model_path = os.path.join(args.save_dir, "risk_classifier_model.h5")
140
+ trained_model.save(final_model_path)
141
+ print(f"Model saved successfully to {final_model_path}")
142
+
143
+ # Evaluate the model
144
+ metrics = evaluate_all_datasets(trained_model, train_X, train_Y, validate_X, validate_Y, test_X, test_Y, args.save_dir)
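Example invocation (the .npz file name is illustrative; the archive must provide the train_X, train_Y, validate_X, validate_Y, test_X and test_Y arrays loaded in __main__):

python train_and_evaluate_risk_classifier.py --data_file risk_bags.npz --save_dir ./model_save/

The final model written to ./model_save/risk_classifier_model.h5 can later be reloaded with tf.keras.models.load_model.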
train_omics_plip_model.py ADDED
@@ -0,0 +1,89 @@
1
+ import torch
2
+ import argparse
3
+ from torch import optim
4
+ from torch.utils.data import DataLoader
5
+ from scripts.genomic_plip_model import GenomicPLIPModel
6
+ from scripts.tile_file_dataloader import FlatTileDataset
7
+ from transformers import CLIPVisionModel
8
+
9
+ def train_model(data_dir, model_save_path, pretrained_model_path, lr, num_epochs, train_batch_size, validation_batch_size, num_workers):
10
+
11
+ # Load datasets
12
+ train_dataset = FlatTileDataset(data_dir=f'{data_dir}/train')
13
+ train_data_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, num_workers=num_workers)
14
+
15
+ validation_dataset = FlatTileDataset(data_dir=f'{data_dir}/validate')
16
+ validation_data_loader = DataLoader(validation_dataset, batch_size=validation_batch_size, shuffle=False, num_workers=num_workers)
17
+
18
+ # Initialize the model
19
+ base_model = CLIPVisionModel.from_pretrained(pretrained_model_path)
20
+ custom_model = GenomicPLIPModel(base_model)
21
+
22
+ criterion = torch.nn.CosineSimilarity(dim=1)
23
+ optimizer = optim.Adam(custom_model.parameters(), lr=lr)
24
+
25
+
26
+ for epoch in range(num_epochs):
27
+ # Training loop
28
+ custom_model.train()
29
+ train_loss = 0.0
30
+
31
+ for batch_images, batch_scores in train_data_loader:
32
+ optimizer.zero_grad()
33
+
34
+ batch_loss = 0
35
+ for img, score in zip(batch_images, batch_scores):
36
+ vision_features, score_features = custom_model(img.unsqueeze(0), score.unsqueeze(0))
37
+ cos_sim = criterion(score_features, vision_features)
38
+ loss = -cos_sim.mean()
39
+
40
+ batch_loss += loss.item()
41
+ loss.backward()
42
+
43
+ optimizer.step()
44
+ train_loss += batch_loss
45
+ print(f"Batch loss (negative cosine similarity): {batch_loss:.4f}")
46
+
47
+ avg_train_loss = train_loss / len(train_data_loader)
48
+ print(f"Epoch [{epoch+1}/{num_epochs}], Average Training Loss (negative cosine similarity): {avg_train_loss:.4f}")
49
+
50
+ # Validation loop
51
+ custom_model.eval()
52
+ validation_loss = 0.0
53
+
54
+ with torch.no_grad():
55
+ for batch_images, batch_scores in validation_data_loader:
56
+ batch_loss = 0
57
+ for img, score in zip(batch_images, batch_scores):
58
+ vision_features, score_features = custom_model(img.unsqueeze(0), score.unsqueeze(0))
59
+ cos_sim = criterion(score_features, vision_features)
60
+ loss = -cos_sim.mean()
61
+
62
+ batch_loss += loss.item()
63
+
64
+ validation_loss += batch_loss
65
+ print(f"Validation batch loss (negative cosine similarity): {batch_loss:.4f}")
66
+
67
+ avg_validation_loss = validation_loss / len(validation_data_loader)
68
+ print(f"Epoch [{epoch+1}/{num_epochs}], Average Validation Loss (negative cosine similarity): {avg_validation_loss:.4f}")
69
+
70
+ # Save the trained model
71
+ torch.save(custom_model.state_dict(), model_save_path)
72
+
73
+ if __name__ == "__main__":
74
+ parser = argparse.ArgumentParser(description='Train the Genomic PLIP Model')
75
+ parser.add_argument('--data_dir', type=str, default='Datasets/train_03', help='Directory containing the train, validate, and test datasets.')
76
+ parser.add_argument('--model_save_path', type=str, default='genomic_plip.pth', help='Path to save the trained model.')
77
+ parser.add_argument('--pretrained_model_path', type=str, default='./plip', help='Path to the pretrained CLIP model.')
78
+
79
+ parser.add_argument('--lr', type=float, default=0.00001, help='Learning rate for the optimizer.')
80
+ parser.add_argument('--num_epochs', type=int, default=1, help='Number of epochs to train for.')
81
+ parser.add_argument('--train_batch_size', type=int, default=128, help='Batch size for the training data loader.')
82
+ parser.add_argument('--validation_batch_size', type=int, default=128, help='Batch size for the validation data loader.')
83
+ parser.add_argument('--num_workers', type=int, default=32, help='Number of worker threads for data loading.')
84
+
85
+
86
+ args = parser.parse_args()
87
+
88
+ train_model(args.data_dir, args.model_save_path, args.pretrained_model_path, args.lr, args.num_epochs, args.train_batch_size, args.validation_batch_size, args.num_workers)
89
+
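Example run using the script's own defaults for the dataset and pretrained PLIP paths (adjust these to your local layout):

python train_omics_plip_model.py --data_dir Datasets/train_03 --pretrained_model_path ./plip --model_save_path genomic_plip.pth --num_epochs 1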
train_risk_classifier.py ADDED
@@ -0,0 +1,103 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras import layers, Model
5
+ import argparse
6
+ from datetime import datetime
7
+
8
+ # Baseline MIL model with mean pooling; create_simple_model below adds multi-head self-attention and is the variant used in __main__
9
+ def create_simple_model2(instance_shape, max_length):
10
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
11
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
12
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
13
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
14
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
15
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
16
+ aggregated = layers.GlobalAveragePooling1D()(dropout_2)
17
+ norm_1 = layers.LayerNormalization()(aggregated)
18
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
19
+ return Model(inputs, output)
20
+
21
+ def create_simple_model(instance_shape, max_length, num_heads=4, key_dim=64):
22
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
23
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
24
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
25
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
26
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
27
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
28
+ attention_output, attention_scores = layers.MultiHeadAttention(
29
+ num_heads=num_heads,
30
+ key_dim=key_dim,
31
+ value_dim=64,
32
+ dropout=0.1,
33
+ use_bias=True
34
+ )(query=dropout_2, value=dropout_2, key=dropout_2, return_attention_scores=True)
35
+ aggregated = layers.GlobalAveragePooling1D()(attention_output)
36
+ norm_1 = layers.LayerNormalization()(aggregated)
37
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
38
+ return Model(inputs, output)
39
+
40
+ # Function to compute class weights
41
+ def compute_class_weights(labels):
42
+ negative_count = len(np.where(labels == 0)[0])
43
+ positive_count = len(np.where(labels == 1)[0])
44
+ total_count = negative_count + positive_count
45
+ return {0: (1 / negative_count) * (total_count / 2), 1: (1 / positive_count) * (total_count / 2)}
46
+
47
+ # Function to generate batches of data
48
+ def data_generator(data, labels, batch_size=1):
49
+ class_weights = compute_class_weights(labels)
50
+ while True:
51
+ for i in range(0, len(data), batch_size):
52
+ batch_data = np.array(data[i:i + batch_size], dtype=np.float32)
53
+ batch_labels = np.array(labels[i:i + batch_size], dtype=np.float32)
54
+ batch_weights = np.array([class_weights[int(label)] for label in batch_labels], dtype=np.float32)
55
+ yield batch_data, batch_labels, batch_weights
56
+
57
+ # Learning rate scheduler
58
+ def lr_scheduler(epoch, lr):
59
+ decay_rate = 0.1
60
+ decay_step = 10
61
+ if epoch % decay_step == 0 and epoch:
62
+ return lr * decay_rate
63
+ return lr
64
+
65
+ # Function to train the model
66
+ def train(train_data, train_labels, val_data, val_labels, model, save_dir, epochs=50):
67
+ model_path = os.path.join(save_dir, "risk_classifier_model.h5")
68
+ model_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path, monitor="val_loss", verbose=1, mode="min", save_best_only=True, save_weights_only=False)
69
+ early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, mode="min")
70
+ lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
71
+ model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", "AUC"])
72
+ train_gen = data_generator(train_data, train_labels)
73
+ val_gen = data_generator(val_data, val_labels)
74
+ model.fit(train_gen, steps_per_epoch=len(train_data), validation_data=val_gen, validation_steps=len(val_data), epochs=epochs, batch_size=1, callbacks=[early_stopping, model_checkpoint, lr_callback], verbose=1)
75
+ return model
76
+
77
+ if __name__ == "__main__":
78
+ # Command line arguments
79
+ parser = argparse.ArgumentParser(description='Train a multiple instance learning classifier on risk data.')
80
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training and validation data.')
81
+ parser.add_argument('--save_dir', type=str, default='./model_save/', help='Directory to save the model.')
82
+ parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs.')
83
+
84
+ args = parser.parse_args()
85
+
86
+ if not os.path.exists(args.save_dir):
87
+ os.makedirs(args.save_dir)
88
+
89
+ # Load the preprocessed data
90
+ data = np.load(args.data_file)
91
+ train_X, train_Y = data['train_X'], data['train_Y']
92
+ validate_X, validate_Y = data['validate_X'], data['validate_Y']
93
+
94
+ # Create the model
95
+ instance_shape = (train_X.shape[-1],)
96
+ max_length = train_X.shape[1]
97
+ model = create_simple_model(instance_shape, max_length)
98
+
99
+ # Train the model
100
+ trained_model = train(train_X, train_Y, validate_X, validate_Y, model, args.save_dir, epochs=args.epochs)
101
+
102
+ # Final message after training and saving the model
103
+ print(f"Model saved successfully to {os.path.join(args.save_dir, 'risk_classifier_model.h5')}")
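Example invocation (the .npz file name is illustrative; it must contain the train_X, train_Y, validate_X and validate_Y arrays loaded in __main__):

python train_risk_classifier.py --data_file risk_bags.npz --save_dir ./model_save/ --epochs 50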
train_risk_classifier_optional.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ import numpy as np
3
+ import tensorflow as tf
4
+ from tensorflow.keras import layers, Model
5
+ import argparse
6
+ from datetime import datetime
7
+
8
+ # Define the function to create the first model
9
+ def create_simple_model(instance_shape, max_length):
10
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
11
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
12
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
13
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
14
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
15
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
16
+ aggregated = layers.GlobalAveragePooling1D()(dropout_2)
17
+ norm_1 = layers.LayerNormalization()(aggregated)
18
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
19
+ return Model(inputs, output)
20
+
21
+ # Define the function to create the second model with attention
22
+ def create_simple_model2(instance_shape, max_length, num_heads=4, key_dim=64):
23
+ inputs = layers.Input(shape=(max_length, instance_shape[-1]), name="bag_input")
24
+ flatten = layers.TimeDistributed(layers.Flatten())(inputs)
25
+ dense_1 = layers.TimeDistributed(layers.Dense(256, activation="relu"))(flatten)
26
+ dropout_1 = layers.TimeDistributed(layers.Dropout(0.5))(dense_1)
27
+ dense_2 = layers.TimeDistributed(layers.Dense(64, activation="relu"))(dropout_1)
28
+ dropout_2 = layers.TimeDistributed(layers.Dropout(0.5))(dense_2)
29
+ attention_output, attention_scores = layers.MultiHeadAttention(
30
+ num_heads=num_heads,
31
+ key_dim=key_dim,
32
+ value_dim=64,
33
+ dropout=0.1,
34
+ use_bias=True
35
+ )(query=dropout_2, value=dropout_2, key=dropout_2, return_attention_scores=True)
36
+ aggregated = layers.GlobalAveragePooling1D()(attention_output)
37
+ norm_1 = layers.LayerNormalization()(aggregated)
38
+ output = layers.Dense(1, activation="sigmoid")(norm_1)
39
+ return Model(inputs, output)
40
+
41
+ # Function to compute class weights
42
+ def compute_class_weights(labels):
43
+ negative_count = len(np.where(labels == 0)[0])
44
+ positive_count = len(np.where(labels == 1)[0])
45
+ total_count = negative_count + positive_count
46
+ return {0: (1 / negative_count) * (total_count / 2), 1: (1 / positive_count) * (total_count / 2)}
47
+
48
+ # Function to generate batches of data
49
+ def data_generator(data, labels, batch_size=1):
50
+ class_weights = compute_class_weights(labels)
51
+ while True:
52
+ for i in range(0, len(data), batch_size):
53
+ batch_data = np.array(data[i:i + batch_size], dtype=np.float32)
54
+ batch_labels = np.array(labels[i:i + batch_size], dtype=np.float32)
55
+ batch_weights = np.array([class_weights[int(label)] for label in batch_labels], dtype=np.float32)
56
+ yield batch_data, batch_labels, batch_weights
57
+
58
+ # Learning rate scheduler
59
+ def lr_scheduler(epoch, lr):
60
+ decay_rate = 0.1
61
+ decay_step = 10
62
+ if epoch % decay_step == 0 and epoch:
63
+ return lr * decay_rate
64
+ return lr
65
+
66
+ # Function to train the model
67
+ def train(train_data, train_labels, val_data, val_labels, model, save_dir, epochs=50):
68
+ model_path = os.path.join(save_dir, "risk_classifier_model.h5")
69
+ model_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_path, monitor="val_loss", verbose=1, mode="min", save_best_only=True, save_weights_only=False)
70
+ early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10, mode="min")
71
+ lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
72
+ model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", "AUC"])
73
+ train_gen = data_generator(train_data, train_labels)
74
+ val_gen = data_generator(val_data, val_labels)
75
+ model.fit(train_gen, steps_per_epoch=len(train_data), validation_data=val_gen, validation_steps=len(val_data), epochs=epochs, batch_size=1, callbacks=[early_stopping, model_checkpoint, lr_callback], verbose=1)
76
+ return model
77
+
78
+ if __name__ == "__main__":
79
+ # Command line arguments
80
+ parser = argparse.ArgumentParser(description='Train a multiple instance learning classifier on risk data.')
81
+ parser.add_argument('--data_file', type=str, required=True, help='Path to the saved .npz file with training and validation data.')
82
+ parser.add_argument('--save_dir', type=str, default='./model_save/', help='Directory to save the model.')
83
+ parser.add_argument('--epochs', type=int, default=50, help='Number of training epochs.')
84
+ parser.add_argument('--model_type', type=str, default='model1', choices=['model1', 'model2'], help='Type of model to use: model1 (default) or model2.')
85
+
86
+ args = parser.parse_args()
87
+
88
+ if not os.path.exists(args.save_dir):
89
+ os.makedirs(args.save_dir)
90
+
91
+ # Load the preprocessed data
92
+ data = np.load(args.data_file)
93
+ train_X, train_Y = data['train_X'], data['train_Y']
94
+ validate_X, validate_Y = data['validate_X'], data['validate_Y']
95
+
96
+ # Create the model based on the selected type
97
+ instance_shape = (train_X.shape[-1],)
98
+ max_length = train_X.shape[1]
99
+
100
+ if args.model_type == 'model2':
101
+ model = create_simple_model2(instance_shape, max_length)
102
+ else:
103
+ model = create_simple_model(instance_shape, max_length)
104
+
105
+ # Train the model
106
+ trained_model = train(train_X, train_Y, validate_X, validate_Y, model, args.save_dir, epochs=args.epochs)
107
+
108
+ # Final message after training and saving the model
109
+ print(f"Model saved successfully to {os.path.join(args.save_dir, 'risk_classifier_model.h5')}")
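Example invocation selecting the attention-based variant (model2); the .npz file name is illustrative and must contain the train_X, train_Y, validate_X and validate_Y arrays loaded in __main__:

python train_risk_classifier_optional.py --data_file risk_bags.npz --save_dir ./model_save/ --model_type model2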