VatsalPatel18 committed
Commit 8381e8e · verified · 1 Parent(s): 70884da

Upload 8 files

scripts/.ipynb_checkpoints/PlipDataProcess-checkpoint.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import random
+ import torch
+ from PIL import Image
+ from concurrent.futures import ThreadPoolExecutor
+
+ class PlipDataProcess(torch.utils.data.Dataset):
+     def __init__(self, root_dir, files, df, img_processor=None, num_tiles_per_patient=128, max_workers=64, save_dir='processed_tile_data'):
+         self.root_dir = root_dir
+         self.files = files
+         self.df = df
+         self.img_processor = img_processor
+         self.num_tiles_per_patient = num_tiles_per_patient
+         self.max_workers = max_workers
+         self.save_dir = save_dir
+         if not os.path.exists(self.save_dir):
+             os.makedirs(self.save_dir)
+
+     def __len__(self):
+         return len(self.files)
+
+     def load_and_process_image(self, tile_path):
+         image = Image.open(tile_path)
+         return self.img_processor.preprocess(image)['pixel_values']
+
+     def save_individual_tile_data(self, tile_data, file_data, file_name, tile_name):
+         save_path = os.path.join(self.save_dir, file_name, f"{tile_name}.pt")
+         os.makedirs(os.path.dirname(save_path), exist_ok=True)
+         torch.save({'tile_data': tile_data, 'file_data': file_data}, save_path)
+
+     def __getitem__(self, idx):
+         file = self.files[idx]
+         tiles_path = os.path.join(self.root_dir, file)
+         tiles = [tile for tile in os.listdir(tiles_path) if tile != '.ipynb_checkpoints']
+         selected_tiles = random.sample(tiles, min(self.num_tiles_per_patient, len(tiles)))
+
+         try:
+             file_data = torch.tensor(self.df.loc[f'{file}-01'].values, dtype=torch.float32)
+         except KeyError:
+             # If the file is not found in the dataframe, create a tensor of zeros
+             # Shape is inferred from the other rows in the dataframe
+             num_features = self.df.shape[1]
+             file_data = torch.zeros(num_features, dtype=torch.float32)
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             for tile_name in selected_tiles:
+                 tile_path = os.path.join(tiles_path, tile_name)
+                 executor.submit(self.process_and_save_tile, tile_path, file_data, file, tile_name)
+
+         return idx
+
+     def process_and_save_tile(self, tile_path, file_data, file_name, tile_name):
+         tile_data = self.load_and_process_image(tile_path)
+         self.save_individual_tile_data(tile_data, file_data, file_name, tile_name)
scripts/PlipDataProcess.py ADDED
@@ -0,0 +1,56 @@
+ import os
+ import random
+ import torch
+ from PIL import Image
+ from concurrent.futures import ThreadPoolExecutor
+
+ class PlipDataProcess(torch.utils.data.Dataset):
+     def __init__(self, root_dir, files, df, img_processor=None, num_tiles_per_patient=128, max_workers=64, save_dir='processed_tile_data'):
+         self.root_dir = root_dir
+         self.files = files
+         self.df = df
+         self.img_processor = img_processor
+         self.num_tiles_per_patient = num_tiles_per_patient
+         self.max_workers = max_workers
+         self.save_dir = save_dir
+         if not os.path.exists(self.save_dir):
+             os.makedirs(self.save_dir)
+
+     def __len__(self):
+         return len(self.files)
+
+     def load_and_process_image(self, tile_path):
+         image = Image.open(tile_path)
+         return self.img_processor.preprocess(image)['pixel_values']
+
+     def save_individual_tile_data(self, tile_data, file_data, file_name, tile_name):
+         save_path = os.path.join(self.save_dir, file_name, f"{tile_name}.pt")
+         os.makedirs(os.path.dirname(save_path), exist_ok=True)
+         torch.save({'tile_data': tile_data, 'file_data': file_data}, save_path)
+
+     def __getitem__(self, idx):
+         file = self.files[idx]
+         tiles_path = os.path.join(self.root_dir, file)
+         tiles = [tile for tile in os.listdir(tiles_path) if tile != '.ipynb_checkpoints']
+         selected_tiles = random.sample(tiles, min(self.num_tiles_per_patient, len(tiles)))
+
+         try:
+             file_data = torch.tensor(self.df.loc[f'{file}-01'].values, dtype=torch.float32)
+         except KeyError:
+             # If the file is not found in the dataframe, create a tensor of zeros
+             # Shape is inferred from the other rows in the dataframe
+             num_features = self.df.shape[1]
+             file_data = torch.zeros(num_features, dtype=torch.float32)
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             for tile_name in selected_tiles:
+                 tile_path = os.path.join(tiles_path, tile_name)
+                 executor.submit(self.process_and_save_tile, tile_path, file_data, file, tile_name)
+
+         return idx
+
+     def process_and_save_tile(self, tile_path, file_data, file_name, tile_name):
+         tile_data = self.load_and_process_image(tile_path)
+         self.save_individual_tile_data(tile_data, file_data, file_name, tile_name)
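A minimal usage sketch of this dataset class, assuming the repository root is importable and that the checkpoint name, file names, and dataframe layout are illustrative placeholders rather than part of the commit. Each __getitem__ call preprocesses up to num_tiles_per_patient tiles for one patient and writes them to save_dir, so iterating the dataset once materializes the tensors.

# Hedged sketch -- paths, the PLIP image-processor checkpoint, and the dataframe are assumptions.
import pandas as pd
from torch.utils.data import DataLoader
from transformers import CLIPImageProcessor
from scripts.PlipDataProcess import PlipDataProcess

img_processor = CLIPImageProcessor.from_pretrained("vinid/plip")      # assumed PLIP image processor
df = pd.read_csv("genomic_scores.csv", index_col=0)                   # hypothetical table indexed like "<patient>-01"
patients = ["TCGA-XX-0001", "TCGA-XX-0002"]                           # hypothetical tile directories under root_dir

dataset = PlipDataProcess(root_dir="tiles_1024", files=patients, df=df,
                          img_processor=img_processor, save_dir="processed_tile_data")
for _ in DataLoader(dataset, batch_size=1, num_workers=0):
    pass  # each item preprocesses and saves its tiles as .pt files under processed_tile_data/<patient>/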
scripts/__pycache__/slide_processor_parallel.cpython-310.pyc ADDED
Binary file (6.34 kB).
 
scripts/genomic_plip_model.py ADDED
@@ -0,0 +1,17 @@
+ import torch
+ from transformers import CLIPVisionModel
+
+ class GenomicPLIPModel(torch.nn.Module):
+     def __init__(self, original_model):
+         super(GenomicPLIPModel, self).__init__()
+         self.vision_model = original_model.vision_model
+         self.vision_projection = torch.nn.Linear(768, 512)
+         self.fc_layer = torch.nn.Linear(4, 512)  # Fully connected layer for the 4D vector
+
+     def forward(self, pixel_values, score_vector):
+         vision_output = self.vision_model(pixel_values)
+         pooled_output = vision_output.pooler_output
+         vision_features = self.vision_projection(pooled_output)
+         score_features = self.fc_layer(score_vector)
+
+         return vision_features, score_features
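A short, hedged sketch of how this wrapper could be instantiated; the base checkpoint and batch shapes are assumptions, while the 768-to-512 projection comes from the module definition above.

# Hedged sketch -- the checkpoint name and tensor shapes are illustrative assumptions.
import torch
from transformers import CLIPVisionModel
from scripts.genomic_plip_model import GenomicPLIPModel

base = CLIPVisionModel.from_pretrained("vinid/plip")   # assumed PLIP vision backbone
model = GenomicPLIPModel(base)

pixel_values = torch.randn(2, 3, 224, 224)             # two preprocessed tiles
score_vector = torch.randn(2, 4)                       # the 4-dimensional genomic score vector
vision_features, score_features = model(pixel_values, score_vector)
print(vision_features.shape, score_features.shape)     # expected: torch.Size([2, 512]) torch.Size([2, 512])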
scripts/slide_processor.py ADDED
@@ -0,0 +1,157 @@
+ import numpy as np
+ import tensorflow as tf
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import os
+ import openslide
+ from PIL import Image
+ from openslide import OpenSlideError
+ from openslide.deepzoom import DeepZoomGenerator
+ import math
+ import random
+ from pyspark.ml.linalg import Vectors
+ import pyspark.sql.functions as F
+ from scipy.ndimage import binary_fill_holes  # scipy.ndimage.morphology was removed in recent SciPy releases
+ from skimage.color import rgb2gray
+ from skimage.feature import canny
+ from skimage.morphology import binary_closing, binary_dilation, disk
+ from concurrent.futures import ProcessPoolExecutor
+ import tqdm
+
+ class SlideProcessor:
+     def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
+         self.tile_size = tile_size
+         self.overlap = overlap
+         self.tissue_threshold = tissue_threshold
+         self.max_workers = max_workers
+
+     def optical_density(self, tile):
+         tile = tile.astype(np.float64)
+         od = -np.log((tile + 1) / 240)
+         return od
+
+     def keep_tile(self, tile, tissue_threshold=None):
+         if tissue_threshold is None:
+             tissue_threshold = self.tissue_threshold
+
+         if tile.shape[0:2] == (self.tile_size, self.tile_size):
+             tile_orig = tile
+             tile = rgb2gray(tile)
+             tile = 1 - tile
+             tile = canny(tile)
+             tile = binary_closing(tile, disk(10))
+             tile = binary_dilation(tile, disk(10))
+             tile = binary_fill_holes(tile)
+             percentage = tile.mean()
+
+             check1 = percentage >= tissue_threshold
+
+             tile = self.optical_density(tile_orig)
+             beta = 0.15
+             tile = np.min(tile, axis=2) >= beta
+             tile = binary_closing(tile, disk(2))
+             tile = binary_dilation(tile, disk(2))
+             tile = binary_fill_holes(tile)
+             percentage = tile.mean()
+
+             check2 = percentage >= tissue_threshold
+
+             return check1 and check2
+         else:
+             return False
+
+     def filter_tiles(self, tile_indices, generator):
+         filtered_tiles = []
+         for i in range(len(tile_indices)):
+             tile_size, overlap, zoom_level, col, row = tile_indices[i]
+             tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
+             if self.keep_tile(tile, self.tissue_threshold):
+                 filtered_tiles.append((col, row))
+         return filtered_tiles
+
+     def get_tiles(self, samples, tile_indices, generator):
+         tiles = []
+         for i in samples:
+             tile_size, overlap, zoom_level, col, row = tile_indices[i]
+             tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
+             tiles.append((i, tile))
+         return tiles
+
+     def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
+         for sample in sample_tiles:
+             i, tile = sample
+             im = Image.fromarray(tile)
+             fname = f"{slide_num}_{i}"
+             file_path = os.path.join(loc, f"{fname}.jpeg")
+             im.save(file_path)
+
+     def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc=None):
+         if loc is None:
+             loc = f'/home/gp7/ml_pni/Dataset/tiles_1024/{file}'
+
+         for i, cord in enumerate(samples):
+             x, y = cord
+             tile_size, overlap, zoom_level, col, row = tile_indices[i]
+             tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
+             im = Image.fromarray(tile)
+             fname = f"{slide_num}_{x}_{y}"
+             file_path = os.path.join(loc, f"{fname}.jpeg")
+             im.save(file_path)
+
+     def process_one_slide(self, file, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
+         f2p = os.path.join(base_dir, f'{file}.svs')
+
+         if not os.path.exists(output_dir):
+             os.makedirs(output_dir)
+
+         img1 = openslide.open_slide(f2p)
+         generator = DeepZoomGenerator(img1, tile_size=self.tile_size, overlap=self.overlap, limit_bounds=True)
+         highest_zoom_level = generator.level_count - 1
+
+         try:
+             mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
+             offset = math.floor((mag / 20) / 2)
+             level = highest_zoom_level - offset
+         except (ValueError, KeyError):
+             level = highest_zoom_level
+
+         zoom_level = level
+         cols, rows = generator.level_tiles[zoom_level]
+         tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row) for col in range(cols) for row in range(rows)]
+
+         filter_sname = os.path.join(output_dir, f'{file}_info.npy')
+
+         if os.path.exists(filter_sname):
+             try:
+                 filtered_tiles = np.load(filter_sname)
+                 print(f"Found existing filtered tiles for {file}, skipping tile filtering.")
+             except Exception:
+                 print(f"Error reading {filter_sname}, re-filtering tiles.")
+                 filtered_tiles = self.filter_tiles(tile_indices, generator)
+                 np.save(filter_sname, filtered_tiles)
+         else:
+             print(f"Didn't find existing filtered tiles for {file}, filtering tiles.")
+             filtered_tiles = self.filter_tiles(tile_indices, generator)
+             np.save(filter_sname, filtered_tiles)
+
+         directory = os.path.join(output_dir, file)
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+         existing_files_count = len([f for f in os.listdir(directory) if f.endswith('.jpeg')])
+
+         filtered_tiles_count = len(filtered_tiles)
+         threshold = 5
+         if abs(existing_files_count - filtered_tiles_count) <= threshold:
+             print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
+         else:
+             print('Now going to save tiles')
+             # Pass the slide id and target directory explicitly (matches the parallel implementation)
+             self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, loc=directory)
+
+         return file
+
+     def parallel_process(self, files, base_dir='HNSC_DS', output_dir='/home/gp7/ml_pni/Dataset/tiles_1024'):
+         with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
+             results = list(tqdm.tqdm(executor.map(self.process_one_slide, files, [base_dir]*len(files), [output_dir]*len(files)), total=len(files)))
+         return results
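A hedged usage sketch for this serial processor; the slide ID and directories are placeholders. parallel_process here expects an explicit list of slide IDs whose .svs files live under base_dir.

# Hedged sketch -- slide IDs and directories are placeholders.
from scripts.slide_processor import SlideProcessor

processor = SlideProcessor(tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=4)
slide_ids = ["TCGA-XX-0001-01Z-00-DX1"]            # hypothetical IDs; <base_dir>/<id>.svs must exist
results = processor.parallel_process(slide_ids, base_dir="HNSC_DS", output_dir="tiles_1024")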
scripts/slide_processor_parallel.py ADDED
@@ -0,0 +1,160 @@
+ import numpy as np
+ from concurrent.futures import ThreadPoolExecutor
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import os
+ import openslide
+ from PIL import Image
+ from openslide import OpenSlideError
+ from openslide.deepzoom import DeepZoomGenerator
+ import math
+ import random
+ from scipy.ndimage import binary_fill_holes  # scipy.ndimage.morphology was removed in recent SciPy releases
+ from skimage.color import rgb2gray
+ from skimage.feature import canny
+ from skimage.morphology import binary_closing, binary_dilation, disk
+ from concurrent.futures import ProcessPoolExecutor
+ import tqdm
+
+ class SlideProcessor:
+     def __init__(self, tile_size=1024, overlap=0, tissue_threshold=0.65, max_workers=30):
+         self.tile_size = tile_size
+         self.overlap = overlap
+         self.tissue_threshold = tissue_threshold
+         self.max_workers = max_workers
+
+     def optical_density(self, tile):
+         tile = tile.astype(np.float64)
+         od = -np.log((tile + 1) / 240)
+         return od
+
+     def keep_tile(self, tile, tissue_threshold=None):
+         if tissue_threshold is None:
+             tissue_threshold = self.tissue_threshold
+
+         if tile.shape[0:2] == (self.tile_size, self.tile_size):
+             tile_orig = tile
+             tile = rgb2gray(tile)
+             tile = 1 - tile
+             tile = canny(tile)
+             tile = binary_closing(tile, disk(10))
+             tile = binary_dilation(tile, disk(10))
+             tile = binary_fill_holes(tile)
+             percentage = tile.mean()
+
+             check1 = percentage >= tissue_threshold
+
+             tile = self.optical_density(tile_orig)
+             beta = 0.15
+             tile = np.min(tile, axis=2) >= beta
+             tile = binary_closing(tile, disk(2))
+             tile = binary_dilation(tile, disk(2))
+             tile = binary_fill_holes(tile)
+             percentage = tile.mean()
+
+             check2 = percentage >= tissue_threshold
+
+             return check1 and check2
+         else:
+             return False
+
+     def filter_tiles(self, tile_indices, generator):
+         def process_tile(tile_index):
+             tile_size, overlap, zoom_level, col, row = tile_index
+             tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
+             if self.keep_tile(tile, self.tissue_threshold):
+                 return col, row
+             return None
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             results = executor.map(process_tile, tile_indices)
+
+         # Filter out None results and return the list of tiles to keep
+         return [result for result in results if result is not None]
+
+     def get_tiles(self, samples, tile_indices, generator):
+         tiles = []
+         for i in samples:
+             tile_size, overlap, zoom_level, col, row = tile_indices[i]
+             tile = np.asarray(generator.get_tile(zoom_level, (col, row)))
+             tiles.append((i, tile))
+         return tiles
+
+     def save_tiles(self, sample_tiles, slide_num, loc='pDataset/rest'):
+         for sample in sample_tiles:
+             i, tile = sample
+             im = Image.fromarray(tile)
+             fname = f"{slide_num}_{i}"
+             file_path = os.path.join(loc, f"{fname}.jpeg")
+             im.save(file_path)
+
+     def get_save_tiles(self, samples, tile_indices, slide_num, generator, file, loc):
+
+         def save_tile(cord):
+             x, y = cord
+             tile_index = next((ti for ti in tile_indices if ti[3] == x and ti[4] == y), None)
+             if tile_index:
+                 tile_size, overlap, zoom_level, col, row = tile_index
+                 tile = np.asarray(generator.get_tile(zoom_level, (x, y)))
+                 im = Image.fromarray(tile)
+                 fname = f"{slide_num}_{x}_{y}"
+                 file_path = os.path.join(loc, f"{fname}.jpeg")
+                 im.save(file_path)
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             executor.map(save_tile, samples)
+
+     def process_one_slide(self, file_loc, output_dir=None):
+         f2p = file_loc
+
+         if not os.path.exists(output_dir):
+             os.makedirs(output_dir)
+
+         img1 = openslide.open_slide(f2p)
+         generator = DeepZoomGenerator(img1, tile_size=self.tile_size, overlap=self.overlap, limit_bounds=True)
+         highest_zoom_level = generator.level_count - 1
+
+         try:
+             mag = int(img1.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
+             offset = math.floor((mag / 20) / 2)
+             level = highest_zoom_level - offset
+         except (ValueError, KeyError):
+             level = highest_zoom_level
+
+         zoom_level = level
+         cols, rows = generator.level_tiles[zoom_level]
+         tile_indices = [(self.tile_size, self.overlap, zoom_level, col, row) for col in range(cols) for row in range(rows)]
+
+         filtered_tiles = self.filter_tiles(tile_indices, generator)
+
+         if file_loc.endswith('.svs'):
+             file = file_loc[-16:-4]
+             print(file)
+
+         directory = os.path.join(output_dir, file)
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+         existing_files_count = len([f for f in os.listdir(directory) if f.endswith('.jpeg')])
+
+         filtered_tiles_count = len(filtered_tiles)
+         threshold = 5
+         if abs(existing_files_count - filtered_tiles_count) <= threshold:
+             print(f"Found approximately the same number of files as filtered tiles for {file}, skipping tile saving.")
+         else:
+             print('Now going to save tiles')
+             self.get_save_tiles(filtered_tiles, tile_indices, file, generator, file, directory)
+
+         return file
+
+     def parallel_process(self, base_dir='HNSC_DS', output_dir=None):
+         # List all .svs files in the base directory
+         files = [os.path.join(base_dir, f) for f in os.listdir(base_dir) if f.endswith('.svs')]
+
+         with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
+             # base_dir is already folded into each path, so only output_dir is repeated per file
+             results = list(tqdm.tqdm(executor.map(self.process_one_slide, files, [output_dir]*len(files)), total=len(files)))
+
+         return results
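And a hedged sketch for the parallel variant, which discovers the .svs files itself; directories are placeholders.

# Hedged sketch -- directories are placeholders; every .svs under base_dir is processed.
from scripts.slide_processor_parallel import SlideProcessor

processor = SlideProcessor(tile_size=1024, max_workers=8)
results = processor.parallel_process(base_dir="HNSC_DS", output_dir="tiles_1024")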
scripts/tile_classifier.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import Dataset
+
+ class SimpleNN(nn.Module):
+     def __init__(self):
+         super(SimpleNN, self).__init__()
+         self.fc1 = nn.Linear(512, 512)
+         self.fc2 = nn.Linear(512, 256)
+         self.fc3 = nn.Linear(256, 1)
+
+     def forward(self, x):
+         x = torch.relu(self.fc1(x))
+         x = torch.relu(self.fc2(x))
+         x = torch.sigmoid(self.fc3(x))
+         return x
+
+ class CustomDataset(Dataset):
+     def __init__(self, X, Y):
+         self.X = torch.tensor(X, dtype=torch.float32)
+         self.Y = torch.tensor(Y, dtype=torch.float32)
+
+     def __len__(self):
+         return len(self.X)
+
+     def __getitem__(self, index):
+         return self.X[index], self.Y[index]
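A minimal, hedged training sketch for this classifier; random arrays stand in for real 512-dimensional tile embeddings and binary labels, and the import path is an assumption.

# Hedged sketch -- random data replaces real tile features and labels.
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from scripts.tile_classifier import SimpleNN, CustomDataset

X = np.random.randn(256, 512).astype("float32")            # 512-d tile feature vectors
Y = np.random.randint(0, 2, size=256).astype("float32")    # binary tile labels

model = SimpleNN()
loader = DataLoader(CustomDataset(X, Y), batch_size=32, shuffle=True)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for x, y in loader:
    optimizer.zero_grad()
    loss = criterion(model(x).squeeze(1), y)
    loss.backward()
    optimizer.step()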
scripts/tile_file_dataloader.py ADDED
@@ -0,0 +1,25 @@
+ import os
+ import torch
+ from torch.utils.data import Dataset
+
+ class FlatTileDataset(Dataset):
+     def __init__(self, data_dir):
+         super().__init__()
+         self.data_dir = data_dir
+         # List all entries in data_dir that are files (not directories)
+         self.files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]
+
+     def __len__(self):
+         # Return the total number of files
+         return len(self.files)
+
+     def __getitem__(self, idx):
+         # Get the file path for the given index
+         file_path = self.files[idx]
+         # Load the data from the file
+         data = torch.load(file_path)
+         # Assuming the data file is a dictionary with 'tile_data' and 'file_data' keys
+         tile_data = torch.from_numpy(data['tile_data'][0])
+         file_data = data['file_data']
+         # Return the tile data and file data
+         return tile_data, file_data
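A hedged loading sketch; the directory is a placeholder and is assumed to contain the .pt files written by PlipDataProcess (for example one patient's subfolder of processed_tile_data).

# Hedged sketch -- the directory is a placeholder for a folder of .pt tile files.
from torch.utils.data import DataLoader
from scripts.tile_file_dataloader import FlatTileDataset

dataset = FlatTileDataset("processed_tile_data/TCGA-XX-0001")   # hypothetical patient folder
loader = DataLoader(dataset, batch_size=16, shuffle=True)
tile_batch, genomic_batch = next(iter(loader))
print(tile_batch.shape, genomic_batch.shape)   # e.g. (16, 3, 224, 224) and (16, num_features)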