import os
import re
import glob
import pickle as pkl

import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import warnings
from tqdm import tqdm
import skimage

import phenograph
import umap
import seaborn as sns
from scipy.stats import spearmanr

import sys
import platform
from pathlib import Path

FILE = Path(__file__).resolve()
ROOT = FILE.parents[0]
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
if platform.system() != 'Windows':
    ROOT = Path(os.path.relpath(ROOT, Path.cwd()))
from classes import CytofImage, CytofImageTiff

import hyperion_preprocess as pre
import hyperion_segmentation as seg
from utils import load_CytofImage


def _longest_substring(str1, str2):
    """Return the longest common substring of str1 and str2 (brute force)."""
    ans = ""
    len1, len2 = len(str1), len(str2)
    for i in range(len1):
        for j in range(len2):
            match = ""
            _len = 0
            while (i + _len < len1) and (j + _len < len2) and str1[i + _len] == str2[j + _len]:
                match += str1[i + _len]
                _len += 1
            if len(match) > len(ans):
                ans = match
    return ans
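

# Usage sketch for _longest_substring (illustrative only; the marker names are
# hypothetical):
#
#     >>> _longest_substring("CD45RA", "CD45RO")
#     'CD45R'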


def extract_feature(channels, raw_image, nuclei_seg, cell_seg, filename, show_head=False):
    """Extract nuclei- and cell-level features from a CyTOF image based on the nuclei and
    cell segmentation results.

    Inputs:
        channels   = channels to extract features from
        raw_image  = raw CyTOF image
        nuclei_seg = nuclei segmentation result
        cell_seg   = cell segmentation result
        filename   = filename of the current CyTOF image
        show_head  = whether to print the head of the resulting dataframe (Default=False)
    Returns:
        feature_summary_df = a dataframe containing a summary of the extracted features

    :param channels: list
    :param raw_image: numpy.ndarray
    :param nuclei_seg: numpy.ndarray
    :param cell_seg: numpy.ndarray
    :param filename: string
    :param show_head: bool
    :return feature_summary_df: pandas.core.frame.DataFrame
    """
    assert len(channels) == raw_image.shape[-1]

    # morphology features; "pa_ratio" (perimeter^2 / filled area) is derived below
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]

    nuclei_morphology = [_ + '_nuclei' for _ in morphology]
    cell_morphology = [_ + '_cell' for _ in morphology]

    sum_exp_nuclei = [_ + '_nuclei_sum' for _ in channels]
    ave_exp_nuclei = [_ + '_nuclei_ave' for _ in channels]

    sum_exp_cell = [_ + '_cell_sum' for _ in channels]
    ave_exp_cell = [_ + '_cell_ave' for _ in channels]

    column_names = ["filename", "id", "coordinate_x", "coordinate_y"] + \
                   sum_exp_nuclei + ave_exp_nuclei + nuclei_morphology + \
                   sum_exp_cell + ave_exp_cell + cell_morphology

    res = dict()
    for column_name in column_names:
        res[column_name] = []

    n_nuclei = np.max(nuclei_seg)
    for nuclei_id in tqdm(range(2, n_nuclei + 1), position=0, leave=True):
        # Skip ids without a matching nucleus or cell region *before* appending anything,
        # so that all columns in res stay the same length.
        regions = skimage.measure.regionprops((nuclei_seg == nuclei_id) * 1)
        if len(regions) < 1:
            continue
        this_nucleus = regions[0]
        regions = skimage.measure.regionprops((cell_seg == nuclei_id) * 1)
        if len(regions) < 1:
            continue
        this_cell = regions[0]
        res["filename"].append(filename)
        res["id"].append(nuclei_id)
        centroid_y, centroid_x = this_nucleus.centroid
        res['coordinate_x'].append(centroid_x)
        res['coordinate_y'].append(centroid_y)

        # morphology features (pa_ratio is handled separately)
        for i, feature in enumerate(morphology[:-1]):
            res[nuclei_morphology[i]].append(getattr(this_nucleus, feature))
            res[cell_morphology[i]].append(getattr(this_cell, feature))
        res[nuclei_morphology[-1]].append(1.0 * this_nucleus.perimeter ** 2 / this_nucleus.filled_area)
        res[cell_morphology[-1]].append(1.0 * this_cell.perimeter ** 2 / this_cell.filled_area)

        # sum and average marker expressions inside the nucleus and the cell
        for ch, marker in enumerate(channels):
            res[sum_exp_nuclei[ch]].append(np.sum(raw_image[nuclei_seg == nuclei_id, ch]))
            res[ave_exp_nuclei[ch]].append(np.average(raw_image[nuclei_seg == nuclei_id, ch]))
            res[sum_exp_cell[ch]].append(np.sum(raw_image[cell_seg == nuclei_id, ch]))
            res[ave_exp_cell[ch]].append(np.average(raw_image[cell_seg == nuclei_id, ch]))

    feature_summary_df = pd.DataFrame(res)
    if show_head:
        print(feature_summary_df.head())
    return feature_summary_df
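

# Usage sketch for extract_feature (a minimal, self-contained example on synthetic
# data; the toy arrays and the filename "demo.txt" are hypothetical):
#
#     rng = np.random.default_rng(0)
#     img = rng.random((64, 64, 2)).astype(np.float32)  # 2-channel toy image
#     nuclei = np.zeros((64, 64), dtype=int)
#     cells = np.zeros((64, 64), dtype=int)
#     nuclei[10:20, 10:20] = 2   # object ids start at 2 in this pipeline
#     cells[8:22, 8:22] = 2
#     df = extract_feature(["chA", "chB"], img, nuclei, cells, "demo.txt",
#                          show_head=True)
#     # one row per cell, with chA_nuclei_sum/..._ave, chA_cell_sum/..._ave and
#     # morphology columns such as area_nuclei and pa_ratio_cell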


def feature_quantile_normalization(feature_summary_df, features, qs=[75, 99]):
    """Calculate the q-th percentile of the selected features (pooled over all of them) for
    each q in qs, then normalize these features by the calculated percentile values. The
    feature_summary_df is updated in place with new "{feature}_{q}normed" columns. The
    distributions of the log2 features before and after normalization are also visualized.

    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features           = features to be normalized
        qs                 = percentile q values (Default=[75, 99])
    Returns:
        quantiles = percentile values for each feature and each q

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param qs: list
    :return quantiles: dict
    """
    expressions = []
    expressions_normed = dict((key, []) for key in qs)
    quantiles = {}
    colors = cm.rainbow(np.linspace(0, 1, len(qs)))
    # pool the expressions of all selected features
    for feat in features:
        quantiles[feat] = {}
        expressions.extend(feature_summary_df[feat])

    plt.hist(np.log2(np.array(expressions) + 0.0001), 100, density=True)
    for q, c in zip(qs, colors):
        # one pooled percentile value per q, shared by all features
        quantile_val = np.quantile(expressions, q / 100)
        plt.axvline(np.log2(quantile_val), label=f"{q}th percentile", c=c)
        print(f"{q}th percentile: {quantile_val}")
        for feat in features:
            quantiles[feat][q] = quantile_val
            normed = np.log2(feature_summary_df.loc[:, feat] / quantile_val + 0.0001)
            feature_summary_df.loc[:, f"{feat}_{q}normed"] = normed
            expressions_normed[q].extend(normed)
    plt.xlim(-15, 15)
    plt.xlabel("log2(expression of all markers)")
    plt.legend()
    plt.show()

    # compare the pooled distributions before and after normalization
    log_expressions = tuple([np.log2(np.array(expressions) + 0.0001)] + [expressions_normed[q] for q in qs])
    labels = ["before normalization"] + [f"after {q} normalization" for q in qs]
    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.hist(log_expressions, 100, density=True, label=labels)
    ax.set_xlabel("log2(expressions for all markers)")
    plt.legend()
    plt.show()
    return quantiles
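

# Usage sketch for feature_quantile_normalization (hypothetical column names;
# illustrative only):
#
#     feats = ["chA_cell_sum", "chB_cell_sum"]
#     quantiles = feature_quantile_normalization(df, feats, qs=[75, 99])
#     # df now also holds "chA_cell_sum_75normed", "chA_cell_sum_99normed", ...,
#     # and quantiles[feat][q] stores the pooled percentile each feature was
#     # divided by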


def feature_scaling(feature_summary_df, features, inplace=False):
    """Perform mean-std scaling on selected features. Normally, the nuclei sum features are
    not scaled.

    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features           = features to perform scaling on
        inplace            = whether to perform the scaling in place (Default=False)
    Returns:
        scaled_feature_summary_df = the scaled dataframe (only returned when inplace=False)

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param inplace: bool
    """
    scaled_feature_summary_df = feature_summary_df if inplace else feature_summary_df.copy()

    for feat in features:
        if feat not in feature_summary_df.columns:
            print(f"Warning: {feat} not available!")
            continue
        scaled_feature_summary_df[feat] = \
            (scaled_feature_summary_df[feat] - np.average(scaled_feature_summary_df[feat])) \
            / np.std(scaled_feature_summary_df[feat])
    if not inplace:
        return scaled_feature_summary_df
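

# Usage sketch for feature_scaling (illustrative only):
#
#     scaled = feature_scaling(df, ["area_cell", "chA_cell_sum_75normed"])
#     feature_scaling(df, ["area_cell"], inplace=True)  # modifies df directly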


def generate_summary(feature_summary_df, features, thresholds):
    """Generate a (cell-level) summary table for each feature in features: feature name,
    total number (of cells), calculated GMM threshold for this feature, number of cells with
    values greater than the threshold, and the ratio of such cells.

    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features           = a list of features to summarize
        thresholds         = (calculated GMM-based) thresholds for each feature
    Outputs:
        df_info = summary table for each feature

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param thresholds: dict
    :return df_info: pandas.core.frame.DataFrame
    """
    df_info = pd.DataFrame(columns=['feature', 'total number', 'threshold', 'positive counts', 'positive ratio'])

    for feature in features:
        thres = thresholds[feature]
        X = feature_summary_df[feature].values
        n = sum(X > thres)
        N = len(X)
        df_new_row = pd.DataFrame({'feature': feature, 'total number': N, 'threshold': thres,
                                   'positive counts': n, 'positive ratio': n / N}, index=[0])
        df_info = pd.concat([df_info, df_new_row])
    return df_info
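

# Usage sketch for generate_summary (toy threshold dict; illustrative only):
#
#     thresholds = {"chA_cell_sum": 1.5}
#     df_info = generate_summary(df, ["chA_cell_sum"], thresholds)
#     # one row per feature: feature, total number, threshold, positive counts,
#     # positive ratio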


def batch_extract_feature(files, markers, nuclei_markers, membrane_markers=None, show_vis=False):
    """Extract features for CyTOF images from a list of files. Normally this list contains
    ROIs of the same slide.

    Inputs:
        files            = a list of files to be processed
        markers          = a list of marker names used when generating the image
        nuclei_markers   = a list of markers defining the nuclei channel (used for nuclei
                           segmentation)
        membrane_markers = a list of markers defining the membrane channel (used for cell
                           segmentation) (Default=None)
        show_vis         = whether to show visualizations while processing
    Outputs:
        file_features = a dictionary containing the extracted features for each file

    :param files: list
    :param markers: list
    :param nuclei_markers: list
    :param membrane_markers: list
    :param show_vis: bool
    :return file_features: dict
    """
    file_features = {}
    for f in tqdm(files):
        # read and preprocess the raw CyTOF data, then define the special channels
        df = pre.cytof_read_data(f)
        df_ = pre.cytof_preprocess(df)
        column_names = markers[:]
        df_output = pre.define_special_channel(df_, 'nuclei', markers=nuclei_markers)
        column_names.insert(0, 'nuclei')
        if membrane_markers is not None:
            df_output = pre.define_special_channel(df_output, 'membrane', markers=membrane_markers)
            column_names.append('membrane')
        raw_image = pre.cytof_txt2img(df_output, marker_names=column_names)

        if show_vis:
            merged_im, _ = pre.cytof_merge_channels(raw_image, channel_ids=[0, -1], quantiles=None, visualize=False)
            plt.imshow(merged_im[0:200, 200:400, ...])
            plt.title('Selected region of raw cytof image')
            plt.show()

        # nuclei segmentation, then cell segmentation
        nuclei_img = raw_image[..., column_names.index('nuclei')]
        nuclei_seg, color_dict = seg.cytof_nuclei_segmentation(nuclei_img, show_process=False)
        if membrane_markers is not None:
            membrane_img = raw_image[..., column_names.index('membrane')]
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, membrane_channel=membrane_img, show_process=False)
        else:
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, show_process=False)
        if show_vis:
            marked_image_nuclei = seg.visualize_segmentation(raw_image, nuclei_seg, channel_ids=(0, -1), show=False)
            marked_image_cell = seg.visualize_segmentation(raw_image, cell_seg, channel_ids=(-1, 0), show=False)
            fig, axs = plt.subplots(1, 2, figsize=(10, 6))
            axs[0].imshow(marked_image_nuclei[0:200, 200:400, :])
            axs[0].set_title('nuclei segmentation')
            axs[1].imshow(marked_image_cell[0:200, 200:400, :])
            axs[1].set_title('cell segmentation')
            plt.show()

        # feature extraction: use column_names rather than markers + 'nuclei' so that the
        # channel list always matches raw_image's channels (including 'membrane' when a
        # membrane channel was defined); extract_feature asserts this correspondence
        feat_names = column_names[:]
        df_feat_sum = extract_feature(feat_names, raw_image, nuclei_seg, cell_seg, filename=f)
        file_features[f] = df_feat_sum
    return file_features
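

# Usage sketch for batch_extract_feature (hypothetical file list and marker panel;
# illustrative only):
#
#     files = sorted(glob.glob("slide1/*.txt"))
#     markers = ["CD3", "CD4", "CD8", "E-cadherin"]
#     feats = batch_extract_feature(files, markers,
#                                   nuclei_markers=["DNA1", "DNA2"],
#                                   membrane_markers=["E-cadherin"])
#     # feats maps each file name to its per-cell feature dataframe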


def batch_norm_scale(file_features, column_names, qs=[75, 99]):
    """Perform feature log transform, quantile normalization and scaling in a batch.

    Inputs:
        file_features = a dictionary of dataframes containing extracted features.
                        key - file name, item - feature table
        column_names  = a list of markers, consistent with the column names in the feature
                        dataframes
        qs            = percentile q values (Default=[75, 99])
    Outputs:
        file_features_out = log transformed, quantile normalized and scaled features for
                            each file in the batch
        quantiles         = a dictionary of quantile values for each file in the batch

    :param file_features: dict
    :param column_names: list
    :param qs: list
    :return file_features_out: dict
    :return quantiles: dict
    """
    file_features_out = copy.deepcopy(file_features)

    cell_markers_sum = [_ + '_cell_sum' for _ in column_names]
    cell_markers_ave = [_ + '_cell_ave' for _ in column_names]
    nuclei_markers_sum = [_ + '_nuclei_sum' for _ in column_names]
    nuclei_markers_ave = [_ + '_nuclei_ave' for _ in column_names]

    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]
    cell_morphology = [_ + '_cell' for _ in morphology]

    # normalize all marker expression features except the special 'nuclei' channel
    features_to_norm = [x for x in nuclei_markers_sum + nuclei_markers_ave + cell_markers_sum + cell_markers_ave
                        if not x.startswith('nuclei')]

    # scale every feature; for marker expression features (again excluding the special
    # 'nuclei' channel) also scale their quantile-normalized versions
    scale_features = []
    for feature_name in nuclei_morphology + cell_morphology + nuclei_markers_sum + nuclei_markers_ave + \
            cell_markers_sum + cell_markers_ave:
        temp = [feature_name]
        if feature_name not in nuclei_morphology + cell_morphology and not feature_name.startswith('nuclei'):
            for q in qs:
                temp += [f"{feature_name}_{q}normed"]
        scale_features += temp

    quantiles = {}
    for f, df in file_features_out.items():
        print(f)
        quantiles[f] = feature_quantile_normalization(df, features=features_to_norm, qs=qs)
        feature_scaling(df, features=scale_features, inplace=True)
    return file_features_out, quantiles
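

# Usage sketch for batch_norm_scale (continuing from batch_extract_feature;
# illustrative only):
#
#     normed, quantiles = batch_norm_scale(feats, ["nuclei"] + markers, qs=[75, 99])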


def batch_scale_feature(outdir, normqs, df_io=None, files_scale=None):
    """Scale the quantile-normalized features of a batch using previously saved scaling
    parameters.

    Inputs:
        outdir      = output saving directory, which contains the scale file generated
                      previously, the input_output.csv file listing the available cytof_img
                      class instances in the batch, and the previously saved cytof_img class
                      instances (.pkl files)
        normqs      = a list of q values of percentile normalization
        df_io       = optional dataframe of input/output files; read from input_output.csv
                      in outdir when None (Default=None)
        files_scale = full file names of the scaling information, one per q value
                      (Default=None)
    Outputs: None
        Scaled features are saved as .csv files in the subfolder "feature_qnormed_scaled" in
        outdir. A new attribute is added to each cytof_img class instance, and the updated
        instance is saved back in place.
    """
    if df_io is None:
        df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))

    for _i, normq in enumerate(normqs):
        n_attr = f"df_feature_{normq}normed"
        n_attr_scaled = f"{n_attr}_scaled"
        file_scale = files_scale[_i] if files_scale is not None \
            else os.path.join(outdir, "{}normed_scale_params.csv".format(normq))

        dirq = os.path.join(outdir, f"feature_{normq}normed_scaled")
        if not os.path.exists(dirq):
            os.makedirs(dirq)

        # first row of the scale file holds the means, second row the standard deviations
        df_scale = pd.read_csv(file_scale, index_col=False)
        m = df_scale[df_scale.columns].iloc[0]
        s = df_scale[df_scale.columns].iloc[1]

        for f_cytof in df_io['output_file']:
            cytof_img = pkl.load(open(f_cytof, "rb"))
            assert hasattr(cytof_img, n_attr), f"attribute {n_attr} does not exist"
            df_feat = copy.deepcopy(getattr(cytof_img, n_attr))

            # every column with scaling parameters must be present in the feature table
            assert len([x for x in df_scale.columns if x not in df_feat.columns]) == 0

            df_feat[df_scale.columns] = (df_feat[df_scale.columns] - m) / s

            df_feat.to_csv(os.path.join(dirq, os.path.basename(f_cytof).replace('.pkl', '.csv')), index=False)

            setattr(cytof_img, n_attr_scaled, df_feat)

            pkl.dump(cytof_img, open(f_cytof, "wb"))


def batch_generate_summary(outdir, feature_type="normed", normq=75, scaled=True, vis_thres=False):
    """Generate marker positivity summaries for every ROI in a batch.

    Inputs:
        outdir       = output saving directory, which contains the scale file generated
                       previously, as well as previously saved cytof_img class instances in
                       .pkl files
        feature_type = type of feature to be used; available choices: "original", "normed",
                       "scaled" (Default="normed")
        normq        = q value of quantile normalization (Default=75)
        scaled       = whether to use the scaled version of the features (Default=True)
        vis_thres    = whether to visualize the process of calculating thresholds
                       (Default=False)
    Outputs:
        dir_sum = the directory the summaries are saved to
        Two .csv files, one for cell sum and the other for cell average features, are saved
        for each ROI in the subfolder "marker_summary" under outdir, containing the
        threshold and cell count information of each feature.
    """
    assert feature_type in ["original", "normed", "scaled"], \
        'accepted feature types are "original", "normed", "scaled"'
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    n_attr = f"df_feature_{feat_name}"

    dir_sum = os.path.join(outdir, "marker_summary", feat_name)
    print(dir_sum)
    if not os.path.exists(dir_sum):
        os.makedirs(dir_sum)

    seen = 0
    dfs = {}
    cytofs = {}
    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    for f in df_io['output_file'].tolist():
        cytof_img = pkl.load(open(f, "rb"))

        dfs[f] = getattr(cytof_img, n_attr)
        cytofs[f] = cytof_img

        if seen == 0:
            feat_cell_sum = cytof_img.features['cell_sum']
            feat_cell_ave = cytof_img.features['cell_ave']
        seen += 1

    # pool all ROIs, then compute one GMM-based threshold per feature for the whole batch
    # (_get_thresholds is expected to be defined elsewhere in this pipeline)
    all_df = pd.concat(dfs.values(), ignore_index=True)
    print("Getting thresholds for marker sum")
    thres_sum = _get_thresholds(all_df, feat_cell_sum, visualize=vis_thres)
    print("Getting thresholds for marker average")
    thres_ave = _get_thresholds(all_df, feat_cell_ave, visualize=vis_thres)
    for f, cytof_img in cytofs.items():
        f_roi = os.path.basename(f).split(".pkl")[0]
        df_info_cell_sum_f = generate_summary(dfs[f], features=feat_cell_sum, thresholds=thres_sum)
        df_info_cell_ave_f = generate_summary(dfs[f], features=feat_cell_ave, thresholds=thres_ave)
        setattr(cytof_img, f"cell_count_{feat_name}_sum", df_info_cell_sum_f)
        setattr(cytof_img, f"cell_count_{feat_name}_ave", df_info_cell_ave_f)
        df_info_cell_sum_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_sum.csv"), index=False)
        df_info_cell_ave_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_ave.csv"), index=False)
        pkl.dump(cytof_img, open(f, "wb"))
    return dir_sum


def _gather_roi_expressions(df_io, normqs=[75]):
    """Gather cell-level sum expressions, raw and normalized, for each ROI."""
    expressions = {}
    expressions_normed = {}
    for roi in df_io["ROI"].unique():
        expressions[roi] = []
        f_cytof_im = df_io.loc[df_io["ROI"] == roi, "output_file"].values[0]
        cytof_im = load_CytofImage(f_cytof_im)
        for feature_name in cytof_im.features['cell_sum']:
            expressions[roi].extend(cytof_im.df_feature[feature_name])
        expressions_normed[roi] = dict((q, []) for q in normqs)
        for q in expressions_normed[roi].keys():
            normed_feat = getattr(cytof_im, "df_feature_{}normed".format(q))
            for feature_name in cytof_im.features['cell_sum']:
                expressions_normed[roi][q].extend(normed_feat[feature_name])
    return expressions, expressions_normed


def visualize_normalization(df_slide_roi, normqs=[75], level="slide"):
    """Visualize the distribution of cellular expressions before and after percentile
    normalization, pooled per slide or per ROI."""
    expressions_, expressions_normed_ = _gather_roi_expressions(df_slide_roi, normqs=normqs)
    if level == "slide":
        prefix = "Slide"
        expressions, expressions_normed = {}, {}
        # pool the ROI-level expressions of each slide
        for slide in df_slide_roi["Slide"].unique():
            f_rois = df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"].values
            rois = [x.replace('.txt', '') for x in f_rois]
            expressions[slide] = []
            expressions_normed[slide] = dict((q, []) for q in normqs)
            for roi in rois:
                expressions[slide].extend(expressions_[roi])
                for q in expressions_normed[slide].keys():
                    expressions_normed[slide][q].extend(expressions_normed_[roi][q])
    else:
        expressions, expressions_normed = expressions_, expressions_normed_
        prefix = "ROI"
    num_q = len(normqs)
    for key, key_exp in expressions.items():
        print("Showing {} {}".format(prefix, key))
        fig, ax = plt.subplots(1, num_q + 1, figsize=(4 * (num_q + 1), 4))
        ax[0].hist((np.log2(np.array(key_exp) + 0.0001),), 100, density=True)
        ax[0].set_title("Before normalization")
        ax[0].set_xlabel("log2(cellular expression of all markers)")
        for i, q in enumerate(normqs):
            ax[i + 1].hist((np.array(expressions_normed[key][q]) + 0.0001,), 100, density=True)
            ax[i + 1].set_title("After {}-th percentile normalization".format(q))
            ax[i + 1].set_xlabel("log2(cellular expression of all markers)")
        plt.show()
    return expressions, expressions_normed


def _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type):
    """ROI-level co-expression analysis: count, for every marker pair, the cells positive
    for both markers, together with the counts expected under independence."""
    n_attr = f"df_feature_{feat_name}"
    expected_percentages = {}
    edge_percentages = {}
    num_cells = {}
    marker_col_all, marker_all, n_marker = None, None, None

    for f_roi in df_slide_roi["ROI"].unique():
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)

        if marker_col_all is None:  # first available ROI defines the marker columns
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)
        n_cell = len(df_feat)

        df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
        pos_nums = df_info_cell["positive counts"].values
        thresholds = df_info_cell["threshold"].values

        # co-positive counts expected under marker independence: n_i * n_j
        expected_percentage = np.zeros((n_marker, n_marker))
        for ii in range(n_marker):
            for jj in range(n_marker):
                expected_percentage[ii, jj] = pos_nums[ii] * pos_nums[jj]
        expected_percentages[roi] = expected_percentage

        # observed co-positive counts
        edge_nums = np.zeros_like(expected_percentage)
        for ii in range(n_marker):
            _x = df_feat[marker_col_all[ii]].values > thresholds[ii]
            for jj in range(n_marker):
                _y = df_feat[marker_col_all[jj]].values > thresholds[jj]
                edge_nums[ii, jj] = np.sum(np.all([_x, _y], axis=0))
        edge_percentages[roi] = edge_nums
        num_cells[roi] = n_cell
    return expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all
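

# Note on the enrichment score computed downstream: with N cells, n_i cells positive
# for marker i and n_ij cells positive for both i and j, the score is
# log10((n_ij / N) / (n_i * n_j / N**2) + 0.1). Under independence n_ij is about
# n_i * n_j / N, so the ratio is about 1 and the score is about log10(1.1).
# Toy numbers (hypothetical): N = 1000, n_i = 100, n_j = 200, n_ij = 50 gives
# (50 / 1000) / (100 * 200 / 1000**2) = 2.5, i.e. a 2.5x enrichment.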


def co_expression_analysis(df_slide_roi, outdir, feature_type, accumul_type, co_exp_markers="all", normq=75,
                           level="slide", clustergrid=None):
    """Marker co-expression analysis at slide or ROI level, visualized as heatmaps and
    clustermaps of the log10 observed/expected co-positive fractions."""
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)
    dir_cytof_img = os.path.join(outdir, "cytof_images")

    expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)

    if co_exp_markers != "all":
        # restrict the analysis to the requested subset of markers
        assert isinstance(co_exp_markers, list) and all([x in marker_all for x in co_exp_markers])
        marker_idx = np.array([marker_all.index(x) for x in co_exp_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    if level == "slide":
        # aggregate the ROI-level counts of each slide
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue
                if slide not in expected_percentages:
                    expected_percentages[slide] = expected_percentages[roi]
                    edge_percentages[slide] = edge_percentages[roi]
                    num_cells[slide] = num_cells[roi]
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_cells.pop(roi)

    co_exps = {}
    for key, expected_percentage in expected_percentages.items():
        # convert counts to fractions; the expected fraction assumes marker independence
        expected_percentage = expected_percentage / num_cells[key] ** 2
        edge_percentage = edge_percentages[key] / num_cells[key]

        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 yields NaN; map it to the "no enrichment" value log10(1 + 0.1)
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)

        co_exps[key] = edge_percentage_norm

    for f_key, edge_percentage_norm in co_exps.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(edge_percentage_norm[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=3,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx],
                                         center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        # re-plot with rows and columns in the clustered order
        plt.figure()
        sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                       xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return co_exps, marker_idx, clustergrid
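

# Usage sketch for co_expression_analysis (hypothetical cohort table; illustrative
# only):
#
#     df_slide_roi = pd.read_csv("cohort.csv")  # columns include Slide and ROI
#     co_exps, marker_idx, grid = co_expression_analysis(
#         df_slide_roi, outdir="./output", feature_type="normed",
#         accumul_type="sum", normq=75, level="slide")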


def _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type):
    """ROI-level correlation analysis: gather the feature table of every available ROI."""
    n_attr = f"df_feature_{feat_name}"
    feats = {}
    marker_col_all, marker_all = None, None

    for f_roi in df_slide_roi["ROI"].unique():
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        feats[roi] = df_feat

        if marker_col_all is None:  # first available ROI defines the marker columns
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
    return feats, marker_all, marker_col_all


def correlation_analysis(df_slide_roi, outdir, feature_type, accumul_type, corr_markers="all", normq=75, level="slide",
                         clustergrid=None):
    """Spearman correlation analysis of marker expressions at slide or ROI level, visualized
    as heatmaps and clustermaps."""
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)
    dir_cytof_img = os.path.join(outdir, "cytof_images")

    feats, marker_all, marker_col_all = _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type)
    n_marker = len(marker_all)

    corrs = {}

    if level == "slide":
        # concatenate the ROI-level feature tables of each slide
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in feats:
                    continue
                if slide not in feats:
                    feats[slide] = feats[roi]
                else:
                    feats[slide] = pd.concat([feats[slide], feats[roi]])
                feats.pop(roi)

    for key, feat in feats.items():
        correlation = np.zeros((n_marker, n_marker))
        for i, feature_i in enumerate(marker_col_all):
            for j, feature_j in enumerate(marker_col_all):
                correlation[i, j] = spearmanr(feat[feature_i].values, feat[feature_j].values).correlation
        corrs[key] = correlation

    if corr_markers != "all":
        # restrict the visualization to the requested subset of markers
        assert isinstance(corr_markers, list) and all([x in marker_all for x in corr_markers])
        marker_idx = np.array([marker_all.index(x) for x in corr_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))
        corr_markers = marker_all  # use all marker names as tick labels

    for f_key, corr in corrs.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(corr[marker_idx, :][:, marker_idx], center=0,
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=corr_markers, yticklabels=corr_markers)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(corr[marker_idx, :][:, marker_idx],
                                         center=0, cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=corr_markers, yticklabels=corr_markers, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        # re-plot with rows and columns in the clustered order
        plt.figure()
        sns.clustermap(corr[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=0, cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=np.array(corr_markers)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(corr_markers)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return corrs, marker_idx, clustergrid


# DistanceMetric moved from sklearn.neighbors to sklearn.metrics in newer versions of
# scikit-learn; try the new location first and fall back to the old one.
try:
    from sklearn.metrics import DistanceMetric
except ImportError:
    from sklearn.neighbors import DistanceMetric


def _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all", thres_dist=50):
    """ROI-level interaction analysis: count marker-positive pairs among neighboring cells,
    where two cells are neighbors if their centroids are closer than thres_dist pixels."""
    dist = DistanceMetric.get_metric('euclidean')
    n_attr = f"df_feature_{feat_name}"
    edge_percentages = {}
    num_edges = {}
    marker_col_all, marker_all, n_marker = None, None, None
    for f_roi in df_slide_roi["ROI"].unique():
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        n_cell = len(df_feat)
        dist_matrix = dist.pairwise(df_feat.loc[:, ['coordinate_x', 'coordinate_y']].values)

        if marker_col_all is None:  # first available ROI defines the marker columns
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)

        df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
        thresholds = df_info_cell["threshold"].values

        n_edges = 0
        edge_nums = np.zeros((n_marker, n_marker))

        # the set of markers each cell is positive for
        cluster_sub = []
        for i_cell in range(n_cell):
            _temp = set()
            for k in range(n_marker):
                if df_feat[marker_col_all[k]].values[i_cell] > thresholds[k]:
                    _temp = _temp | {k}
            cluster_sub.append(_temp)

        # count marker pairs over all ordered pairs of neighboring cells
        for i in tqdm(range(n_cell)):
            for j in range(n_cell):
                if 0 < dist_matrix[i, j] < thres_dist:
                    n_edges += 1
                    for m in cluster_sub[i]:
                        for n in cluster_sub[j]:
                            edge_nums[m, n] += 1

        edge_percentages[roi] = edge_nums
        num_edges[roi] = n_edges
    return edge_percentages, num_edges, marker_all, marker_col_all
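

# Equivalent vectorized computation of the counting loops above (a sketch, assuming
# df_feat, marker_col_all, thresholds, dist_matrix and thres_dist as defined inside
# _gather_roi_interact); this avoids the O(n_cell^2) Python loop:
#
#     pos = (df_feat[marker_col_all].values > thresholds).astype(float)
#     adj = ((dist_matrix > 0) & (dist_matrix < thres_dist)).astype(float)
#     edge_nums = pos.T @ adj @ pos   # (n_marker, n_marker)
#     n_edges = int(adj.sum())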


def marker_interaction_analysis(df_slide_roi,
                                outdir,
                                feature_type,
                                accumul_type,
                                interact_markers="all",
                                normq=75,
                                level="slide",
                                thres_dist=50,
                                clustergrid=None):
    """Marker-level spatial interaction analysis at slide or ROI level (cf. the
    phenotype-level interaction_analysis defined further below)."""
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)
    dir_cytof_img = os.path.join(outdir, "cytof_images")

    expected_percentages, _, num_cells, marker_all_, marker_col_all_ = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)
    edge_percentages, num_edges, marker_all, marker_col_all = \
        _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all",
                             thres_dist=thres_dist)

    if level == "slide":
        # aggregate the ROI-level counts of each slide
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue
                if slide not in expected_percentages:
                    expected_percentages[slide] = expected_percentages[roi]
                    edge_percentages[slide] = edge_percentages[roi]
                    num_edges[slide] = num_edges[roi]
                    num_cells[slide] = num_cells[roi]
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_edges[slide] += num_edges[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_edges.pop(roi)
                num_cells.pop(roi)

    if interact_markers != "all":
        # restrict the visualization to the requested subset of markers
        assert isinstance(interact_markers, list) and all([x in marker_all for x in interact_markers])
        marker_idx = np.array([marker_all.index(x) for x in interact_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))
        interact_markers = marker_all  # use all marker names as tick labels

    interacts = {}
    for key, edge_percentage in edge_percentages.items():
        expected_percentage = expected_percentages[key] / num_cells[key] ** 2
        edge_percentage = edge_percentage / num_edges[key]

        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 yields NaN; map it to the "no enrichment" value log10(1 + 0.1)
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)
        interacts[key] = edge_percentage_norm

    for f_key, interact_ in interacts.items():
        interact = interact_[marker_idx, :][:, marker_idx]
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=interact_markers, yticklabels=interact_markers)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=interact_markers, yticklabels=interact_markers, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        # re-plot with rows and columns in the clustered order
        plt.figure()
        sns.clustermap(
            interact[clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
            center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
            xticklabels=np.array(interact_markers)[clustergrid.dendrogram_row.reordered_ind],
            yticklabels=np.array(interact_markers)[clustergrid.dendrogram_row.reordered_ind],
            figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return interacts, clustergrid


def clustering_phenograph(cohort_file, outdir, normq=75, feat_comb="all", k=None, save_vis=False, pheno_markers="all"):
    """Perform PhenoGraph clustering for the cohort.

    Inputs:
        cohort_file   = a .csv file describing the whole cohort
        outdir        = output saving directory with previously saved cytof_img class
                        instances in .pkl files
        normq         = q value for quantile normalization (Default=75)
        feat_comb     = feature combination to use for PhenoGraph clustering; acceptable
                        choices: "all", "cell_sum", "cell_ave", "cell_sum_only",
                        "cell_ave_only" (Default="all")
        k             = number of initial neighbors for PhenoGraph (Default=None).
                        If k is not provided, it is set to N / 100, where N is the total
                        number of single cells
        save_vis      = whether to save the visualization output (Default=False)
        pheno_markers = a list of markers used in PhenoGraph clustering (must be a subset of
                        cytof_img.markers), or "all" (Default="all")
    Outputs:
        df_all      = a dataframe of features for all cells in the cohort, with the
                      clustering output in the column 'phenotype_total{n_community}', where
                      n_community is the total number of communities found in the cohort
        feat_names  = feature names (columns) used to generate the PhenoGraph output
        k           = the initial number of neighbors used to run PhenoGraph
        pheno_name  = the name of the added column holding the PhenoGraph cluster
        vis_savedir = the directory the visualization output is saved to
        markers     = the list of markers used (minimal, for visualization purposes)
    """
    vis_savedir = ""
    feat_groups = {
        "all": ["cell_sum", "cell_ave", "cell_morphology"],
        "cell_sum": ["cell_sum", "cell_morphology"],
        "cell_ave": ["cell_ave", "cell_morphology"],
        "cell_sum_only": ["cell_sum"],
        "cell_ave_only": ["cell_ave"]
    }
    assert feat_comb in feat_groups.keys(), f"{feat_comb} not supported!"

    feat_name = f"_{normq}normed_scaled"
    n_attr = f"df_feature{feat_name}"

    dfs = {}
    cytof_ims = {}

    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    df_slide_roi = pd.read_csv(cohort_file)

    dict_feat, markers = None, None
    for i in df_io.index:
        f_out = df_io.loc[i, "output_file"]
        f_roi = os.path.basename(f_out).split('.pkl')[0]
        if not os.path.isfile(f_out):
            print("{} not found, skip".format(f_out))
            continue

        cytof_img = load_CytofImage(f_out)
        if dict_feat is None:  # first available file defines features and markers
            dict_feat = cytof_img.features
            markers = cytof_img.markers
        cytof_ims[f_roi] = cytof_img
        dfs[f_roi] = getattr(cytof_img, n_attr)

    # collect the feature columns used for clustering
    feat_names = []
    for y in feat_groups[feat_comb]:
        if "morphology" in y:
            feat_names += dict_feat[y]
        else:
            if pheno_markers == "all":
                feat_names += dict_feat[y]
                pheno_markers = markers
            else:
                assert isinstance(pheno_markers, list)
                ids = [markers.index(x) for x in pheno_markers]
                feat_names += [dict_feat[y][x] for x in ids]

    df_all = pd.concat([_ for key, _ in dfs.items()])

    # run PhenoGraph; by default k scales with the cohort size
    k = k if k else int(df_all.shape[0] / 100)
    communities, graph, Q = phenograph.cluster(df_all[feat_names], k=k, n_jobs=-1)
    n_community = len(np.unique(communities))

    # 2d UMAP embedding of the clustered features
    umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
    proj_2d = umap_2d.fit_transform(df_all[feat_names])

    print("Visualization in 2d - cohort")
    plt.figure(figsize=(4, 4))
    plt.title("cohort")
    sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                    hue_order=np.arange(n_community))
    plt.axis('tight')
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    if save_vis:
        vis_savedir = os.path.join(outdir, "phenograph_{}_{}normed_{}".format(feat_comb, normq, k))
        if not os.path.exists(vis_savedir):
            os.makedirs(vis_savedir)
        plt.savefig(os.path.join(vis_savedir, "cluster_scatter.png"))
    plt.show()

    pheno_name = f'phenotype_total{n_community}'
    df_all[pheno_name] = communities
    df_all['{}_projx'.format(pheno_name)] = proj_2d[:, 0]
    df_all['{}_projy'.format(pheno_name)] = proj_2d[:, 1]
    return df_all, feat_names, k, pheno_name, vis_savedir, markers
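

# Usage sketch for clustering_phenograph (hypothetical cohort file; illustrative
# only):
#
#     df_all, feat_names, k, pheno_name, vis_dir, markers = clustering_phenograph(
#         "cohort.csv", outdir="./output", normq=75, feat_comb="cell_sum",
#         save_vis=True)
#     print(df_all[pheno_name].value_counts())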


def _gather_roi_pheno(df_slide_roi, df_all):
    """Split whole df into df for each ROI"""
    pheno_roi = {}

    for i in df_slide_roi.index:
        path_i = df_slide_roi.loc[i, "path"]
        roi_i = df_slide_roi.loc[i, "ROI"]
        f_in = os.path.join(path_i, roi_i)
        cond = df_all["filename"] == f_in
        pheno_roi[roi_i.replace(".txt", "")] = df_all.loc[cond, :]
    return pheno_roi


def _vis_cell_phenotypes(df_feat, communities, n_community, markers, list_features, accumul_type="sum", savedir=None,
                         savename=""):
    """Visualize cell phenotypes for a given dataframe of features.

    Args:
        df_feat: a dataframe of features
        communities: a list of communities (can be a subset of the cohort communities, but
            should be consistent with df_feat)
        n_community: number of communities in the cohort (n_community >= number of unique
            values in communities)
        markers: a list of markers used in the CyTOF image (shown in the heatmap)
        list_features: a list of feature names (consistent with columns in df_feat)
        accumul_type: feature aggregation type, either "sum" or "ave" (default="sum")
        savedir: results saving directory. If not None, visualization plots are saved there
            (default=None)
        savename: prefix of the saved plot file name (default="")
    Returns:
        cell_cluster: an (N, M) matrix, where N = number of clustered communities and
            M = number of markers
        cell_cluster_norm: the normalized form of cell_cluster (normalized by subtracting
            the per-marker median)
    """
    assert accumul_type in ["sum", "ave"], "Wrong accumulation type! Choose from 'sum' and 'ave'!"
    cell_cluster = np.zeros((n_community, len(markers)))
    for cluster in range(len(np.unique(communities))):
        df_sub = df_feat[communities == cluster]
        if df_sub.shape[0] == 0:
            continue
        # average expression of each feature within this cluster
        for i, feat in enumerate(list_features):
            cell_cluster[cluster, i] = np.average(df_sub[feat])
    cell_cluster_norm = cell_cluster - np.median(cell_cluster, axis=0)
    sns.heatmap(cell_cluster_norm,
                cmap='magma',
                xticklabels=markers,
                yticklabels=np.arange(len(np.unique(communities)))
                )
    plt.xlabel("Markers - {}".format(accumul_type))
    plt.ylabel("Phenograph clusters")
    plt.title("normalized expression - cell {}".format(accumul_type))
    savename += "_cell_{}.png".format(accumul_type)
    if savedir is not None:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        plt.savefig(os.path.join(savedir, savename))
    plt.show()
    return cell_cluster, cell_cluster_norm


def vis_phenograph(df_slide_roi, df_all, pheno_name, markers, used_feat, level="cohort", accumul_type="sum",
                   to_save=False, savepath="./", vis_scatter=False):
    """Visualize PhenoGraph clustering results at cohort, slide or ROI level.

    Args:
        df_slide_roi = a dataframe with slide-ROI correspondence information
        df_all       = a dataframe with features and clustering results
        pheno_name   = name (key) of the PhenoGraph output column
        markers      = a (minimal) list of markers used in PhenoGraph (to visualize)
        used_feat    = a list of the features used (consistent with the columns of df_all)
        level        = level to visualize, one of "cohort", "slide" or "roi"
                       (default="cohort")
        accumul_type = type of feature accumulation used (default="sum")
        to_save      = whether to save the output (default=False)
        savepath     = visualization saving directory (default="./")
        vis_scatter  = whether to also show the 2d UMAP scatter plot (default=False)
    """
    if to_save:
        if not os.path.exists(savepath):
            os.makedirs(savepath)

    # features corresponding to the chosen accumulation type
    ids = [i for (i, x) in enumerate(used_feat) if re.search(".{}".format(accumul_type), x)]
    list_feat = [used_feat[i] for i in ids]

    assert level in ["cohort", "slide", "roi"], "Only 'cohort', 'slide' or 'roi' levels are accepted!"

    n_community = len(df_all[pheno_name].unique())
    if level == "cohort":
        phenos = {level: df_all}
    else:
        phenos = _gather_roi_pheno(df_slide_roi, df_all)
        if level == "slide":
            # pool the ROI-level dataframes of each slide
            for slide in df_slide_roi["Slide"].unique():
                for roi_i in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                    f_roi = roi_i.replace(".txt", "")
                    if f_roi not in phenos:
                        continue
                    if slide not in phenos:
                        phenos[slide] = phenos[f_roi]
                    else:
                        phenos[slide] = pd.concat([phenos[slide], phenos[f_roi]])
                    phenos.pop(f_roi)

    savepath_ = None
    savename = ""
    for key, df_pheno in phenos.items():
        if to_save:
            savepath_ = os.path.join(savepath, level)
            savename = key
        communities = df_pheno[pheno_name]

        _vis_cell_phenotypes(df_pheno, communities, n_community, markers, list_feat, accumul_type,
                             savedir=savepath_, savename=savename)

        if vis_scatter:
            proj_2d = df_pheno[['{}_projx'.format(pheno_name), '{}_projy'.format(pheno_name)]].to_numpy()

            plt.figure(figsize=(4, 4))
            plt.title(key)
            sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                            hue_order=np.arange(n_community))
            plt.axis('tight')
            plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
            if to_save:
                plt.savefig(os.path.join(savepath_, "scatter_{}.png".format(savename)))
            plt.show()
    return phenos


import sklearn.neighbors
from sklearn.neighbors import kneighbors_graph as skgraph
from scipy import sparse as sp
import networkx as nx


def _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist=50):
    """Gather, for each ROI, the pairwise cell distance matrix and the phenotype-pair edge
    counts between neighboring cells (centroids closer than thres_dist pixels)."""
    dist = DistanceMetric.get_metric('euclidean')
    dist_matrices = {}
    n_cluster = None
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        dist_matrices[roi] = {}
        dist_matrices[roi]['dist'] = dist.pairwise(df_sub.loc[:, ['coordinate_x', 'coordinate_y']].values)

        # avoid shadowing the module-level phenograph import
        pheno = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = pheno['clusters'].values

        if n_cluster is None:  # first available ROI defines the number of clusters
            n_cluster = pheno['num_community']

        # phenotype-pair counts expected under independence
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        dist_matrices[roi]['expected_percentage'] = expected_percentage
        dist_matrices[roi]['num_cell'] = len(df_sub)

        # count phenotype pairs over all ordered pairs of neighboring cells
        edge_nums = np.zeros_like(expected_percentage)
        dist_matrix = dist_matrices[roi]['dist']
        n_cells = dist_matrix.shape[0]
        for _i in range(n_cells):
            for _j in range(n_cells):
                if 0 < dist_matrix[_i, _j] < thres_dist:
                    edge_nums[cluster[_i], cluster[_j]] += 1

        dist_matrices[roi]['edge_nums'] = edge_nums
    return dist_matrices


def _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k=8):
    """Gather, for each ROI, a k-nearest-neighbor graph of the cells and the phenotype-pair
    edge counts along its edges."""
    graphs = {}
    n_cluster = None
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        f_cytof_im = "{}.pkl".format(roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        graph = skgraph(np.array(df_sub.loc[:, ['coordinate_x', 'coordinate_y']]), n_neighbors=k, mode='distance')
        I, J, V = sp.find(graph)  # edge list: I -> J with distance V

        graphs[roi] = {}
        graphs[roi]['I'] = I
        graphs[roi]['J'] = J
        graphs[roi]['V'] = V
        graphs[roi]['graph'] = graph

        # avoid shadowing the module-level phenograph import
        pheno = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = pheno['clusters'].values

        if n_cluster is None:  # first available ROI defines the number of clusters
            n_cluster = pheno['num_community']

        # phenotype-pair counts along the graph edges
        edge_nums = np.zeros((n_cluster, n_cluster))
        for _i, _j in zip(I, J):
            edge_nums[cluster[_i], cluster[_j]] += 1
        graphs[roi]['edge_nums'] = edge_nums

        # phenotype-pair counts expected under independence
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        graphs[roi]['expected_percentage'] = expected_percentage
        graphs[roi]['num_cell'] = len(df_sub)
    return graphs


def interaction_analysis(df_slide_roi, outdir, name_pheno, method="distance", k=8, thres_dist=50, level="slide",
                         clustergrid=None):
    """Phenotype-level spatial interaction analysis at slide or ROI level, based either on a
    distance threshold between cell centroids or on a k-nearest-neighbor graph."""
    assert method in ["distance", "graph"], "Method can be either 'distance' or 'graph'!"

    if method == "distance":
        info = _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist)
    else:
        info = _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k)

    interacts = {}
    if level == "slide":
        # aggregate the ROI-level counts of each slide
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in info:
                    continue
                if slide not in info:
                    info[slide] = {}
                    info[slide]['edge_nums'] = info[roi]['edge_nums']
                    info[slide]['expected_percentage'] = info[roi]['expected_percentage']
                    info[slide]['num_cell'] = info[roi]['num_cell']
                else:
                    info[slide]['edge_nums'] += info[roi]['edge_nums']
                    info[slide]['expected_percentage'] += info[roi]['expected_percentage']
                    info[slide]['num_cell'] += info[roi]['num_cell']
                info.pop(roi)

    for key, item in info.items():
        edge_percentage = item['edge_nums'] / np.sum(item['edge_nums'])
        expected_percentage = item['expected_percentage'] / item['num_cell'] ** 2

        interact_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 yields NaN; map it to the "no enrichment" value log10(1 + 0.1)
        interact_norm[np.isnan(interact_norm)] = np.log10(1 + 0.1)
        interacts[key] = interact_norm

    for f_key, interact in interacts.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1),
                                         cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=np.arange(interact.shape[0]),
                                         yticklabels=np.arange(interact.shape[0]),
                                         figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        # re-plot with rows and columns in the clustered order
        plt.figure()
        sns.clustermap(interact[clustergrid.dendrogram_row.reordered_ind, :]
                       [:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=clustergrid.dendrogram_row.reordered_ind,
                       yticklabels=clustergrid.dendrogram_row.reordered_ind,
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()

    return interacts, clustergrid
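

# Usage sketch for interaction_analysis (illustrative only; name_pheno must match a
# key of the `phenograph` attribute stored on the saved CytofImage instances):
#
#     interacts, grid = interaction_analysis(df_slide_roi, outdir="./output",
#                                            name_pheno="all_75normed_30",
#                                            method="graph", k=8, level="slide")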