#!/usr/bin/env python
# coding: utf-8

# # Spatial clustering and denoising expressions
# 
# Spatial clustering, which shares an analogy with single-cell clustering, has expanded the scope of tissue physiology studies from cell-centroid to structure-centroid with spatially resolved transcriptomics (SRT) data.
# 
# Here, we presented four spatial clustering methods in OmicVerse.
# 
# We made four improvements in integrating the `GraphST`, `BINARY`, `CAST` and `STAGATE` algorithms in OmicVerse:
# - We removed the preprocessing that comes with `GraphST` and used the preprocessing consistent with all SRTs in OmicVerse
# - We optimised the dimensional display of `GraphST`, and PCA is considered a self-contained computational step.
# - We implemented `mclust` using Python, removing the R language dependency.
# - We provided a unified interface, `ov.space.clusters`, so that the user can run one or several of these methods through a single function call.
# 
# If you found this tutorial helpful, please cite `GraphST`,`BINARY`,`CAST` and `STAGATE` and `OmicVerse`:
# 
# - Long, Y., Ang, K.S., Li, M. et al. Spatially informed clustering, integration, and deconvolution of spatial transcriptomics with GraphST. Nat Commun 14, 1155 (2023). https://doi.org/10.1038/s41467-023-36796-3
# - Lin S, Cui Y, Zhao F, Yang Z, Song J, Yao J, et al. Complete spatially resolved gene expression is not necessary for identifying spatial domains. Cell Genomics. 2024;4:100565.
# - Tang, Z., Luo, S., Zeng, H. et al. Search and match across spatial omics samples at single-cell resolution. Nat Methods 21, 1818–1829 (2024). https://doi.org/10.1038/s41592-024-02410-7
# - Dong, K., Zhang, S. Deciphering spatial domains from spatially resolved transcriptomics with an adaptive graph attention auto-encoder. Nat Commun 13, 1739 (2022). https://doi.org/10.1038/s41467-022-29439-6
# 
# 

# In[1]:


import omicverse as ov
#print(f"omicverse version: {ov.__version__}")  # uncomment to report the installed omicverse version
import scanpy as sc
#print(f"scanpy version: {sc.__version__}")  # uncomment to report the installed scanpy version
# Initialise OmicVerse's plot settings before any figure is drawn.
ov.plot_set()


# ## Preprocess data
# 
# Here we present our re-analysis of sample 151676 of the dorsolateral prefrontal cortex (DLPFC) dataset. Maynard et al. manually annotated the DLPFC layers and white matter (WM) based on morphological features and gene markers.
# 
# This tutorial demonstrates how to identify spatial domains on 10x Visium data using STAGATE. The processed data are available at https://github.com/LieberInstitute/spatialLIBD. We downloaded the manual annotation from the spatialLIBD package and provided at https://drive.google.com/drive/folders/10lhz5VY7YfvHrtV40MwaqLmWz56U9eBP?usp=sharing.

# In[2]:


# Load the 10x Visium sample from the local `data` directory.
adata = sc.read_visium(
    path='data',
    count_file='151676_filtered_feature_bc_matrix.h5',
)
# De-duplicate gene names so that every variable can be addressed unambiguously.
adata.var_names_make_unique()


# <div class="admonition warning">
#   <p class="admonition-title">Note</p>
#   <p>
#     In omicverse versions greater than `1.6.0`, we introduced `prost`, a module dedicated to computing spatially variable genes (SVGs), to replace scanpy's HVGs. If you want to use scanpy's HVGs, you can set `mode='scanpy'` in `ov.space.svg` or use the following code.
#   </p>
# </div>
# 
# ```python
# #adata=ov.pp.preprocess(adata,mode='shiftlog|pearson',n_HVGs=3000,target_sum=1e4)
# #adata.raw = adata
# #adata = adata[:, adata.var.highly_variable_features]
# ```

# In[3]:


# Compute per-gene / per-spot QC metrics in place; this provides
# `adata.var['total_counts']`, which the filter below relies on.
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Keep only genes with more than 100 total counts.
expressed_genes = adata.var['total_counts'] > 100
adata = adata[:, expressed_genes]

# Select 3000 spatially variable genes with the `prost` mode.
adata = ov.space.svg(
    adata,
    mode='prost',
    n_svgs=3000,
    target_sum=1e4,
    platform="visium",
)
adata


# In[5]:


# Checkpoint the preprocessed object so the SVG step need not be repeated.
adata.write(
    'data/cluster_svg.h5ad',
    compression='gzip',
)


# In[2]:


# Reload the checkpoint written by the previous cell.
adata = ov.read(
    'data/cluster_svg.h5ad',
    compression='gzip',
)


# (Optional) We read the ground truth area of our spatial data
# 
# This step is not mandatory. In this tutorial it only serves to demonstrate the accuracy of our clustering; in your own tasks there is often no ground truth available.

# In[3]:


# Read the manual layer annotation (tab-separated, no header row; the first
# column is used as the index) and attach it to the spots.
import os
import pandas as pd

Ann_df = pd.read_csv(
    os.path.join('data', '151676_truth.txt'),
    sep='\t',
    header=None,
    index_col=0,
)
Ann_df.columns = ['Ground Truth']

# Look the labels up by spot name so they line up with `adata.obs`.
adata.obs['Ground Truth'] = Ann_df.loc[adata.obs_names, 'Ground Truth']

sc.pl.spatial(
    adata,
    img_key="hires",
    color=["Ground Truth"],
)


# ## Method1: GraphST
# 
# GraphST was rated as one of the best spatial clustering algorithms in Nature Methods (2024.04), so we first call GraphST for spatial domain identification in OmicVerse.

# In[4]:


# Per-method options, keyed by method name; only GraphST is configured here.
methods_kwargs = {
    'GraphST': {
        'device': 'cuda:0',
        'n_pcs': 30,
    },
}

adata = ov.space.clusters(
    adata,
    methods=['GraphST'],
    methods_kwargs=methods_kwargs,
    lognorm=1e4,
)


# In[11]:


# Cluster the GraphST representation with the Python mclust implementation
# (10 components, 'EEV' model); the labels are read back from the 'mclust' key.
ov.utils.cluster(
    adata,
    use_rep='graphst|original|X_pca',
    method='mclust',
    n_components=10,
    modelNames='EEV',
    random_state=112,
)

# Store the refined labels (radius=50) under their own column, as categories.
adata.obs['mclust_GraphST'] = ov.utils.refine_label(adata, radius=50, key='mclust')
adata.obs['mclust_GraphST'] = adata.obs['mclust_GraphST'].astype('category')


# In[12]:


# Merge the refined GraphST clusters at threshold 0.2 and plot the result.
# NOTE(review): the next cell plots 'mclust_GraphST_tree', presumably written by this call.
res = ov.space.merge_cluster(
    adata,
    groupby='mclust_GraphST',
    use_rep='graphst|original|X_pca',
    threshold=0.2,
    plot=True,
)


# In[13]:


# Refined labels, merged labels, raw mclust labels and the manual annotation.
sc.pl.spatial(
    adata,
    color=['mclust_GraphST', 'mclust_GraphST_tree', 'mclust', 'Ground Truth'],
)


# We can also use `mclust_R` to cluster the spatial domains, but this method requires `rpy2` to be installed first.
# 
# The use of the mclust algorithm requires the rpy2 package and the mclust package. See https://pypi.org/project/rpy2/ and https://cran.r-project.org/web/packages/mclust/index.html for detail.

# In[14]:


# Same GraphST representation, clustered with the R-backed mclust this time;
# the labels are read back from the 'mclust_R' key.
ov.utils.cluster(
    adata,
    use_rep='graphst|original|X_pca',
    method='mclust_R',
    n_components=10,
    random_state=42,
)

# Refine the labels (radius=30) and store them as categories.
adata.obs['mclust_R_GraphST'] = ov.utils.refine_label(adata, radius=30, key='mclust_R')
adata.obs['mclust_R_GraphST'] = adata.obs['mclust_R_GraphST'].astype('category')

# Merge the refined clusters at threshold 0.2 and plot the result.
res = ov.space.merge_cluster(
    adata,
    groupby='mclust_R_GraphST',
    use_rep='graphst|original|X_pca',
    threshold=0.2,
    plot=True,
)


# In[15]:


# Refined labels, merged labels, raw R-mclust labels and the manual annotation.
# The raw panel uses 'mclust_R' (the key this section's clustering is refined
# from) rather than 'mclust', which holds the Python mclust result.
sc.pl.spatial(
    adata,
    color=['mclust_R_GraphST', 'mclust_R_GraphST_tree', 'mclust_R', 'Ground Truth'],
)


# ## Method2: BINARY
# 
# BINARY outperforms existing methods across various SRT data types while using significantly less input information.
# 
# If your data is very large, or very sparse, I believe BINARY would be a great choice.

# In[3]:


# Per-method options for BINARY; the result is stored under 'BINARY'
# (`key_added`), which the following cells use as `use_rep`.
methods_kwargs = {
    'BINARY': {
        'use_method': 'KNN',
        'cutoff': 6,
        'obs_key': 'BINARY_sample',
        'use_list': None,
        'pos_weight': 10,
        'device': 'cuda:0',
        'hidden_dims': [512, 30],
        'n_epochs': 1000,
        'lr': 0.001,
        'key_added': 'BINARY',
        'gradient_clipping': 5,
        'weight_decay': 0.0001,
        'verbose': True,
        'random_seed': 0,
        'lognorm': 1e4,
        'n_top_genes': 2000,
    },
}

adata = ov.space.clusters(
    adata,
    methods=['BINARY'],
    methods_kwargs=methods_kwargs,
)


# If you want to use R's `mclust`, you can use `ov.utils.cluster` with `method='mclust_R'`.
# 
# Note that `rpy2` and the R package `mclust` must be installed first.

# In[4]:


# Cluster the BINARY representation with the R-backed mclust; the labels are
# read back from the 'mclust_R' key.
ov.utils.cluster(
    adata,
    use_rep='BINARY',
    method='mclust_R',
    n_components=10,
    random_state=42,
)

# Refine the labels (radius=30) and store them as categories.
adata.obs['mclust_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust_R')
adata.obs['mclust_BINARY'] = adata.obs['mclust_BINARY'].astype('category')


# In[5]:


# Merge the refined BINARY clusters at threshold 0.01 and plot the result.
res = ov.space.merge_cluster(
    adata,
    groupby='mclust_BINARY',
    use_rep='BINARY',
    threshold=0.01,
    plot=True,
)


# In[6]:


# Refined labels, merged labels, raw R-mclust labels and the manual annotation.
# The raw panel uses 'mclust_R' (the key 'mclust_BINARY' was refined from);
# 'mclust' at this point would still hold labels from an earlier method.
sc.pl.spatial(
    adata,
    color=['mclust_BINARY', 'mclust_BINARY_tree', 'mclust_R', 'Ground Truth'],
)


# In[10]:


# Cluster the BINARY representation again, this time with the Python mclust
# implementation ('EEV' model); the labels are read back from the 'mclust' key.
ov.utils.cluster(
    adata,
    use_rep='BINARY',
    method='mclust',
    n_components=10,
    modelNames='EEV',
    random_state=42,
)

# Refine the labels (radius=30) and store them as categories.
adata.obs['mclustpy_BINARY'] = ov.utils.refine_label(adata, radius=30, key='mclust')
adata.obs['mclustpy_BINARY'] = adata.obs['mclustpy_BINARY'].astype('category')


# In[13]:


# Make sure the grouping column is categorical before merging.
adata.obs['mclustpy_BINARY'] = adata.obs['mclustpy_BINARY'].astype('category')

# Merge the refined clusters at threshold 0.013 and plot the result.
res = ov.space.merge_cluster(
    adata,
    groupby='mclustpy_BINARY',
    use_rep='BINARY',
    threshold=0.013,
    plot=True,
)


# In[14]:


# Refined labels, merged labels, raw Python-mclust labels and the manual annotation.
sc.pl.spatial(
    adata,
    color=['mclustpy_BINARY', 'mclustpy_BINARY_tree', 'mclust', 'Ground Truth'],
)


# ## Method3: STAGATE
# 
# STAGATE is designed for spatial clustering and expression denoising of spatially resolved transcriptomics (ST) data.
# 
# STAGATE learns low-dimensional latent embeddings with both spatial information and gene expressions via a graph attention auto-encoder. The method adopts an attention mechanism in the middle layer of the encoder and decoder, which adaptively learns the edge weights of spatial neighbor networks, and further uses them to update the spot representation by collectively aggregating information from its neighbors. The latent embeddings and the reconstructed expression profiles can be used to downstream tasks such as spatial domain identification, visualization, spatial trajectory inference, data denoising and 3D expression domain extraction.
# 
# Dong, Kangning, and Shihua Zhang. “Deciphering spatial domains from spatially resolved transcriptomics with an adaptive graph attention auto-encoder.” Nature Communications 13.1 (2022): 1-12.
# 
# 
# Here, we used `ov.space.pySTAGATE` to construct a STAGATE object to train the model. 
# 

# In[12]:


# Per-method options for STAGATE.
methods_kwargs = {
    'STAGATE': {
        'num_batch_x': 3,
        'num_batch_y': 2,
        'spatial_key': ['X', 'Y'],
        'rad_cutoff': 200,
        'num_epoch': 1000,
        'lr': 0.001,
        'weight_decay': 1e-4,
        'hidden_dims': [512, 30],
        'device': 'cuda:0',
        #'n_top_genes':2000,
    },
}

adata = ov.space.clusters(
    adata,
    methods=['STAGATE'],
    methods_kwargs=methods_kwargs,
)


# In[36]:


# Cluster the STAGATE representation with the R-backed mclust; the labels are
# read back from the 'mclust_R' key.
ov.utils.cluster(
    adata,
    use_rep='STAGATE',
    method='mclust_R',
    n_components=10,
    random_state=112,
)

# Refine the labels (radius=30) and store them as categories.
adata.obs['mclust_R_STAGATE'] = ov.utils.refine_label(adata, radius=30, key='mclust_R')
adata.obs['mclust_R_STAGATE'] = adata.obs['mclust_R_STAGATE'].astype('category')

# Merge the refined clusters at threshold 0.005 and plot the result.
res = ov.space.merge_cluster(
    adata,
    groupby='mclust_R_STAGATE',
    use_rep='STAGATE',
    threshold=0.005,
    plot=True,
)


# In[37]:


# Refined labels, merged labels, raw R-mclust labels and the manual annotation.
sc.pl.spatial(
    adata,
    color=['mclust_R_STAGATE', 'mclust_R_STAGATE_tree', 'mclust_R', 'Ground Truth'],
)


# ### Denoising

# In[52]:


# Show the five genes with the highest 'PI' value.
# NOTE(review): 'PI' is presumably added to `adata.var` by the SVG step — confirm.
adata.var.sort_values(by='PI', ascending=False).head(5)


# In[53]:


import matplotlib.pyplot as plt

# Gene to compare before and after denoising.
plot_gene = 'MBP'

fig, axs = plt.subplots(1, 2, figsize=(8, 4))

# Options shared by both panels; the colour scale is capped at the 99th percentile.
_panel_kwargs = dict(img_key="hires", color=plot_gene, show=False, vmax='p99')

# Left: expression from the default matrix. Right: the 'STAGATE_ReX' layer.
sc.pl.spatial(adata, ax=axs[0], title=f'RAW_{plot_gene}', **_panel_kwargs)
sc.pl.spatial(adata, ax=axs[1], title=f'STAGATE_{plot_gene}', layer='STAGATE_ReX', **_panel_kwargs)


# ## Method4: CAST
# 
# CAST would be a great algorithm if your spatial transcriptome is at single-cell resolution and in multiple slices.

# In[38]:


# Per-method options for CAST.
methods_kwargs = {
    'CAST': {
        'output_path_t': 'result/CAST_gas/output',
        'device': 'cuda:0',
        'gpu_t': 0,
    },
}

adata = ov.space.clusters(
    adata,
    methods=['CAST'],
    methods_kwargs=methods_kwargs,
)


# In[39]:


# Cluster the CAST representation ('X_cast') with the Python mclust
# implementation; the labels are read back from the 'mclust' key.
ov.utils.cluster(
    adata,
    use_rep='X_cast',
    method='mclust',
    n_components=10,
    modelNames='EEV',
    random_state=42,
)

# Refine the labels (radius=50) and store them as categories.
adata.obs['mclust_CAST'] = ov.utils.refine_label(adata, radius=50, key='mclust')
adata.obs['mclust_CAST'] = adata.obs['mclust_CAST'].astype('category')


# In[40]:


# Merge the refined CAST clusters at threshold 0.1 and plot the result.
res = ov.space.merge_cluster(
    adata,
    groupby='mclust_CAST',
    use_rep='X_cast',
    threshold=0.1,
    plot=True,
)


# In[41]:


# Refined labels, merged labels, raw Python-mclust labels and the manual annotation.
sc.pl.spatial(
    adata,
    color=['mclust_CAST', 'mclust_CAST_tree', 'mclust', 'Ground Truth'],
)


# In[42]:


# Display the object's summary after all methods have run (a bare expression:
# it only renders output in an interactive session such as a notebook).
adata


# ## Evaluate cluster
# 
# We use ARI to evaluate the scoring of our clusters against the true values
# 
# While it appears that STAGATE works best, note that this is only on this dataset.
# - If your data is spot-level resolution, GraphST, BINARY and STAGATE would be good algorithms to use
# - BINARY and CAST would be good algorithms if your data is NanoString or other single-cell resolution

# In[50]:


from sklearn.metrics.cluster import adjusted_rand_score

# Cluster-label columns to score against the manual annotation.
cluster_keys = [
    'mclust_GraphST',
    'mclust_R_GraphST',
    'mclust_R_STAGATE',
    'mclust_BINARY',
    'mclustpy_BINARY',
    'mclust_CAST',
]

# Drop only the spots that lack a label in one of the columns being compared.
# A bare `dropna()` would also discard spots with a missing value in any
# unrelated `obs` column, silently shrinking the evaluated set.
obs_df = adata.obs.dropna(subset=['Ground Truth'] + cluster_keys)

# Every method is scored on the same set of spots, so the ARIs are comparable.
for key in cluster_keys:
    ARI = adjusted_rand_score(obs_df[key], obs_df['Ground Truth'])
    # The printed label is the scored column's name, so output and data match.
    print(f'{key}: Adjusted rand index = {ARI:.2f}')


# In[ ]: