"""
app.py
Ronak Ramachandran
"""
import gradio as gr
import PIL
import numpy as np
import scipy
from scipy.stats import gaussian_kde
from scipy.optimize import curve_fit
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
import matplotlib as mpl
import matplotlib.pyplot as plt
import copy
# Load the tab-separated gene table from disk (path is relative to the
# working directory the app is started from).
df = pd.read_csv(
'./gene_tpm_brain_cerebellar_hemisphere_log2minus1NEW.txt', sep='\t')
# Re-shape for plotting: index by the 'Description' column, drop the 'id' and
# 'Name' columns, then transpose so every 'Description' value becomes a column.
# The row index is reset to 0..n-1.
gene_table = df.set_index('Description').drop(
columns=['id', 'Name']).T.reset_index(drop=True)
# ===============================================================================================
# ===============================================================================================
# ===============================================================================================
def plot_hist_gauss(col, ax=None, orientation='vertical', label=''):
    """
    Plots a density histogram of a column and overlays a fitted 1D Gaussian.

    The Gaussian is fitted with ``curve_fit`` to the bin centers and heights
    of ``np.histogram(col, density=True)``.

    Parameters
    ----------
    col : pandas Series
        The values to histogram (plotted through ``col.plot.hist``).
    ax : matplotlib axes object, default None
        Axes to draw on. When None, pandas supplies the axes and the
        figure is displayed with ``plt.show()`` before returning.
    orientation : str, default 'vertical'
        Histogram orientation. With 'horizontal' the fitted curve and the
        mean line are drawn with x and y swapped; any other value is
        drawn as vertical.
    label : str, default ''
        Currently unused by the plotting code.

    Returns
    -------
    popt : numpy array
        The optimal ``(A, mu, sigma)`` found by ``curve_fit``.
    """
    show = ax is None
    ax = col.plot.hist(orientation=orientation, density=True,
                       alpha=0.2, ax=ax, subplots=False)

    hist, bin_edges = np.histogram(col, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    def gauss(x, A, mu, sigma):
        return A * np.exp(-(x - mu)**2 / (2. * sigma**2))

    p0 = [1, 5, 1]  # initial guess for (A, mu, sigma)
    popt, _ = curve_fit(gauss, bin_centers, hist, p0=p0)
    mu = popt[1]

    granularity = 100
    x = np.linspace(col.min(), col.max(), granularity)
    fitted = gauss(x, *popt)

    # Draw exactly one orientation; the axis limits are read after the curve
    # is plotted so the mean line spans the final extent of the axes.
    if orientation == 'horizontal':
        ax.plot(fitted, x, c='C0', label='Fitted data')
        ax.hlines(mu, *ax.get_xlim(), colors='C3', label='Fitted mean')
    else:
        ax.plot(x, fitted, c='C0', label='Fitted data')
        ax.vlines(mu, *ax.get_ylim(), colors='C3', label='Fitted mean')

    if show:
        plt.show()
    # Always return the fit: callers that pass their own axes unpack it.
    return popt
def plot_gene(gene, ax=None, orientation='vertical'):
    """
    Plots the histogram and fitted Gaussian of one column of ``gene_table``.

    Parameters
    ----------
    gene : str
        Column name in the global ``gene_table``; also passed on as the label.
    ax : matplotlib axes object, default None
        Forwarded unchanged to ``plot_hist_gauss``.
    orientation : str, default 'vertical'
        Forwarded unchanged to ``plot_hist_gauss``.
    """
    column = gene_table[gene]
    plot_hist_gauss(column, ax=ax, orientation=orientation, label=gene)
# ===============================================================================================
# ===============================================================================================
# ===============================================================================================
def plot_genes(x_gene=None, y_gene=None, ax=None, mode='raw', gene_table=gene_table):
    """
    Produces a scatterplot of the TPM (Transcripts Per Million) of two genes,
    and fits data to bivariate Gaussian which is also plotted.

    The Gaussian is fitted with ``curve_fit`` to a ``gaussian_kde`` density
    estimate of the two columns, evaluated on a regular grid spanning the
    range of the data.

    Parameters
    ----------
    x_gene : str, default None
        The common name of the gene to be plotted along the x-axis.
    y_gene : str, default None
        The common name of the gene to be plotted along the y-axis.
        If either gene is None, a warning is printed and random test data
        (columns 'x' and 'y') is plotted instead.
    ax : matplotlib axes object, default None
        An axes of the current figure. When None, a new axes is created and
        the figure is displayed with ``plt.show()`` before returning.
    mode : str, default 'raw'
        The mode of plotting:
        - 'raw' : plot data as is
        - 'norm' : normalize and recenter before plotting
        For any other value nothing is drawn and ``plotted_data`` is the
        whole ``gene_table``.
    gene_table : pandas DataFrame, default global gene_table
        A table containing the two genes to be plotted as columns

    Returns
    -------
    plotted_data : pandas DataFrame
        The two columns of data that were actually plotted
    A : float
        Amplitude of optimal bivariate Gaussian
    x0 : float
        x mean of optimal bivariate Gaussian
    y0 : float
        y mean of optimal bivariate Gaussian
    sigma_x : float
        Standard deviation along x axis of optimal bivariate Gaussian
    sigma_y : float
        Standard deviation along y axis of optimal bivariate Gaussian
    rho : float
        Pearson correlation coefficient of optimal bivariate Gaussian
    z_offset : float
        Additive offset of optimal bivariate Gaussian
    """
    show = ax is None
    if ax is None:
        ax = plt.axes()
        # NOTE(review): equal aspect is applied only to axes created here;
        # confirm that caller-supplied axes should keep their own aspect.
        ax.set_aspect('equal', adjustable='box')

    if x_gene is not None and y_gene is not None:
        two_cols = gene_table.loc[:, [x_gene, y_gene]]
    else:  # testing
        print('WARNING: plot_genes requires two gene names as input. '
              'You have omitted at least one, so random test data will '
              'be plotted instead.')
        x_gene, y_gene = 'x', 'y'
        test_dist = np.random.default_rng().multivariate_normal(
            mean=[100, 200], cov=[[1, 0.9], [0.9, np.sqrt(3)]], size=(1000))
        two_cols = pd.DataFrame(data=test_dist, columns=[x_gene, y_gene])

    # Mean and density ---------------------------------------------------------
    mean = two_cols.mean()
    data_for_kde = two_cols.values.T
    density_estimator = gaussian_kde(data_for_kde)
    z = density_estimator(data_for_kde)  # per-point density, used as color

    # Fit to 2D Gaussian =======================================================
    def bivariate_Gaussian(xy, A, x0, y0, sigma_x, sigma_y, rho, z_offset):
        x, y = xy
        # A should really be divided by (2*np.pi*sigma_x*sigma_y*np.sqrt(1-rho**2))
        a = 1 / (2 * (1 - rho**2) * sigma_x**2)
        b = - rho / ((1 - rho**2) * sigma_x * sigma_y)
        c = 1 / (2 * (1 - rho**2) * sigma_y**2)
        g = z_offset + A * \
            np.exp(-(a * (x - x0)**2 + b * (x - x0) * (y - y0) + c * (y - y0)**2))
        return g.ravel()

    gran = 400  # granularity
    x = np.linspace(two_cols[x_gene].min(), two_cols[x_gene].max(), gran)
    y = np.linspace(two_cols[y_gene].min(), two_cols[y_gene].max(), gran)
    pts = np.transpose(np.dstack(np.meshgrid(x, y)),
                       axes=[2, 0, 1]).reshape(2, -1)

    # `mean` is indexed by gene name, so use explicit positional access.
    p0 = (1, mean.iloc[0], mean.iloc[1], 1, 1, 0, 0)
    popt, _ = curve_fit(bivariate_Gaussian, pts,
                        density_estimator(pts), p0=p0)
    A, x0, y0, sigma_x, sigma_y, rho, z_offset = popt

    cov = np.array(
        [[sigma_x**2, rho * sigma_x * sigma_y],
         [rho * sigma_x * sigma_y, sigma_y**2]])
    # cov is symmetric, so eigh applies; it returns eigenvalues in ascending
    # order, which makes the first ellipse axis the minor one and the second
    # the major one (matching the axis labels used in 'norm' mode).
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # eigvals are variances along ellipse axes, eigvects are direction of axes
    scaled_eigvects = np.sqrt(eigenvalues) * eigenvectors

    # Plots ====================================================================
    plotted_data = gene_table
    if mode == 'raw':
        # --- Plot Data ---
        two_cols.plot.scatter(x=x_gene, y=y_gene, c=z,
                              s=2, ylabel=y_gene, ax=ax)
        # --- Plot Fitted Gaussian ---
        pts = pts.reshape(2, gran, gran)
        data_fitted = bivariate_Gaussian(pts, *popt).reshape(gran, gran)
        # contour
        ax.contour(pts[0], pts[1], data_fitted, 8,
                   cmap='viridis', zorder=0, alpha=.5)
        # center
        ax.plot(x0, y0, 'rx')
        # gene axes
        ax.quiver([x0, x0], [y0, y0], [1, 0], [0, 1], angles='xy', scale_units='xy',
                  width=0.005, scale=1, color=['magenta', 'violet'], alpha=0.35)
        # ellipse axes
        ax.quiver([x0, x0], [y0, y0], *scaled_eigvects, angles='xy', scale_units='xy',
                  width=0.005, scale=1, color=['red', 'firebrick'], alpha=0.35)
        plotted_data = two_cols
    # --------------------------------------------------------------------------
    elif mode == 'norm':
        # Inverse of the scaled eigenvector matrix: maps recentered data into
        # coordinates along the ellipse axes, in units of one standard deviation.
        inv_cov = np.linalg.inv(scaled_eigvects)
        recentered_data = two_cols.values - [x0, y0]
        normed_data = recentered_data @ inv_cov.T
        normed_two_cols = pd.DataFrame(
            data=normed_data, columns=[x_gene, y_gene])
        # --- Plot Data ---
        normed_two_cols.plot.scatter(x=x_gene, y=y_gene, c=z, s=2, ax=ax,
                                     xlabel='minor axis',
                                     ylabel='major axis')
        # --- Plot Fitted Gaussian ---
        x = np.linspace(normed_two_cols[x_gene].min(),
                        normed_two_cols[x_gene].max(), gran)
        y = np.linspace(normed_two_cols[y_gene].min(),
                        normed_two_cols[y_gene].max(), gran)
        pts = np.transpose(np.dstack(np.meshgrid(x, y)), axes=[2, 0, 1])
        pts = pts.reshape(2, gran, gran)
        # In normalized coordinates the fit is a centered, unit, uncorrelated
        # Gaussian with the same amplitude and offset.
        data_fitted = bivariate_Gaussian(pts, A, 0, 0, 1, 1, 0, z_offset)
        data_fitted = data_fitted.reshape(gran, gran)
        # contour
        ax.contour(pts[0], pts[1], data_fitted, 8,
                   cmap='viridis', zorder=0, alpha=.5)
        # center
        ax.plot(0, 0, 'rx')
        # gene axes
        ax.quiver([0, 0], [0, 0], *inv_cov, angles='xy', scale_units='xy',
                  width=0.005, scale=1, color=['magenta', 'violet'], alpha=0.35)
        # ellipse axes
        ax.quiver([0, 0], [0, 0], [1, 0], [0, 1], angles='xy', scale_units='xy',
                  width=0.005, scale=1, color=['red', 'firebrick'], alpha=0.35)
        plotted_data = normed_two_cols
    # ==========================================================================

    if show:
        plt.show()
    # Always return: callers that pass their own axes read these values.
    return (plotted_data,
            A, x0, y0, sigma_x, sigma_y, rho, z_offset)  # optimal gaussian params
# ===============================================================================================
# ===============================================================================================
# ===============================================================================================
def plot_scatter_hist(x_gene, y_gene, mode='raw'):
    """
    Plots two genes against each other with marginal histograms.

    Creates a new figure with a main axes for the ``plot_genes`` scatter
    and contour plot, plus two inset axes (above and to the right) holding
    the ``plot_hist_gauss`` histogram of each plotted column. The fitted
    histogram means are drawn as lines across the main axes.

    Parameters
    ----------
    x_gene : str
        Gene plotted along the x-axis; forwarded to ``plot_genes``.
    y_gene : str
        Gene plotted along the y-axis; forwarded to ``plot_genes``.
    mode : str, default 'raw'
        Plotting mode forwarded to ``plot_genes``.
    """
    fig = plt.figure(layout='constrained')
    ax = fig.add_gridspec(top=0.75, right=0.75).subplots()
    # ax.set_aspect('equal', adjustable='box')  # ax.set(aspect=1)

    # Marginal axes share the matching axis of the main plot.
    ax_histx = ax.inset_axes([0, 1.05, 1, 0.25], sharex=ax)
    ax_histy = ax.inset_axes([1.05, 0, 0.25, 1], sharey=ax)
    ax_histx.tick_params(axis="x", labelbottom=False)
    ax_histy.tick_params(axis="y", labelleft=False)

    plot_result = plot_genes(x_gene, y_gene, ax=ax, mode=mode)
    plotted_data = plot_result[0]

    # Only the fitted means (second fit parameter) are needed here.
    x_mu = plot_hist_gauss(plotted_data[x_gene], ax=ax_histx)[1]
    y_mu = plot_hist_gauss(plotted_data[y_gene], ax=ax_histy,
                           orientation='horizontal')[1]

    ax.vlines(x_mu, *ax.get_ylim(),
              label=f'{x_gene} mean', colors='C3', zorder=0)
    ax.hlines(y_mu, *ax.get_xlim(),
              label=f'{y_gene} mean', colors='C3', zorder=0)
def create_correct_gene_plot(genes, mode):
    """
    Plots one or two genes and returns the rendered figure as an image.

    Parameters
    ----------
    genes : list of str
        Names of the selected genes. One gene gives a histogram with a
        fitted Gaussian; two genes give the scatter plot with marginal
        histograms.
    mode : bool
        For two genes, a truthy value selects the 'norm' plotting mode
        and a falsy value selects 'raw'. Ignored for a single gene.

    Returns
    -------
    image : PIL.Image.Image
        The rendered figure in RGB mode.

    Raises
    ------
    gr.Error
        If no gene, or more than two genes, are selected.
    """
    # Local import: a bare `import PIL` does not guarantee that the
    # `PIL.Image` submodule is loaded.
    from PIL import Image

    if not genes:
        raise gr.Error("Please select at least one gene to plot.")
    if len(genes) > 2:
        raise gr.Error("Cannot plot more than two genes at a time.")

    if len(genes) == 1:
        # Pass explicit axes so the helper does not call plt.show().
        fig, ax = plt.subplots()
        plot_gene(genes[0], ax=ax)
    else:
        plot_scatter_hist(*genes, mode='norm' if mode else 'raw')
        fig = plt.gcf()  # the figure plot_scatter_hist just created

    try:
        # The canvas must be drawn before its pixel buffer is read.
        fig.canvas.draw()
        rgba = np.asarray(fig.canvas.buffer_rgba())
        # convert() copies the pixels, so the figure can be closed afterwards.
        return Image.fromarray(rgba).convert('RGB')
    finally:
        # Release the figure so repeated requests do not accumulate figures.
        plt.close(fig)
# Gradio UI definition.
# NOTE(review): this call looks truncated. The keyword arguments below read
# like the arguments of two input components (a multiselect gene picker and a
# normalize toggle), but the component constructors and the `fn=` / `inputs=` /
# `outputs=` arguments of gr.Interface are not present. Restore them before
# running; presumably `fn` is create_correct_gene_plot -- confirm.
demo = gr.Interface(
gene_table.columns, value=["APP", "PSENEN"], multiselect=True, label="Genes", info="Select one or two genes to plot."
info="Recenter and normalize the Gaussian for two genes."),
# examples=[
# [2, "cat", ["Japan", "Pakistan"], "park", ["ate", "swam"], True],
# [4, "dog", ["Japan"], "zoo", ["ate", "swam"], False],
# [10, "bird", ["USA", "Pakistan"], "road", ["ran"], False],
# [8, "cat", ["Pakistan"], "zoo", ["ate"], True],
# ]
if __name__ == "__main__":