import gradio as gr
import PIL
import numpy as np
import scipy
from scipy.stats import gaussian_kde
from scipy.optimize import curve_fit
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KernelDensity
import matplotlib as mpl
import matplotlib.pyplot as plt
import copy

# Raw expression table, read once at import time from a tab-separated file.
df = pd.read_csv(
    './gene_tpm_brain_cerebellar_hemisphere_log2minus1NEW.txt', sep='\t')
# Re-key by the 'Description' column and transpose, so that every
# 'Description' value becomes a column of `gene_table` (presumably one row per
# sample -- confirm against the input file).
gene_table = df.set_index('Description').drop(
    columns=['id', 'Name']).T.reset_index(drop=True)


# ===============================================================================================
# ===============================================================================================
# ===============================================================================================


def _gauss(x, A, mu, sigma):
    """Evaluate an unnormalised 1D Gaussian: A * exp(-(x - mu)^2 / (2 sigma^2))."""
    return A * np.exp(-(x - mu)**2 / (2. * sigma**2))


def plot_hist_gauss(col, ax=None, orientation='vertical', label=''):
    """
    Plot a density histogram of `col` and overlay a fitted 1D Gaussian.

    Parameters
    ----------
    col : pandas Series
        The values to histogram and fit.
    ax : matplotlib axes object, default None
        Axes to draw on. When None, pandas picks the axes and the figure is
        shown with `plt.show()` before returning.
    orientation : str, default 'vertical'
        'vertical' or 'horizontal'; forwarded to the pandas histogram and used
        to orient the fitted curve and the mean line.
    label : str, default ''
        Label for the value axis (x axis if vertical, y axis if horizontal).

    Returns
    -------
    popt : numpy array
        The fitted parameters `(A, mu, sigma)` of the Gaussian.
    """
    show = ax is None

    ax = col.plot.hist(orientation=orientation, density=True, alpha=0.2,
                       ax=ax, subplots=False)

    hist, bin_edges = np.histogram(col, density=True)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # Seed the optimiser from the data itself. A fixed guess (previously
    # [1, 5, 1]) gives a flat, zero-gradient model whenever the data lies far
    # from it, and the fit then cannot converge.
    spread = col.std()
    if not np.isfinite(spread) or spread <= 0:
        spread = 1.0
    p0 = [hist.max(), col.mean(), spread]
    popt, _ = curve_fit(_gauss, bin_centers, hist, p0=p0)
    mu = popt[1]

    granularity = 100
    x = np.linspace(col.min(), col.max(), granularity)

    if orientation == 'horizontal':
        ax.plot(_gauss(x, *popt), x, c='C0', label='Fitted data')
        ax.hlines(mu, *ax.get_xlim(), colors='C3', label='Fitted mean')
        ax.set_ylabel(label)
    else:
        ax.plot(x, _gauss(x, *popt), c='C0', label='Fitted data')
        ax.vlines(mu, *ax.get_ylim(), colors='C3', label='Fitted mean')
        ax.set_xlabel(label)

    if show:
        plt.show()

    return popt


def plot_gene(gene, ax=None, orientation='vertical'):
    """
    Plot the histogram and Gaussian fit for one column of `gene_table`.

    Parameters
    ----------
    gene : str
        Column name in the global `gene_table`.
    ax : matplotlib axes object, default None
        Forwarded to `plot_hist_gauss`.
    orientation : str, default 'vertical'
        Forwarded to `plot_hist_gauss`.

    Returns
    -------
    popt : numpy array
        The fitted `(A, mu, sigma)` returned by `plot_hist_gauss`.
    """
    return plot_hist_gauss(gene_table[gene], ax=ax, orientation=orientation,
                           label=gene)


# ===============================================================================================
# ===============================================================================================
# ===============================================================================================


def _bivariate_gaussian(xy, A, x0, y0, sigma_x, sigma_y, rho, z_offset):
    """Evaluate an offset, unnormalised bivariate Gaussian and return it flattened.

    `xy` unpacks into the x and y coordinate arrays. `A` is used as a free
    amplitude; for a true density it would be
    1 / (2*pi*sigma_x*sigma_y*sqrt(1 - rho**2)).
    """
    x, y = xy
    one_minus_rho2 = 1 - rho**2
    a = 1 / (2 * one_minus_rho2 * sigma_x**2)
    b = -rho / (one_minus_rho2 * sigma_x * sigma_y)
    c = 1 / (2 * one_minus_rho2 * sigma_y**2)
    g = z_offset + A * np.exp(
        -(a * (x - x0)**2 + b * (x - x0) * (y - y0) + c * (y - y0)**2))
    return g.ravel()


def _mesh_points(x_values, y_values, gran):
    """Return a (2, gran, gran) array of grid coordinates spanning both ranges."""
    x = np.linspace(x_values.min(), x_values.max(), gran)
    y = np.linspace(y_values.min(), y_values.max(), gran)
    return np.stack(np.meshgrid(x, y))


def plot_genes(x_gene=None, y_gene=None, ax=None, mode='raw',
               gene_table=gene_table):
    """
    Produces a scatterplot of the TPM (Transcripts Per Million) of two genes,
    and fits data to bivariate Gaussian which is also plotted.

    The Gaussian is fitted to a kernel density estimate of the data evaluated
    on a regular grid, not to the raw points directly.

    Parameters
    ----------
    x_gene : str
        The common name of the gene to be plotted along the x-axis.
    y_gene : str
        The common name of the gene to be plotted along the y-axis.
        If either gene name is None, random bivariate-normal test data with
        columns 'x' and 'y' is plotted instead and a warning is printed.
    ax : matplotlib axes object, default None
        An axes of the current figure. When None, new axes are created and
        the figure is shown with `plt.show()` before returning.
    mode : str, default 'raw'
        The mode of plotting:
        - 'raw' : plot data as is
        - 'norm' : normalize and recenter before plotting
    gene_table : pandas DataFrame, default global gene_table
        A table containing the two genes to be plotted as columns

    Returns
    -------
    plotted_data : pandas DataFrame
        The two columns of data that were actually plotted
    A : float
        Amplitude of optimal bivariate Gaussian
    x0 : float
        x mean of optimal bivariate Gaussian
    y0 : float
        y mean of optimal bivariate Gaussian
    sigma_x : float
        Standard deviation along x axis of optimal bivariate Gaussian
    sigma_y : float
        Standard deviation along y axis of optimal bivariate Gaussian
    rho : float
        Pearson correlation coefficient of optimal bivariate Gaussian
    z_offset : float
        Additive offset of optimal bivariate Gaussian

    Raises
    ------
    ValueError
        If `mode` is neither 'raw' nor 'norm'.
    """
    # Reject bad modes before the (expensive) density estimate and fit;
    # previously an unknown mode silently drew nothing and returned the whole
    # gene table.
    if mode not in ('raw', 'norm'):
        raise ValueError(f"mode must be 'raw' or 'norm', got {mode!r}")

    show = ax is None
    if ax is None:
        ax = plt.axes()

    ax.set_aspect('equal', adjustable='box')

    if x_gene is not None and y_gene is not None:
        two_cols = gene_table.loc[:, [x_gene, y_gene]]
    else:  # testing
        print('WARNING: plot_genes requires two gene names as input. '
              'You have omitted at least one, so random test data will '
              'be plotted instead.')
        x_gene, y_gene = 'x', 'y'
        test_dist = np.random.default_rng().multivariate_normal(
            mean=[100, 200], cov=[[1, 0.9], [0.9, np.sqrt(3)]], size=(1000))
        two_cols = pd.DataFrame(data=test_dist, columns=[x_gene, y_gene])

    # Mean and density ---------------------------------------------------------
    mean = two_cols.mean()
    data_for_kde = two_cols.values.T
    density_estimator = gaussian_kde(data_for_kde)
    z = density_estimator(data_for_kde)  # density at each point; colours the scatter

    # Fit to 2D Gaussian =======================================================
    gran = 400  # granularity
    grid = _mesh_points(two_cols[x_gene], two_cols[y_gene], gran)
    flat_grid = grid.reshape(2, -1)

    # .iloc: `mean` is labelled by gene name, so positional access must be
    # explicit (integer-key fallback on a labelled Series is deprecated).
    p0 = (1, mean.iloc[0], mean.iloc[1], 1, 1, 0, 0)
    popt, _ = curve_fit(_bivariate_gaussian, flat_grid,
                        density_estimator(flat_grid), p0=p0)
    A, x0, y0, sigma_x, sigma_y, rho, z_offset = popt

    cov = np.array(
        [[sigma_x**2, rho * sigma_x * sigma_y],
         [rho * sigma_x * sigma_y, sigma_y**2]])
    # eigvals are variances along ellipse axes, eigvects are direction of axes.
    # eigh (symmetric solver) returns eigenvalues in ascending order, so the
    # first column is the minor axis and the second the major axis, matching
    # the axis labels used in 'norm' mode.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    scaled_eigvects = np.sqrt(eigenvalues) * eigenvectors

    # Plots ====================================================================
    if mode == 'raw':
        # --- Plot Data ---
        two_cols.plot.scatter(x=x_gene, y=y_gene, c=z, s=2,
                              ylabel=y_gene, ax=ax)

        # --- Plot Fitted Gaussian ---
        data_fitted = _bivariate_gaussian(grid, *popt).reshape(gran, gran)
        # contour
        ax.contour(grid[0], grid[1], data_fitted, 8,
                   cmap='viridis', zorder=0, alpha=.5)
        # center
        ax.plot(x0, y0, 'rx')
        # gene axes
        ax.quiver([x0, x0], [y0, y0], [1, 0], [0, 1],
                  angles='xy', scale_units='xy', width=0.005, scale=1,
                  color=['magenta', 'violet'], alpha=0.35)
        # ellipse axes (rows of scaled_eigvects are the x and y components)
        ax.quiver([x0, x0], [y0, y0], *scaled_eigvects,
                  angles='xy', scale_units='xy', width=0.005, scale=1,
                  color=['red', 'firebrick'], alpha=0.35)

        plotted_data = two_cols
    # --------------------------------------------------------------------------
    else:  # mode == 'norm'
        # Inverse of the scaled eigenvector matrix: maps the fitted ellipse
        # onto the unit circle.
        whitening = np.linalg.inv(scaled_eigvects)
        recentered_data = two_cols.values - [x0, y0]
        normed_data = recentered_data @ whitening.T
        normed_two_cols = pd.DataFrame(
            data=normed_data, columns=[x_gene, y_gene])

        # --- Plot Data ---
        normed_two_cols.plot.scatter(x=x_gene, y=y_gene, c=z, s=2, ax=ax,
                                     xlabel='minor axis', ylabel='major axis')

        # --- Plot Fitted Gaussian ---
        normed_grid = _mesh_points(
            normed_two_cols[x_gene], normed_two_cols[y_gene], gran)
        # In normalised coordinates the fit is centred with unit widths and
        # zero correlation.
        data_fitted = _bivariate_gaussian(
            normed_grid, A, 0, 0, 1, 1, 0, z_offset).reshape(gran, gran)
        # contour
        ax.contour(normed_grid[0], normed_grid[1], data_fitted, 8,
                   cmap='viridis', zorder=0, alpha=.5)
        # center
        ax.plot(0, 0, 'rx')
        # gene axes (original unit axes expressed in normalised coordinates)
        ax.quiver([0, 0], [0, 0], *whitening,
                  angles='xy', scale_units='xy', width=0.005, scale=1,
                  color=['magenta', 'violet'], alpha=0.35)
        # ellipse axes
        ax.quiver([0, 0], [0, 0], [1, 0], [0, 1],
                  angles='xy', scale_units='xy', width=0.005, scale=1,
                  color=['red', 'firebrick'], alpha=0.35)

        plotted_data = normed_two_cols
    # ==========================================================================

    if show:
        plt.show()

    return (plotted_data,
            A, x0, y0, sigma_x, sigma_y, rho, z_offset)  # optimal gaussian params
# ===============================================================================================
# ===============================================================================================
# ===============================================================================================


def plot_scatter_hist(x_gene, y_gene, mode='raw'):
    """
    Draw a two-gene scatter plot with marginal histograms on a new figure.

    The main axes hold the `plot_genes` scatter and fitted contours; inset
    axes above and to the right hold the per-gene histograms with their 1D
    Gaussian fits. The fitted 1D means are also drawn as lines on the main
    axes.

    Parameters
    ----------
    x_gene : str
        Gene plotted along the x-axis.
    y_gene : str
        Gene plotted along the y-axis.
    mode : str, default 'raw'
        Forwarded to `plot_genes`.

    Returns
    -------
    fig : matplotlib Figure
        The newly created figure.
    """
    fig = plt.figure(layout='constrained')
    ax = fig.add_gridspec(top=0.75, right=0.75).subplots()
    # ax.set_aspect('equal', adjustable='box')
    # ax.set(aspect=1)
    ax_histx = ax.inset_axes([0, 1.05, 1, 0.25], sharex=ax)
    ax_histy = ax.inset_axes([1.05, 0, 0.25, 1], sharey=ax)
    # The marginals share an axis with the scatter, so hide duplicate ticks.
    ax_histx.tick_params(axis="x", labelbottom=False)
    ax_histy.tick_params(axis="y", labelleft=False)

    # First element of the result is the frame that was actually plotted
    # (normalised in 'norm' mode); the marginals must use the same values.
    plotted_data = plot_genes(x_gene, y_gene, ax=ax, mode=mode)[0]

    _, x_mu, _ = plot_hist_gauss(plotted_data[x_gene], ax=ax_histx)
    _, y_mu, _ = plot_hist_gauss(plotted_data[y_gene], ax=ax_histy,
                                 orientation='horizontal')
    ax_histx.set_ylabel('Freq')
    ax_histy.set_xlabel('Freq')

    ax.vlines(x_mu, *ax.get_ylim(),
              label=f'{x_gene} mean', colors='C3', zorder=0)
    ax.hlines(y_mu, *ax.get_xlim(),
              label=f'{y_gene} mean', colors='C3', zorder=0)
    # ax.fill_between([plotted_data[x_gene].min(), plotted_data[x_gene].max()],
    #                 *ax.get_ylim(), color='C0', alpha=0.01, lw=0)
    # ax.fill_betweenx([plotted_data[y_gene].min(), plotted_data[y_gene].max()],
    #                  *ax.get_xlim(), color='C0', alpha=0.01, lw=0)

    return fig


def create_correct_gene_plot(genes, mode):
    """
    Gradio callback: render the plot matching the number of selected genes.

    Parameters
    ----------
    genes : list of str
        Selected gene names. One gene gives a histogram with Gaussian fit;
        two genes give the scatter plot with marginal histograms.
    mode : bool
        Checkbox state; truthy selects the 'norm' plot for two genes,
        otherwise 'raw'. Ignored for a single gene.

    Returns
    -------
    image : PIL Image
        The rendered figure as an RGB image.

    Raises
    ------
    gr.Error
        If no gene or more than two genes are selected.
    """
    # Imported here so that `Image` is guaranteed to be loaded; a bare
    # `import PIL` does not by itself import the `PIL.Image` submodule.
    from PIL import Image

    if not genes:
        raise gr.Error("Please select at least one gene to plot.")
    if len(genes) > 2:
        raise gr.Error("Cannot plot more than two genes at a time.")

    if len(genes) == 1:
        # Draw on explicit axes: this avoids plt.show() inside the callback
        # and avoids drawing onto whatever figure happens to be current.
        fig, ax = plt.subplots()
        plot_gene(genes[0], ax=ax)
    else:
        x_gene, y_gene = genes
        fig = plot_scatter_hist(x_gene, y_gene,
                                mode='norm' if mode else 'raw')

    try:
        # The canvas must be drawn before its pixel buffer is read.
        fig.canvas.draw()
        rgba = np.asarray(fig.canvas.buffer_rgba())
        return Image.fromarray(rgba).convert('RGB')
    finally:
        # pyplot keeps every figure alive until closed; release it so that
        # repeated requests do not accumulate figures.
        plt.close(fig)


demo = gr.Interface(
    create_correct_gene_plot,
    [
        gr.Dropdown(
            # Plain list of names rather than a pandas Index.
            list(gene_table.columns), value=["APP", "PSENEN"],
            multiselect=True, label="Genes",
            info="Select one or two genes to plot."
        ),
        gr.Checkbox(label="Normalize",
                    info="Recenter and normalize the Gaussian for two genes."),
    ],
    "image",
)

if __name__ == "__main__":
    demo.launch()