# Spaces: Running  -- page-header residue captured with the file, not Python code
import os | |
import pandas as pd | |
import numpy as np | |
# Free-text summary of what this module is for.
description = '''
This is a simple module for preprocessing data for the shiny app.
'''
def format_degs():
    '''
    Read every DEG table and return the significant rows as one frame.

    Each file in ``data/differentially_expressed_genes/`` is read as a
    tab-separated table and given a ``Dataset`` column holding the part
    of its file name before the first ``'_D'``. The tables are stacked
    and only rows with ``pvals_adj < 0.05`` are kept.

    Returns
    -------
    pd.DataFrame
        Significant rows from all files, with the added ``Dataset``
        column. The per-file row index is kept as read (not reset).

    Raises
    ------
    ValueError
        From ``pd.concat`` when the directory contains no files.
    '''
    deg_dir = 'data/differentially_expressed_genes/'

    degs = {}
    # os.listdir makes no ordering promise; sorting keeps the row order of
    # the result reproducible between runs and machines.
    for file in sorted(os.listdir(deg_dir)):
        ds = file.split('_D')[0]
        df = pd.read_csv(os.path.join(deg_dir, file), sep='\t')
        # Labeling data with dataset name (the scalar is broadcast to
        # every row)
        df['Dataset'] = ds
        # Files sharing the same prefix replace one another: last one wins
        degs[ds] = df

    # Combining all dataframes
    combined = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    return combined[combined['pvals_adj'] < 0.05]
def get_gse():
    '''
    Read the geneset enrichment table and reshape it to long format.

    ``data/geneset_enrichment.tsv`` is read as a tab-separated table and
    restricted to rows whose ``Gene Set`` equals ``'Hallmark RNA'``.

    Returns
    -------
    pd.DataFrame
        Columns ``['Dataset', 'Group', 'Value', 'Variable']`` with a
        fresh 0..n-1 index. ``Group`` comes from ``Group Name``,
        ``Value`` from ``GSE (-log10(adj. p-val))`` and ``Variable`` is
        the same label on every row.
    '''
    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'
    renamed = {'Group Name' : 'Group',
               'GSE (-log10(adj. p-val))' : 'Value'}

    # Reading in gse data
    gse = pd.read_csv('data/geneset_enrichment.tsv', sep='\t')

    # Subsetting to only hallmark and to the columns that are reported;
    # selecting whole columns replaces the row-by-row iterrows loop.
    is_hallmark = gse['Gene Set'] == 'Hallmark RNA'
    wanted = ['Dataset', 'Group Name', 'GSE (-log10(adj. p-val))']
    result = gse.loc[is_hallmark, wanted].rename(columns=renamed)

    # The filtered rows keep their file positions as index; renumber them
    result = result.reset_index(drop=True)
    result['Variable'] = var_name

    return result
def get_selection():
    '''
    Read the scMKL selection tables and total the selections per group.

    Every file in ``data/group_selections/`` is read as a tab-separated
    table and the tables are stacked. Rows whose ``Modality`` equals
    ``'RNA - hallmark'`` are kept, group names are reformatted, and
    ``Selected`` is summed per (``Dataset``, ``Group``, ``Modality``).

    Returns
    -------
    pd.DataFrame
        Columns ``['Dataset', 'Group', 'Value', 'Variable']``, one row
        per dataset/group pair, ordered by the grouping keys. ``Value``
        is the summed ``Selected`` and ``Variable`` is the same label on
        every row.

    Raises
    ------
    ValueError
        From ``pd.concat`` when the directory contains no files.
    '''
    var_name = 'scMKL Selection Frequency'
    sel_dir = 'data/group_selections/'

    # Reading in scMKL selection (sorted so the read order is stable)
    tables = [pd.read_csv(os.path.join(sel_dir, file), sep='\t')
              for file in sorted(os.listdir(sel_dir))]
    selection = pd.concat(tables)

    # Filtering to only hallmark rna runs
    hallmark = selection[selection['Modality'] == 'RNA - hallmark']

    # Formatting group names: underscores become spaces and the first 9
    # characters are dropped (presumably a 'HALLMARK ' prefix -- confirm
    # against the group names in the data). assign() builds a new frame,
    # so nothing is written onto the filtered slice above.
    names = hallmark['Group'].map(lambda name: name.replace('_', ' ')[9:])
    hallmark = hallmark.assign(Group=names)

    # Grouping by group and summing selection
    groupby = ['Dataset', 'Group', 'Modality']
    summed = hallmark.groupby(groupby)['Selected'].sum().reset_index()

    result = summed[['Dataset', 'Group', 'Selected']]
    result = result.rename(columns={'Selected' : 'Value'})
    result['Variable'] = var_name

    return result
def get_overlap(degs, group_dict):
    '''
    Compute, per dataset and group, the share of the group's genes
    that are differentially expressed.

    Parameters
    ----------
    degs : pd.DataFrame
        Must have a ``Dataset`` column and a ``names`` column holding
        the DE gene names.
    group_dict : dict
        Maps a group name to a collection of gene names.

    Returns
    -------
    pd.DataFrame
        Columns ``['Dataset', 'Group', 'Value', 'Variable']`` with one
        row per dataset/group pair. Datasets appear in the order they
        first occur in ``degs``, groups in ``group_dict`` order.
        ``Value`` is (distinct DE genes found in the group) divided by
        (size of the group), or NaN for an empty group.
    '''
    var_name = 'Proportion of DE Features'
    columns = ['Dataset', 'Group', 'Value', 'Variable']

    # The display name and gene list of a group do not depend on the
    # dataset, so build them once. Formatting replaces underscores with
    # spaces and drops the first 9 characters (presumably a 'HALLMARK '
    # prefix -- confirm against the keys of group_dict).
    groups = [(group.replace('_', ' ')[9:], list(genes))
              for group, genes in group_dict.items()]

    rows = []

    # unique() keeps first-appearance order; iterating a set of strings
    # would give a row order that can differ from run to run.
    for ds in degs['Dataset'].unique():

        # Getting array of deg genes for current dataset. Names are
        # de-duplicated so a gene listed on several rows is counted once
        # and the proportion cannot exceed 1.
        ds_names = degs.loc[degs['Dataset'] == ds, 'names']
        ds_degs = np.unique(np.array(ds_names, dtype = np.str_))

        for group_name, genes_list in groups:

            if genes_list:
                # Finding num of overlap between DE genes and groupings
                num_overlap = np.count_nonzero(np.isin(ds_degs, genes_list))
                # Taking proportion
                prop_overlap = num_overlap / len(genes_list)
            else:
                # Proportion of an empty group is undefined
                prop_overlap = float('nan')

            rows.append((ds, group_name, prop_overlap, var_name))

    return pd.DataFrame(rows, columns = columns)
def hallmarkify_tsv():
    '''
    Run the whole preprocessing chain and write the combined table.

    Stacks the DEG-overlap, geneset-enrichment and scMKL-selection
    tables and saves them, tab-separated and without the index, to
    ``data/hallmark_enrichment_selection_overlap.tsv`` (the tsv for the
    GSEApy results tab). Returns nothing.
    '''
    out_path = 'data/hallmark_enrichment_selection_overlap.tsv'

    significant = format_degs()

    # NOTE(review): allow_pickle = True unpickles the file, which can run
    # arbitrary code -- only point this at a trusted groupings file.
    hallmark_sets = np.load('data/hallmark_groupings.pkl', allow_pickle = True)

    # Same three tables, in the same order, as rows of one long frame
    pieces = [get_overlap(significant, hallmark_sets),
              get_gse(),
              get_selection()]

    pd.concat(pieces).to_csv(out_path, sep = '\t', index = False)