import os

import pandas as pd
import numpy as np

description = ''' This is a simple module for preprocessing data for the shiny app. '''

# Number of leading characters dropped from raw group names. The code
# strips this many characters unconditionally; it is the length of
# 'HALLMARK_' (presumably every group name carries that prefix -- verify
# against the grouping file).
_GROUP_PREFIX_LEN = 9


def _list_data_files(directory):
    '''
    Returns the file names in `directory`, sorted and without hidden
    files. os.listdir gives no ordering guarantee and also returns
    entries such as '.DS_Store', which would break pd.read_csv.
    '''
    return sorted(name for name in os.listdir(directory)
                  if not name.startswith('.'))


def _format_group_name(group):
    '''
    Turns a raw group name into its display form: underscores become
    spaces and the first `_GROUP_PREFIX_LEN` characters are dropped.
    '''
    return group.replace('_', ' ')[_GROUP_PREFIX_LEN:]


def _long_frame(datasets, groups, values, var_name):
    '''
    Builds the shared long-format table with columns
    ['Dataset', 'Group', 'Value', 'Variable'] and a fresh 0..n-1 index.
    '''
    values = list(values)
    return pd.DataFrame({
        'Dataset': list(datasets),
        'Group': list(groups),
        'Value': values,
        'Variable': [var_name] * len(values),
    })


def format_degs() -> pd.DataFrame:
    '''
    This function reads every DEG file in
    'data/differentially_expressed_genes/' and returns them as a single
    large pd.DataFrame with an added 'Dataset' column. Only rows with
    'pvals_adj' < 0.05 are kept.

    The dataset name is the part of the file name before the first
    '_D'. If two files share a dataset name, only one of them is kept
    (the last one in sorted file-name order).

    Raises ValueError (from pd.concat) when the directory holds no
    data files.
    '''
    deg_dir = 'data/differentially_expressed_genes/'

    degs = {}
    for file in _list_data_files(deg_dir):
        dataset = file.split('_D')[0]
        frame = pd.read_csv(deg_dir + file, sep='\t')

        # Labeling data with dataset name
        frame['Dataset'] = dataset
        degs[dataset] = frame

    # Combining all dataframes (original row indices are kept as-is)
    degs = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    return degs[degs['pvals_adj'] < 0.05]


def get_gse() -> pd.DataFrame:
    '''
    This function reads 'data/geneset_enrichment.tsv', keeps the rows
    whose 'Gene Set' is 'Hallmark RNA' and returns them as a
    pd.DataFrame with columns ['Dataset', 'Group', 'Value', 'Variable'].
    'Value' is taken from the 'GSE (-log10(adj. p-val))' column.
    '''
    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'

    # Reading in gse data
    gse = pd.read_csv('data/geneset_enrichment.tsv', sep='\t')

    # Subsetting to only hallmark
    gse = gse[gse['Gene Set'] == 'Hallmark RNA']

    return _long_frame(
        gse['Dataset'].tolist(),
        gse['Group Name'].tolist(),
        gse['GSE (-log10(adj. p-val))'].tolist(),
        var_name,
    )


def get_selection() -> pd.DataFrame:
    '''
    This function reads every selection file in
    'data/group_selections/', keeps the rows whose 'Modality' is
    'RNA - hallmark', and sums 'Selected' per (Dataset, Group,
    Modality). Returns a pd.DataFrame with columns
    ['Dataset', 'Group', 'Value', 'Variable'].

    Raises ValueError (from pd.concat) when the directory holds no
    data files.
    '''
    var_name = 'scMKL Selection Frequency'
    sel_dir = 'data/group_selections/'

    # Reading in scMKL selection
    selection = pd.concat([pd.read_csv(sel_dir + file, sep='\t')
                           for file in _list_data_files(sel_dir)])

    # Filtering to only hallmark rna runs. The explicit copy makes the
    # column assignment below act on an independent frame instead of a
    # slice of the concatenated one (avoids SettingWithCopyWarning).
    selection = selection[selection['Modality'] == 'RNA - hallmark'].copy()

    # Formatting group names
    selection['Group'] = selection['Group'].apply(_format_group_name)

    # Grouping by group and summing selection
    groupby = ['Dataset', 'Group', 'Modality']
    selection = selection.groupby(groupby)['Selected'].sum().reset_index()

    return _long_frame(
        selection['Dataset'].tolist(),
        selection['Group'].tolist(),
        selection['Selected'].tolist(),
        var_name,
    )


def get_overlap(degs, group_dict) -> pd.DataFrame:
    '''
    This function takes DEGs in the form of a dataframe (columns
    'Dataset' and 'names' are read) and a mapping of group name to a
    collection of gene names. For every dataset and every group it
    computes the number of DEG names found in the group divided by the
    size of the group.

    Returns a dataframe with columns
    ['Dataset', 'Group', 'Value', 'Variable']; rows are ordered by
    first appearance of each dataset in `degs`, then by `group_dict`
    order. A group with no genes gets NaN as its value.
    '''
    var_name = 'Proportion of DE Features'

    # Formatting names and materialising gene lists once, rather than
    # once per dataset.
    groups = [(_format_group_name(group), list(genes))
              for group, genes in group_dict.items()]

    datasets, group_names, values = [], [], []

    # unique() keeps first-appearance order, so the output row order is
    # reproducible (iterating a set of strings is not).
    for ds in degs['Dataset'].unique():
        # Getting array of deg genes for current dataset
        ds_degs = np.array(degs.loc[degs['Dataset'] == ds, 'names'],
                           dtype=np.str_)

        for group_name, genes_list in groups:
            if genes_list:
                # Finding num of overlap between DE genes and groupings.
                # NOTE(review): a name listed more than once in ds_degs
                # is counted each time -- confirm 'names' is unique
                # within a dataset.
                num_overlap = np.sum(np.isin(ds_degs, genes_list))

                # Taking proportion
                prop_overlap = num_overlap / len(genes_list)
            else:
                # Proportion of an empty group is undefined
                prop_overlap = np.nan

            datasets.append(ds)
            group_names.append(group_name)
            values.append(prop_overlap)

    return _long_frame(datasets, group_names, values, var_name)


def hallmarkify_tsv() -> None:
    '''
    This script ties together the other functions and saves the tsv
    for the GSEApy results tab
    ('data/hallmark_enrichment_selection_overlap.tsv').
    '''
    degs = format_degs()

    # SECURITY: allow_pickle=True unpickles the file, which can execute
    # arbitrary code. Only use with a grouping file you trust.
    group_dict = np.load('data/hallmark_groupings.pkl', allow_pickle=True)

    deg_overlap = get_overlap(degs, group_dict)
    gse = get_gse()
    selection = get_selection()

    all_ = pd.concat([deg_overlap, gse, selection])
    all_.to_csv('data/hallmark_enrichment_selection_overlap.tsv',
                sep='\t', index=False)

    return None