# scMKL_analysis / helper.py
# ivango17's picture
# Added all lung data aside from umaps
# 292135e
import os
import pandas as pd
import numpy as np
# Module-level description string: states that this module preprocesses
# data for the shiny app. Not referenced by the functions visible in this file.
description = '''
This is a simple module for preprocessing data for the shiny app.
'''
def format_degs():
    '''
    Read every DEG table in 'data/differentially_expressed_genes/' and
    return them as one pd.DataFrame restricted to significant genes.

    Each file is read as a tab-separated table. The dataset name is the
    part of the file name before the first '_D' and is stored in a new
    'Dataset' column. Rows are kept only when 'pvals_adj' < 0.05.

    Returns
    -------
    pd.DataFrame
        Concatenation of all DEG tables (original per-file row index is
        kept) with the extra 'Dataset' column.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory holds no readable files.
    '''
    deg_dir = 'data/differentially_expressed_genes/'

    # os.listdir returns entries in arbitrary order; sorting makes the
    # row order of the result reproducible between runs and machines.
    degs = {}
    for file in sorted(os.listdir(deg_dir)):
        path = os.path.join(deg_dir, file)

        # Skipping hidden files (e.g. '.DS_Store') and sub-directories,
        # neither of which is a DEG table
        if file.startswith('.') or not os.path.isfile(path):
            continue

        # One table per dataset name: if two files share the same prefix
        # the later one (in sorted order) replaces the earlier one.
        ds = file.split('_D')[0]
        table = pd.read_csv(path, sep = '\t')

        # Labeling data with dataset name
        table['Dataset'] = ds
        degs[ds] = table

    # Combining all dataframes
    combined = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    return combined[combined['pvals_adj'] < 0.05]
def get_gse():
    '''
    Read 'data/geneset_enrichment.tsv' and return the Hallmark RNA rows
    in long format.

    Only rows whose 'Gene Set' equals 'Hallmark RNA' are kept.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with a fresh
        0..n-1 index. 'Group' comes from 'Group Name', 'Value' from
        'GSE (-log10(adj. p-val))', and 'Variable' is the constant
        label 'Gene Set Enrichment (-log10(adjusted p-value))'.
    '''
    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'

    # Reading in gse data
    gse = pd.read_csv('data/geneset_enrichment.tsv', sep = '\t')

    # Subsetting to only hallmark
    hallmark = gse[gse['Gene Set'] == 'Hallmark RNA']

    # Selecting whole columns instead of appending row by row; to_numpy()
    # drops the filtered index so the result is numbered from zero.
    return pd.DataFrame({
        'Dataset' : hallmark['Dataset'].to_numpy(),
        'Group' : hallmark['Group Name'].to_numpy(),
        'Value' : hallmark['GSE (-log10(adj. p-val))'].to_numpy(),
        'Variable' : var_name,
    })
def get_selection():
    '''
    Read the scMKL selection tables in 'data/group_selections/' and
    return the summed selection per dataset and group in long format.

    Every file is read as a tab-separated table and all tables are
    concatenated. Only rows whose 'Modality' equals 'RNA - hallmark' are
    kept. Group names have underscores replaced by spaces and their
    first nine characters removed, then 'Selected' is summed per
    ('Dataset', 'Group', 'Modality').

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] where 'Value'
        is the summed 'Selected' and 'Variable' is the constant label
        'scMKL Selection Frequency'.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory holds no readable files.
    '''
    var_name = 'scMKL Selection Frequency'
    sel_dir = 'data/group_selections/'

    # Reading in scMKL selection; hidden files (e.g. '.DS_Store') and
    # sub-directories are skipped because they are not selection tables
    tables = []
    for file in sorted(os.listdir(sel_dir)):
        path = os.path.join(sel_dir, file)
        if file.startswith('.') or not os.path.isfile(path):
            continue
        tables.append(pd.read_csv(path, sep = '\t'))
    selection = pd.concat(tables)

    # Filtering to only hallmark rna runs. The explicit copy makes the
    # column assignment below act on an independent frame rather than on
    # a slice of the concatenated table (avoids SettingWithCopyWarning).
    selection = selection[selection['Modality'] == 'RNA - hallmark'].copy()

    # Formatting group names: '_' -> ' ', then dropping the first nine
    # characters (presumably the 'HALLMARK ' prefix -- confirm)
    spaced = selection['Group'].str.replace('_', ' ', regex = False)
    selection['Group'] = spaced.str[9:]

    # Grouping by group and summing selection
    keys = ['Dataset', 'Group', 'Modality']
    summed = selection.groupby(keys)['Selected'].sum().reset_index()

    return pd.DataFrame({
        'Dataset' : summed['Dataset'].to_numpy(),
        'Group' : summed['Group'].to_numpy(),
        'Value' : summed['Selected'].to_numpy(),
        'Variable' : var_name,
    })
def get_overlap(degs, group_dict):
    '''
    Compute, for every dataset and gene group, the proportion of the
    group's genes that appear among the dataset's DE genes.

    Parameters
    ----------
    degs : pd.DataFrame
        Must contain a 'Dataset' column and a 'names' column holding
        gene names.
    group_dict : dict
        Maps group name -> iterable of gene names.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'], one row per
        (dataset, group). Datasets appear in order of first occurrence
        in `degs`, groups in `group_dict` order. 'Group' is the group
        name with '_' replaced by ' ' and the first nine characters
        removed. 'Value' is |DE genes in group| / |group| computed on
        unique gene names, so it always lies in [0, 1]; it is NaN for an
        empty group. 'Variable' is 'Proportion of DE Features'.
    '''
    var_name = 'Proportion of DE Features'
    columns = ['Dataset', 'Group', 'Value', 'Variable']

    # Group formatting does not depend on the dataset, so it is done
    # once. Genes are held as sets of str: membership tests are O(1) and
    # duplicated names cannot be counted twice.
    groups = [(group.replace('_', ' ')[9:], {str(gene) for gene in genes})
              for group, genes in group_dict.items()]

    rows = []
    # Series.unique() keeps first-occurrence order, unlike iterating a
    # set of strings whose order changes between interpreter runs.
    for ds in degs['Dataset'].unique():
        # Unique DE gene names for the current dataset; a gene listed on
        # several rows must count once or the proportion could exceed 1.
        in_ds = degs['Dataset'] == ds
        ds_degs = set(degs.loc[in_ds, 'names'].astype(str))

        for group_name, gene_set in groups:
            if gene_set:
                prop_overlap = len(gene_set & ds_degs) / len(gene_set)
            else:
                # Empty group: the proportion is undefined
                prop_overlap = float('nan')
            rows.append((ds, group_name, prop_overlap, var_name))

    return pd.DataFrame(rows, columns = columns)
def hallmarkify_tsv():
    '''
    Build the combined table for the GSEApy results tab and write it to
    'data/hallmark_enrichment_selection_overlap.tsv'.

    The table stacks three long-format frames, in this order: DEG
    overlap per hallmark group (get_overlap on the output of
    format_degs), geneset enrichment (get_gse) and scMKL selection
    (get_selection). It is written tab-separated without the index.

    Returns None.
    '''
    significant_degs = format_degs()

    # NOTE: the groupings are unpickled through np.load(allow_pickle=True);
    # unpickling can execute arbitrary code, so this file must be trusted.
    hallmark_groups = np.load('data/hallmark_groupings.pkl',
                              allow_pickle = True)

    # Each call returns a frame for stacking; order fixes row order in the tsv
    frames = [
        get_overlap(significant_degs, hallmark_groups),
        get_gse(),
        get_selection(),
    ]
    combined = pd.concat(frames)

    combined.to_csv('data/hallmark_enrichment_selection_overlap.tsv',
                    sep = '\t',
                    index = False)