Spaces:
Running
Running
File size: 4,420 Bytes
292135e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
import os
import pandas as pd
import numpy as np
# Human-readable summary of what this module is for.
description = (
    '\n'
    'This is a simple module for preprocessing data for the shiny app.'
    '\n'
)
def format_degs(deg_dir = 'data/differentially_expressed_genes/'):
    '''
    Reads every tab-separated DEG file in `deg_dir`, labels each table
    with its dataset name and returns them as a single pd.DataFrame
    filtered to significant genes (pvals_adj < 0.05).

    The dataset name is the part of the file name before the first
    '_D'. If two files share that prefix, only the last one (in sorted
    file name order) is kept.

    Parameters
    ----------
    deg_dir : str
        Directory holding the DEG files. A trailing slash is optional.

    Returns
    -------
    pd.DataFrame
        All significant rows from all files with an added 'Dataset'
        column. The original per-file row index is kept.

    Raises
    ------
    ValueError
        If `deg_dir` contains no readable files.
    '''
    degs = {}

    # Sorted so the row order of the result does not depend on the
    # arbitrary order returned by os.listdir
    for file in sorted(os.listdir(deg_dir)):
        path = os.path.join(deg_dir, file)

        # Skipping hidden files (e.g. .DS_Store) and sub-directories
        if file.startswith('.') or not os.path.isfile(path):
            continue

        df = pd.read_csv(path, sep = '\t')

        # Labeling data with dataset name
        ds = file.split('_D')[0]
        df['Dataset'] = ds
        degs[ds] = df

    if not degs:
        raise ValueError(f'No DEG files found in {deg_dir!r}')

    # Combining all dataframes
    degs = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    return degs[degs['pvals_adj'] < 0.05]
def get_gse(gse_path = 'data/geneset_enrichment.tsv'):
    '''
    Reads the geneset enrichment table and reshapes the 'Hallmark RNA'
    rows into the long format shared by the other loaders.

    Parameters
    ----------
    gse_path : str
        Path to the tab-separated enrichment file. It must contain the
        columns 'Gene Set', 'Dataset', 'Group Name' and
        'GSE (-log10(adj. p-val))'.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with a fresh
        0..n-1 index; 'Variable' is the same label on every row.
    '''
    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'

    # Source column -> output column, in output order
    columns = {'Dataset' : 'Dataset',
               'Group Name' : 'Group',
               'GSE (-log10(adj. p-val))' : 'Value'}

    # Reading in gse data
    gse = pd.read_csv(gse_path, sep = '\t')

    # Subsetting to only hallmark and keeping just the needed columns
    is_hallmark = gse['Gene Set'] == 'Hallmark RNA'
    hallmark = gse.loc[is_hallmark, list(columns)]

    # Column-wise rename instead of building the table row by row
    out = hallmark.rename(columns = columns).reset_index(drop = True)

    return out.assign(Variable = var_name)
def get_selection(sel_dir = 'data/group_selections/'):
    '''
    Reads the scMKL group selection tables in `sel_dir`, keeps the
    'RNA - hallmark' runs and sums 'Selected' per dataset and group.

    Parameters
    ----------
    sel_dir : str
        Directory holding the tab-separated selection files. A
        trailing slash is optional. Files must contain the columns
        'Dataset', 'Group', 'Modality' and 'Selected'.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'], one row per
        (Dataset, Group); 'Value' is the summed 'Selected'.

    Raises
    ------
    ValueError
        If `sel_dir` contains no readable files.
    '''
    var_name = 'scMKL Selection Frequency'

    # Reading in scMKL selection
    tables = []
    for file in sorted(os.listdir(sel_dir)):
        path = os.path.join(sel_dir, file)

        # Skipping hidden files (e.g. .DS_Store) and sub-directories
        if file.startswith('.') or not os.path.isfile(path):
            continue

        tables.append(pd.read_csv(path, sep = '\t'))

    if not tables:
        raise ValueError(f'No selection files found in {sel_dir!r}')

    selection = pd.concat(tables)

    # Filtering to only hallmark rna runs; copying so that editing the
    # 'Group' column below does not write into a slice of `selection`
    is_hallmark = selection['Modality'] == 'RNA - hallmark'
    selection = selection[is_hallmark].copy()

    # Formatting group names: underscores become spaces and the first
    # 9 characters are dropped (presumably the 'HALLMARK ' prefix;
    # names are not checked for it)
    names = selection['Group'].str.replace('_', ' ', regex = False)
    selection['Group'] = names.str[9:]

    # Grouping by group and summing selection
    groupby = ['Dataset', 'Group', 'Modality']
    summed = selection.groupby(groupby)['Selected'].sum().reset_index()

    out = summed[['Dataset', 'Group', 'Selected']]
    out = out.rename(columns = {'Selected' : 'Value'})

    return out.assign(Variable = var_name)
def get_overlap(degs, group_dict):
    '''
    Computes, for every dataset and every gene group, the proportion
    of the group's genes that are differentially expressed.

    Parameters
    ----------
    degs : pd.DataFrame
        DEG table with at least the columns 'Dataset' and 'names'.
    group_dict : dict
        Maps a group name to an iterable of gene names.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with one row
        per (dataset, group). Datasets appear in order of first
        appearance in `degs`, groups in `group_dict` order. 'Value' is
        NaN for a group with no genes.
    '''
    var_name = 'Proportion of DE Features'
    columns = ['Dataset', 'Group', 'Value', 'Variable']

    # Group labels and gene sets do not depend on the dataset, so they
    # are prepared once. The label drops the first 9 characters after
    # replacing underscores (presumably the 'HALLMARK ' prefix; names
    # are not checked for it)
    groups = [(group.replace('_', ' ')[9:], {str(gene) for gene in genes})
              for group, genes in group_dict.items()]

    rows = []

    # unique() keeps first-appearance order, unlike iterating a set
    for ds in degs['Dataset'].unique():

        # Unique DE genes for the current dataset; a gene listed on
        # several rows must only count once towards the proportion
        ds_names = degs.loc[degs['Dataset'] == ds, 'names']
        ds_degs = {str(name) for name in ds_names}

        for group_name, genes in groups:

            # Taking proportion of the group's genes that are DE
            if genes:
                prop_overlap = len(genes & ds_degs) / len(genes)
            else:
                prop_overlap = float('nan')

            rows.append((ds, group_name, prop_overlap, var_name))

    return pd.DataFrame(rows, columns = columns)
def hallmarkify_tsv():
    '''
    Ties together the other functions and saves the tsv for the GSEApy
    results tab.

    Reads the DEG tables, the hallmark groupings, the geneset
    enrichment table and the scMKL selections, stacks the three
    long-format results and writes them to
    'data/hallmark_enrichment_selection_overlap.tsv'.

    Returns
    -------
    None
    '''
    degs = format_degs()

    # NOTE: allow_pickle = True unpickles the file, which can execute
    # arbitrary code. Only use a groupings file from a trusted source.
    group_dict = np.load('data/hallmark_groupings.pkl', allow_pickle = True)

    deg_overlap = get_overlap(degs, group_dict)
    gse = get_gse()
    selection = get_selection()

    # All three tables share the Dataset/Group/Value/Variable columns
    all_ = pd.concat([deg_overlap, gse, selection])

    all_.to_csv('data/hallmark_enrichment_selection_overlap.tsv', sep = '\t', index = False)

    return None