File size: 4,420 Bytes
292135e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import pandas as pd
import numpy as np


# Module-level description text; it is not referenced by any of the
# functions visible in this file.
description = '''
This is a simple module for preprocessing data for the shiny app.
'''


def format_degs():
    '''
    This function reads every DEG file in
    'data/differentially_expressed_genes/', labels each table with its
    dataset name and returns the significant genes as a single large
    pd.DataFrame.

    The dataset name is the part of the file name before the first
    '_D'. Files are read as tab-separated text and must contain a
    'pvals_adj' column; only rows with pvals_adj < 0.05 are kept. If
    two files share a dataset name, the one that sorts last by file
    name replaces the earlier one.

    Returns
    -------
    pd.DataFrame
        Concatenated DEG tables with an added 'Dataset' column. The
        per-file row index is kept, so index labels can repeat.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory contains no files.
    '''
    deg_dir = 'data/differentially_expressed_genes/'

    degs = {}

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # row order of the result reproducible between runs and machines
    for file in sorted(os.listdir(deg_dir)):
        ds = file.split('_D')[0]
        df = pd.read_csv(os.path.join(deg_dir, file), sep = '\t')

        # Labeling data with dataset name (scalar is broadcast to rows)
        df['Dataset'] = ds
        degs[ds] = df

    # Combining all dataframes
    degs = pd.concat(list(degs.values()))

    # Filtering out insignificant genes
    degs = degs[degs['pvals_adj'] < 0.05]

    return degs


def get_gse():
    '''
    This function reads 'data/geneset_enrichment.tsv' (tab-separated),
    keeps the rows whose 'Gene Set' is 'Hallmark RNA' and reshapes them
    into the long format shared by the other tables in this module.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'] with a fresh
        0..n-1 index. 'Group' comes from 'Group Name', 'Value' from
        'GSE (-log10(adj. p-val))', and 'Variable' is the constant
        label 'Gene Set Enrichment (-log10(adjusted p-value))'.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    '''
    var_name = 'Gene Set Enrichment (-log10(adjusted p-value))'

    # Source column -> output column, in output order
    columns = {'Dataset' : 'Dataset',
               'Group Name' : 'Group',
               'GSE (-log10(adj. p-val))' : 'Value'}

    # Reading in gse data
    gse = pd.read_csv('data/geneset_enrichment.tsv', sep = '\t')

    # Subsetting to only hallmark
    gse = gse[gse['Gene Set'] == 'Hallmark RNA']

    # Selecting and renaming whole columns instead of appending row by
    # row; rename returns a new frame, so adding 'Variable' below does
    # not write into the filtered slice
    df = gse[list(columns)].rename(columns = columns)
    df = df.reset_index(drop = True)
    df['Variable'] = var_name

    return df


def get_selection():
    '''
    This function reads every tab-separated file in
    'data/group_selections/', keeps the 'RNA - hallmark' runs and sums
    the 'Selected' column per dataset and group.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'], one row per
        (Dataset, Group, Modality) combination sorted by those keys.
        'Value' is the summed 'Selected' and 'Variable' is the constant
        label 'scMKL Selection Frequency'.

    Raises
    ------
    ValueError
        Raised by pd.concat when the directory contains no files.
    KeyError
        If one of the expected columns is missing.
    '''
    sel_dir = 'data/group_selections/'
    var_name = 'scMKL Selection Frequency'

    # Reading in scMKL selection (sorted for a reproducible read order)
    selection = [pd.read_csv(os.path.join(sel_dir, file), sep = '\t')
                 for file in sorted(os.listdir(sel_dir))]
    selection = pd.concat(selection)

    # Filtering to only hallmark rna runs; .copy() so the column
    # assignment below does not write into a slice of the full table
    is_hallmark = selection['Modality'] == 'RNA - hallmark'
    selection = selection[is_hallmark].copy()

    # Formatting group names: underscores become spaces, then the first
    # 9 characters are dropped (presumably a 'HALLMARK_' prefix)
    names = selection['Group'].str.replace('_', ' ', regex = False)
    selection['Group'] = names.str[9:]

    # Grouping by group and summing selection
    groupby = ['Dataset', 'Group', 'Modality']
    selection = selection.groupby(groupby)['Selected'].sum().reset_index()

    # Selecting and renaming whole columns instead of appending row by row
    df = selection[['Dataset', 'Group', 'Selected']]
    df = df.rename(columns = {'Selected' : 'Value'})
    df['Variable'] = var_name

    return df

def get_overlap(degs, group_dict):
    '''
    This function computes, for every dataset in `degs` and every
    grouping in `group_dict`, the proportion of the grouping's genes
    that are differentially expressed in that dataset.

    Parameters
    ----------
    degs : pd.DataFrame
        DEG table with a 'Dataset' column and a 'names' column holding
        the gene names.
    group_dict : dict
        Maps a group name to a collection of gene names.

    Returns
    -------
    pd.DataFrame
        Columns ['Dataset', 'Group', 'Value', 'Variable'], one row per
        dataset/group pair. Datasets appear in sorted order and groups
        in the order of `group_dict`. 'Group' is the group name with
        underscores replaced by spaces and its first 9 characters
        dropped; 'Value' is NaN for a grouping with no genes; and
        'Variable' is the constant label 'Proportion of DE Features'.
    '''
    var_name = 'Proportion of DE Features'

    # Formatting group names and de-duplicating gene collections once,
    # rather than repeating the work for every dataset
    groups = [(group.replace('_', ' ')[9:], {str(gene) for gene in genes})
              for group, genes in group_dict.items()]

    rows = []

    # groupby visits datasets in sorted order, so the output row order
    # is reproducible (iterating a set of strings is not)
    for ds, ds_df in degs.groupby('Dataset'):

        # Unique DE genes for the current dataset; a gene listed more
        # than once must count only once or the proportion can exceed 1
        ds_degs = set(ds_df['names'].astype(str))

        for group_name, genes in groups:

            if genes:
                # Taking proportion of the grouping that is DE
                prop_overlap = len(genes & ds_degs) / len(genes)
            else:
                # Empty grouping: the proportion is undefined
                prop_overlap = np.nan

            rows.append((ds, group_name, prop_overlap, var_name))

    columns = ['Dataset', 'Group', 'Value', 'Variable']

    return pd.DataFrame(rows, columns = columns)


def hallmarkify_tsv():
    '''
    This function ties the other functions together: it builds the DEG
    overlap, gene set enrichment and scMKL selection tables, stacks
    them and writes the result to
    'data/hallmark_enrichment_selection_overlap.tsv' as tab-separated
    text without the index. Returns None.
    '''
    out_path = 'data/hallmark_enrichment_selection_overlap.tsv'

    sig_degs = format_degs()

    # NOTE: allow_pickle = True lets np.load unpickle the file, which
    # can run arbitrary code; only load a groupings file you trust
    groupings = np.load('data/hallmark_groupings.pkl', allow_pickle = True)

    # Long-format tables sharing the same four columns
    tables = [get_overlap(sig_degs, groupings),
              get_gse(),
              get_selection()]

    combined = pd.concat(tables)
    combined.to_csv(out_path, sep = '\t', index = False)