# VarWiz / app/logic/data_preprocessor.R
# vojtam's picture
# Upload 18 files
# 8e7c9f6 verified
box::use(
data.table[...],
stats[na.omit],
RColorBrewer[brewer.pal]
)
#' Load the annotated variants and merge them with the KEGG gene table
#'
#' @description
#' Reads `all_variants.annotated.processed.tsv` and `kegg_tab.tsv` from
#' `data_dir`, keeps the variants that have an `HGVSp` value and whose `SOURCE`
#' is `"Ensembl"`, renames the annotation columns to snake_case and inner-joins
#' the result with the KEGG table on `ensembl_id`.
#'
#' @param data_dir Directory that contains the two input files. Defaults to
#'   `"data"`, the location that used to be hard-coded.
#'
#' @return A `data.table` holding only the rows whose `ensembl_id` occurs in
#'   both inputs. From the variants file it carries the columns
#'   consequence | given_ref | Allele | protein_pos | HGVSp | amino_acids |
#'   var_name | ensembl_id | biotype | gene_name | variant_class | chrom | pos,
#'   followed by every column of `kegg_tab.tsv` except `gene_definition` and
#'   `type`. Earlier documentation listed those as all_kegg_gene_names |
#'   refseq_id | kegg_paths_id | kegg_paths_name; this cannot be confirmed from
#'   the code alone, so verify against the file.
#' @export
#'
#' @examples
#' \dontrun{
#' tab <- preprocess_data()
#' }
preprocess_data <- function(data_dir = "data") {
  # "." and "-" are read as NA, so the HGVSp filter below also removes rows
  # that hold one of those markers.
  variants_tab_full <- fread(
    file.path(data_dir, "all_variants.annotated.processed.tsv"),
    na.strings = c(".", "-")
  )
  kegg_tab <- fread(file.path(data_dir, "kegg_tab.tsv"))

  # Filter rows and pick columns in one step. Inside `i`, data.table treats NA
  # as FALSE, so rows with a missing SOURCE are dropped as well.
  variants_tab <- variants_tab_full[
    !is.na(HGVSp) & SOURCE == "Ensembl",
    .(
      Consequence, GIVEN_REF, Allele, Protein_position, HGVSp, Amino_acids,
      var_name, Gene, BIOTYPE, SOURCE, SYMBOL, VARIANT_CLASS, chrom, pos
    )
  ]

  setnames(
    variants_tab,
    old = c(
      "Gene", "GIVEN_REF", "BIOTYPE", "SYMBOL", "VARIANT_CLASS",
      "Consequence", "Protein_position", "Amino_acids"
    ),
    new = c(
      "ensembl_id", "given_ref", "biotype", "gene_name", "variant_class",
      "consequence", "protein_pos", "amino_acids"
    )
  )

  # Inner join: nomatch = 0 discards KEGG rows without a matching variant.
  tab <- variants_tab[kegg_tab, on = "ensembl_id", nomatch = 0]

  # Drop helper columns, but only those that are really present, so a KEGG
  # table without `gene_definition` / `type` does not trigger warnings.
  drop_cols <- intersect(c("gene_definition", "SOURCE", "type"), names(tab))
  if (length(drop_cols) > 0L) {
    tab[, (drop_cols) := NULL]
  }

  tab
}
#' Load the MAF mutations and merge them with the KEGG pathway names
#'
#' @description
#' Reads `brca_tcga_pub2015/data_mutations.txt` and `kegg_tab.tsv` from
#' `data_dir`, keeps a fixed set of mutation columns, removes rows containing
#' `NA`, builds a `var_name` identifier, inner-joins the mutations with the
#' KEGG table on `ensembl_id` and attaches one display colour per
#' `Variant_Type`.
#'
#' @param data_dir Directory that contains `kegg_tab.tsv` and the
#'   `brca_tcga_pub2015` sub-directory. Defaults to `"data"`, the location that
#'   used to be hard-coded.
#'
#' @return A `data.table` with the columns
#'   gene_name | ensembl_id | Chromosome | Start_Position | End_Position |
#'   Strand | Consequence | Variant_Classification | Variant_Type |
#'   Reference_Allele | Allele | HGVSc | HGVSp | Protein_position | Codons |
#'   var_name | kegg_paths_name | col,
#'   where `var_name` is `<Chromosome>_<Start_Position>_<Reference_Allele>/<Allele>`
#'   and `col` is a colour from the RColorBrewer "Set2" palette assigned to the
#'   row's `Variant_Type`.
#' @export
#'
#' @examples
#' \dontrun{
#' maf_tab <- preprocess_maf_data()
#' }
preprocess_maf_data <- function(data_dir = "data") {
  # Only the join key and the pathway name are needed from the KEGG table;
  # entries with an empty Ensembl id cannot be matched meaningfully.
  kegg_tab <- fread(file.path(data_dir, "kegg_tab.tsv"))[
    ensembl_id != "",
    .(ensembl_id, kegg_paths_name)
  ]

  mutations_data <- fread(
    file.path(data_dir, "brca_tcga_pub2015", "data_mutations.txt"),
    sep = "\t"
  )
  variants_tab <- mutations_data[, .(
    Hugo_Symbol, Gene, Chromosome, Start_Position, End_Position, Strand,
    Consequence, Variant_Classification, Variant_Type, Reference_Allele,
    Tumor_Seq_Allele2, HGVSc, HGVSp, Protein_position, Codons
  )]
  setnames(
    variants_tab,
    old = c("Gene", "Hugo_Symbol", "Tumor_Seq_Allele2"),
    new = c("ensembl_id", "gene_name", "Allele")
  )

  variants_tab <- na.omit(variants_tab)
  variants_tab[
    ,
    var_name := paste0(
      Chromosome, "_", Start_Position, "_", Reference_Allele, "/", Allele
    )
  ]

  # One gene can appear in several KEGG rows, hence allow.cartesian.
  tab <- variants_tab[
    kegg_tab,
    on = "ensembl_id", nomatch = 0, allow.cartesian = TRUE
  ]

  # brewer.pal() only serves 3..8 colours for "Set2": outside that range it
  # warns and returns 3 (or 8) colours, which would not line up with the
  # variant types. Request a size inside the valid range and fit it to the
  # actual number of types (truncating, or recycling beyond 8 types).
  variant_types <- unique(tab$Variant_Type)
  n_types <- length(variant_types)
  set2_cols <- brewer.pal(min(max(n_types, 3L), 8L), "Set2")
  colors_tab <- data.table(
    col = rep_len(set2_cols, n_types),
    Variant_Type = variant_types
  )

  tab[colors_tab, on = "Variant_Type"]
}