| library(biomaRt) |
# Handle to the Ensembl human gene dataset. It is not referenced again in the
# visible part of this script.
human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# Variant tables; the first CSV column becomes the row names.
testing <- read.csv('testing.csv', row.names = 1)
training <- read.csv(
  '/share/vault/Users/gz2294/Data/DMS/ClinVar.HGMD.PrimateAI.syn/training.csv',
  row.names = 1
)

# Lookup tables: UniProt ID <-> Ensembl gene ID, and gene ID -> paralog gene ID.
uniprot2geneid <- read.csv('uniprot2geneid.csv', row.names = 1)
geneid2paralog <- read.csv('geneid2paralog.csv', row.names = 1)

# Testing proteins -> their Ensembl gene IDs.
maps.to.testing <- uniprot2geneid$uniprot_gn_id %in% testing$uniprotID
testing.geneids <- uniprot2geneid$ensembl_gene_id[maps.to.testing]

# Those genes -> the Ensembl gene IDs of their paralogs.
has.testing.gene <- geneid2paralog$ensembl_gene_id %in% testing.geneids
testing.paralogs <-
  geneid2paralog$hsapiens_paralog_ensembl_gene[has.testing.gene]

# Paralog genes -> UniProt IDs.
is.testing.paralog <- uniprot2geneid$ensembl_gene_id %in% testing.paralogs
testing.paralogs.uid <- uniprot2geneid$uniprot_gn_id[is.testing.paralog]

# Drop every training row whose protein is in the testing set or is a paralog
# of a testing protein.
excluded.uids <- c(testing.paralogs.uid, testing$uniprotID)
training.filetered <- training[!(training$uniprotID %in% excluded.uids), ]
training <- training.filetered
| |
| |
# Number of equally sized splits to produce.
split.n <- 4
set.seed(0)

# Trim `training` so its row count is an exact multiple of `split.n`. Only rows
# with score == 0 are eligible for removal.
n.extra <- nrow(training) %% split.n
zero.score.rows <- which(training$score == 0)
if (n.extra > length(zero.score.rows)) {
  stop(
    "Need to drop ", n.extra, " row(s) with score == 0 but only ",
    length(zero.score.rows), " available.",
    call. = FALSE
  )
}

# Index through sample.int() instead of calling sample(x, k) directly:
# sample() treats a single number x as the range 1:x, so a lone eligible row
# index would otherwise be replaced by a random row from 1:index. For pools
# with more than one element this draws exactly what sample() would.
to.drop <- zero.score.rows[sample.int(length(zero.score.rows), n.extra)]

# Guard is required: training[-integer(0), ] would select zero rows.
if (length(to.drop) > 0) {
  training <- training[-to.drop, ]
}
#' Randomly pick whole uniprotIDs until a row budget is (nearly) filled
#'
#' Repeatedly draws one ID at random from those whose `Freq` still fits into
#' the unfilled part of the budget, so the summed `Freq` of the picked IDs never
#' exceeds `number_to_select`. Stops when the budget is met or nothing fits.
#'
#' Side effect: resets the global RNG with `set.seed(0)` on every call.
#'
#' @param freq_table Data frame with columns `Var1` (ID) and `Freq` (row count).
#' @param number_to_select Maximum total `Freq` to pick.
#' @return Unnamed list: `[[1]]` the picked IDs as a character vector (`NULL`
#'   if none were picked), `[[2]]` `freq_table` without the picked IDs.
split.by.uniprotID <- function(freq_table, number_to_select) {
  set.seed(0)
  picked_ids <- c()
  picked_rows <- 0
  remaining <- freq_table

  repeat {
    # IDs small enough to fit into what is left of the budget.
    eligible <- remaining[remaining$Freq <= number_to_select - picked_rows, ]
    if (!(picked_rows < number_to_select && nrow(eligible) > 0)) {
      break
    }

    chosen <- sample(as.character(eligible$Var1), size = 1)
    picked_ids <- c(picked_ids, chosen)
    picked_rows <- picked_rows + remaining$Freq[remaining$Var1 == chosen]

    # A picked ID is no longer available to this or later calls.
    remaining <- remaining[!(remaining$Var1 %in% chosen), ]
  }

  list(picked_ids, remaining)
}
| |
# Target number of rows per split.
quarter.size <- floor(nrow(training) / split.n)

# Rows per uniprotID; columns are Var1 (the ID) and Freq (its row count).
training_freq_table <- as.data.frame(table(training$uniprotID))

# Pass 1: fill each split with whole uniprotIDs. Each call consumes IDs from
# the table left over by the previous call, so no ID lands in two splits.
# seq_len() keeps the loop correct for any split.n >= 1 (2:split.n would count
# backwards when split.n == 1).
splits <- list()
left_freq_table <- training_freq_table
for (s in seq_len(split.n)) {
  tmp <- split.by.uniprotID(freq_table = left_freq_table, quarter.size)
  splits[[s]] <- which(training$uniprotID %in% tmp[[1]])
  left_freq_table <- tmp[[2]]
}

# Pass 2: rows whose uniprotID was never picked are handed out row by row to
# top up any split that is still below quarter.size.
left_split <- which(training$uniprotID %in% left_freq_table$Var1)
for (s in seq_len(split.n)) {
  set.seed(0)
  n.missing <- quarter.size - length(splits[[s]])
  if (n.missing > 0) {
    # Index through sample.int() rather than sample(left_split, k): sample()
    # treats a single number as the range 1:x, so with exactly one leftover row
    # it would return a random row index instead of that row. For pools with
    # more than one element the draw is identical to sample()'s.
    to.add <- left_split[sample.int(length(left_split), n.missing)]
    splits[[s]] <- c(splits[[s]], to.add)
    left_split <- left_split[!left_split %in% to.add]
  }
}
# Record the character length of both sequence columns.
training$sequence.len <- nchar(training$sequence)
training$sequence.len.orig <- nchar(training$sequence.orig)

# Write each split to its own file, training.0.csv ... training.<split.n-1>.csv.
for (s in seq_len(split.n)) {
  tmp.split <- training[splits[[s]], ]

  # Shuffle the rows; reseeding makes every split's permutation reproducible.
  set.seed(0)
  tmp.split <- tmp.split[sample(nrow(tmp.split)), ]
  print(nrow(tmp.split))
  write.csv(tmp.split, paste0("training.", s - 1, ".csv"), na = ".")

  # Values are not auto-printed inside a for loop, so the table must be printed
  # explicitly to be visible.
  print(table(tmp.split$split))
}

# Write the full filtered training table, including the new length columns.
write.csv(training, file = 'training.csv')
|
|
|
|