# Plot train/val loss and test AUC for every epoch; returns a per-epoch summary data.frame.
get.auc.by.epoch <- function(configs, base.line="uniprotID") {
  log.dir <- configs$log_dir
  # number of training variants = line count of the training csv
  data.train <- as.numeric(strsplit(system(paste0("wc -l ", configs$data_file_train), intern = T),
                                    split = " ")[[1]][1])
  num_saved_batches <- floor(ceiling(data.train * configs$train_size / configs$ngpus / configs$batch_size) *
                               configs$num_epochs / configs$num_save_batches) + 1
  epochs <- c(1:(configs$num_epochs))
  source('/share/pascal/Users/gz2294/Pipeline/AUROC.R')
  library(doParallel)
  cl <- makeCluster(72)
  registerDoParallel(cl)
  res <- foreach (i = 1:length(epochs), .combine=rbind) %dopar% {
    i <- epochs[i]
    source('~/Pipeline/AUROC.R')
    if (file.exists(paste0(log.dir, 'test_result.epoch.', i, '.csv'))) {
      test.result <- read.csv(paste0(log.dir, 'test_result.epoch.', i, '.csv'))
      if ('y.0' %in% colnames(test.result)) {
        if ('y.2' %in% colnames(test.result)) {
          # 3-dim logits
          test.result <- test.result[!is.na(test.result$y.0) & !is.na(test.result$y.1) & !is.na(test.result$y.2),]
          test.logits <- test.result[, c("y.0", "y.1", "y.2")]
          test.logits <- t(apply(as.matrix(test.logits), 1, soft.max))
          # check whether clinvar or gof/lof
          if (-1 %in% test.result$score) {
            test.logits <- test.logits[,3] / (test.logits[,2] + test.logits[,3])
          } else {
            test.logits <- 1 - test.logits[,1]
          }
        } else if ('y.1' %in% colnames(test.result)) {
          test.result <- test.result[!is.na(test.result$y.1),]
          test.logits <- test.result$y.1
        } else {
          test.result <- test.result[!is.na(test.result$y.0),]
          test.logits <- test.result$y.0
        }
      } else {
        test.result <- test.result[!is.na(test.result$y),]
        test.logits <- test.result$y
      }
      result <- plot.AUC(test.result$score, test.logits
                         # paste0(log.dir, 'test_result.epoch.', i, '.pdf')
                         )
      # Youden's J: pick the cutoff that maximizes TPR - FPR
      J_stats <- result$curve[,2] - result$curve[,1]
      optimal.cutoff <- result$curve[which(J_stats==max(J_stats))[1],3]
    } else {
      result <- list(auc=NA)
      optimal.cutoff <- NA
    }
    if (file.exists(paste0(log.dir, 'result_dict.epoch.', i-1, '.ddp_rank.', 0, '.json'))) {
      val_losses <- c()
      train_losses <- c()
      if (configs$ngpus > 1) {
        # average losses across DDP ranks
        for (rank in 0:(configs$ngpus-1)) {
          val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.epoch.', i-1,
                                                '.ddp_rank.', rank, '.json'))
          if (!is.null(val_dic$val_loss_y)) {
            val_losses <- c(val_losses, val_dic$val_loss_y)
            train_losses <- c(train_losses, val_dic$train_loss_y)
          } else {
            val_losses <- c(val_losses, val_dic$val_loss)
            train_losses <- c(train_losses, val_dic$train_loss)
          }
        }
      } else {
        rank <- configs$gpu_id
        if (is.null(rank)) {
          rank <- 0
        }
        val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.epoch.', i-1,
                                              '.ddp_rank.', rank, '.json'))
        if (!is.null(val_dic$val_loss_y)) {
          val_losses <- c(val_losses, val_dic$val_loss_y)
          train_losses <- c(train_losses, val_dic$train_loss_y)
        } else {
          val_losses <- c(val_losses, val_dic$val_loss)
          train_losses <- c(train_losses, val_dic$train_loss)
        }
      }
    } else {
      train_losses <- NA
      val_losses <- NA
    }
    if (file.exists(paste0(log.dir, 'test_result.epoch.', i, '.txt'))) {
      # the txt file holds a python-style dict; swap quotes so jsonlite can parse it
      test_dic <- readLines(paste0(log.dir, 'test_result.epoch.', i, '.txt'), warn = F)
      test_dic <- gsub("'", '"', test_dic)
      test_dic <- jsonlite::fromJSON(test_dic)
      if (!is.null(test_dic$test_loss_y)) {
        test_losses <- test_dic$test_loss_y
      } else {
        test_losses <- test_dic$test_loss
      }
    } else {
      test_losses <- NA
    }
    res <- data.frame(train=mean(train_losses), val=mean(val_losses), test=mean(test_losses),
                      aucs=result$auc, optimal.cutoffs=optimal.cutoff)
    res
  }
  stopCluster(cl)
  res$epochs <- epochs
  res <- res[!is.na(res$train),]
  val <- res$val
  aucs <- res$aucs
  optimal.cutoffs <- res$optimal.cutoffs
  epochs <- res$epochs
  train <- res$train
  to.plot <- data.frame(epoch=rep(epochs, 2),
                        loss=c(train, val),
                        auc=rep(aucs, 2),
                        metric_name=c(rep("train_loss", length(epochs)),
                                      rep("val_loss", length(epochs))))
  # calculate baseline
  if (base.line == "uniprotID") {
    baseline.uniprotID <- system(
      paste0("/share/vault/Users/gz2294/miniconda3/envs/r4-base/bin/python ",
             "/share/pascal/Users/gz2294/PreMode.final/analysis/random.forest.process.classifier.py ",
             configs$data_file_train, " ", configs$data_file_test),
      intern = T
    )
    baseline.auc <- as.numeric(strsplit(baseline.uniprotID, ": ")[[1]][2])
    if (dim(res)[1] > 0) {
      res$baseline.auc <- baseline.auc
    }
  } else if (base.line == "esm") {
    # ESM token alphabet; the empty strings stand for special tokens with no amino acid
    alphabet <- c('', '', '', '', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K',
                  'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '', '')
    data.file.name <- configs$data_type
    fold <- strsplit(configs$data_file_train, "pfams.0.8.seed.")[[1]][2]
    fold <- as.numeric(substr(fold, 1, 1))
    if (is.na(fold)) {
      fold <- 0
    }
    baseline.file <- paste0('/share/pascal/Users/gz2294/PreMode.final/analysis/esm2.inference/',
                            data.file.name, "/testing.fold.", fold, ".logits.csv")
    test.result <- read.csv(configs$data_file_test, row.names = 1)
    if (file.exists(baseline.file)) {
      baseline.res <- read.csv(baseline.file)
      logits <- baseline.res[,2:34]
      colnames(logits) <- alphabet
      # ESM baseline score: alt-allele logit minus ref-allele logit
      score <- c()
      for (k in 1:dim(logits)[1]) {
        score <- c(score, logits[k, test.result$alt[k]] - logits[k, test.result$ref[k]])
      }
      result <- plot.AUC(test.result$score, score)
      if (dim(res)[1] > 0) {
        res$baseline.auc <- result$auc
      }
    }
  }
  library(ggplot2)
  if (is.na(to.plot$auc[1])) {
    p <- ggplot(to.plot, aes(x=epoch)) +
      geom_line(aes(y=loss, col=metric_name)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  } else {
    p <- ggplot(to.plot, aes(x=epoch)) +
      geom_line(aes(y=loss, col=metric_name)) +
      geom_line(aes(y=auc)) +
      scale_y_continuous(
        # Features of the first axis
        name = "Loss",
        breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05),
        limits = c(0, max(1.1, max(to.plot$loss))),
        # Add a second axis and specify its features
        sec.axis = sec_axis(~ ., name="AUC",
                            breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05))
      ) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  }
  ggsave('Loss.AUC.by.epoch.pdf', p, width = 9, height = 6)
  print(paste0("min val epoch (", epochs[which(val==min(val))[1]],
               ") AUC: ", round(aucs[which(val==min(val))[1]], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(val==min(val))[1]], digits = 2)))
  print(paste0("end epoch (", epochs[length(val)],
               ") AUC: ", round(aucs[length(aucs)], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[length(aucs)], digits = 2)))
  print(paste0("max AUC epoch (", epochs[which(aucs==max(aucs))[1]],
               "): ", round(max(aucs), digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(aucs==max(aucs))[1]], digits = 2)))
  res
}
# Same as get.auc.by.epoch, but evaluated at every saved training step instead of every epoch.
get.auc.by.step <- function(configs, base.line="uniprotID") {
  log.dir <- configs$log_dir
  data.train <- as.numeric(strsplit(system(paste0("wc -l ", configs$data_file_train), intern = T),
                                    split = " ")[[1]][1])
  num_saved_batches <- floor(ceiling(data.train * configs$train_size / configs$ngpus / configs$batch_size) *
                               configs$num_epochs / configs$num_save_batches) + 1
  steps <- c(1:(num_saved_batches-1))*configs$num_save_batches
  source('/share/pascal/Users/gz2294/Pipeline/AUROC.R')
  library(doParallel)
  cl <- makeCluster(72)
  registerDoParallel(cl)
  res <- foreach (i = 1:length(steps), .combine=rbind) %dopar% {
  # for (i in 1:length(steps)) {
    source('~/Pipeline/AUROC.R')
    i <- steps[i]
    if (file.exists(paste0(log.dir, 'test_result.step.', i, '.csv'))) {
      test.result <- read.csv(paste0(log.dir, 'test_result.step.', i, '.csv'))
      if ('y.0' %in% colnames(test.result)) {
        if ('y.2' %in% colnames(test.result)) {
          # 3-dim logits
          test.result <- test.result[!is.na(test.result$y.0) & !is.na(test.result$y.1) & !is.na(test.result$y.2),]
          test.logits <- test.result[, c("y.0", "y.1", "y.2")]
          test.logits <- t(apply(as.matrix(test.logits), 1, soft.max))
          # check whether clinvar or gof/lof
          if (-1 %in% test.result$score) {
            test.logits <- test.logits[,3] / (test.logits[,2] + test.logits[,3])
          } else {
            test.logits <- 1 - test.logits[,1]
          }
        } else if ('y.1' %in% colnames(test.result)) {
          test.result <- test.result[!is.na(test.result$y.1),]
          test.logits <- test.result$y.1
        } else {
          test.result <- test.result[!is.na(test.result$y.0),]
          test.logits <- test.result$y.0
        }
      } else {
        test.result <- test.result[!is.na(test.result$y),]
        test.logits <- test.result$y
      }
      result <- plot.AUC(test.result$score, test.logits
                         # paste0(log.dir, 'test_result.step.', i, '.pdf')
                         )
      J_stats <- result$curve[,2] - result$curve[,1]
      optimal.cutoff <- result$curve[which(J_stats==max(J_stats))[1],3]
    } else {
      result <- list(auc=NA)
      optimal.cutoff <- NA
    }
    if (file.exists(paste0(log.dir, 'result_dict.batch.', i, '.ddp_rank.', 0, '.json'))) {
      val_losses <- c()
      train_losses <- c()
      if (configs$ngpus > 1) {
        for (rank in 0:(configs$ngpus-1)) {
          val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.batch.', i,
                                                '.ddp_rank.', rank, '.json'))
          if (!is.null(val_dic$val_loss_y)) {
            val_losses <- c(val_losses, val_dic$val_loss_y)
            train_losses <- c(train_losses, val_dic$train_loss_y)
          } else {
            val_losses <- c(val_losses, val_dic$val_loss)
            train_losses <- c(train_losses, val_dic$train_loss)
          }
        }
      } else {
        rank <- configs$gpu_id
        if (is.null(rank)) {
          rank <- 0
        }
        val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.batch.', i,
                                              '.ddp_rank.', rank, '.json'))
        if (!is.null(val_dic$val_loss_y)) {
          val_losses <- c(val_losses, val_dic$val_loss_y)
          train_losses <- c(train_losses, val_dic$train_loss_y)
        } else {
          val_losses <- c(val_losses, val_dic$val_loss)
          train_losses <- c(train_losses, val_dic$train_loss)
        }
      }
    } else {
      val_losses <- NA
      train_losses <- NA
    }
    if (file.exists(paste0(log.dir, 'test_result.step.', i, '.txt'))) {
      test_dic <- readLines(paste0(log.dir, 'test_result.step.', i, '.txt'), warn = F)
      test_dic <- gsub("'", '"', test_dic)
      test_dic <- jsonlite::fromJSON(test_dic)
      if (!is.null(test_dic$test_loss_y)) {
        test_losses <- test_dic$test_loss_y
      } else {
        test_losses <- test_dic$test_loss
      }
    } else {
      test_losses <- NA
    }
    res <- data.frame(train=mean(train_losses), val=mean(val_losses), test=mean(test_losses),
                      aucs=result$auc, optimal.cutoffs=optimal.cutoff)
    res
  }
  stopCluster(cl)
  res$steps <- steps
  res <- res[!is.na(res$train),]
  val <- res$val
  aucs <- res$aucs
  optimal.cutoffs <- res$optimal.cutoffs
  steps <- res$steps
  train <- res$train
  to.plot <- data.frame(step=rep(steps, 2),
                        loss=c(train, val),
                        auc=rep(aucs, 2),
                        metric_name=c(rep("train_loss", length(steps)),
                                      rep("val_loss", length(steps))))
  # calculate baseline
  if (base.line == "uniprotID") {
    baseline.uniprotID <- system(
      paste0("/share/vault/Users/gz2294/miniconda3/envs/r4-base/bin/python ",
             "/share/pascal/Users/gz2294/PreMode.final/analysis/random.forest.process.classifier.py ",
             configs$data_file_train, " ", configs$data_file_test),
      intern = T
    )
    baseline.auc <- as.numeric(strsplit(baseline.uniprotID, ": ")[[1]][2])
    if (dim(res)[1] > 0) {
      res$baseline.auc <- baseline.auc
    }
  } else if (base.line == "esm") {
    alphabet <- c('', '', '', '', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K',
                  'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '', '')
    data.file.name <- configs$data_type
    fold <- strsplit(configs$data_file_train, "pfams.0.8.seed.")[[1]][2]
    fold <- as.numeric(substr(fold, 1, 1))
    if (is.na(fold)) {
      fold <- 0
    }
    baseline.file <- paste0('/share/pascal/Users/gz2294/PreMode.final/analysis/esm2.inference/',
                            data.file.name, "/testing.fold.", fold, ".logits.csv")
    test.result <- read.csv(configs$data_file_test, row.names = 1)
    if (file.exists(baseline.file)) {
      baseline.res <- read.csv(baseline.file)
      logits <- baseline.res[,2:34]
      colnames(logits) <- alphabet
      score <- c()
      for (k in 1:dim(logits)[1]) {
        score <- c(score, logits[k, test.result$alt[k]] - logits[k, test.result$ref[k]])
      }
      result <- plot.AUC(test.result$score, score)
      if (dim(res)[1] > 0) {
        res$baseline.auc <- result$auc
      }
    }
  }
  library(ggplot2)
  if (is.na(to.plot$auc[1])) {
    p <- ggplot(to.plot, aes(x=step)) +
      geom_line(aes(y=loss, col=metric_name)) +
      scale_x_continuous(breaks = seq(1*configs$num_save_batches,
                                      (num_saved_batches - 1)*configs$num_save_batches,
                                      by = configs$num_save_batches)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  } else {
    p <- ggplot(to.plot, aes(x=step)) +
      geom_line(aes(y=loss, col=metric_name)) +
      geom_line(aes(y=auc)) +
      scale_y_continuous(
        # Features of the first axis
        name = "Loss",
        breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05),
        limits = c(0, max(1.1, max(to.plot$loss))),
        # Add a second axis and specify its features
        sec.axis = sec_axis(~ ., name="AUC",
                            breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05))
      ) +
      scale_x_continuous(breaks = seq(1*configs$num_save_batches,
                                      (num_saved_batches - 1)*configs$num_save_batches,
                                      by = configs$num_save_batches)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  }
  ggsave('Loss.AUC.by.step.pdf', p, width = max(6, min(6 * length(steps) / 50, 20)), height = 4)
  print(paste0("min val step (", steps[which(val==min(val))[1]],
               ") AUC: ", round(aucs[which(val==min(val))[1]], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(val==min(val))[1]], digits = 2)))
  print(paste0("end step (", steps[length(val)],
               ") AUC: ", round(aucs[length(aucs)], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[length(aucs)], digits = 2)))
  print(paste0("max AUC step (", steps[which(aucs==max(aucs))[1]],
               "): ", round(max(aucs), digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(aucs==max(aucs))[1]], digits = 2)))
  res
}
# Variant of get.auc.by.step that additionally reports test AUC split by
# AlphaFold pLDDT region (confident, pLDDT >= 70, vs. low-confidence, pLDDT < 70).
get.auc.by.step.split.pLDDT <- function(configs, base.line="uniprotID") {
  log.dir <- configs$log_dir
  data.train <- as.numeric(strsplit(system(paste0("wc -l ", configs$data_file_train), intern = T),
                                    split = " ")[[1]][1])
  num_saved_batches <- floor(ceiling(data.train * configs$train_size / configs$ngpus / configs$batch_size) *
                               configs$num_epochs / configs$num_save_batches) + 1
  steps <- c(1:(num_saved_batches-1))*configs$num_save_batches
  source('~/Pipeline/uniprot.table.add.annotation.R')
  test.file <- read.csv(configs$data_file_test)
  test.file <- uniprot.table.add.annotation.parallel(test.file, 'pLDDT.region')
  source('/share/pascal/Users/gz2294/Pipeline/AUROC.R')
  library(doParallel)
  cl <- makeCluster(72)
  registerDoParallel(cl)
  res <- foreach (i = 1:length(steps), .combine=rbind) %dopar% {
  # for (i in 1:length(steps)) {
    source('~/Pipeline/AUROC.R')
    i <- steps[i]
    if (file.exists(paste0(log.dir, 'test_result.step.', i, '.csv'))) {
      test.result <- read.csv(paste0(log.dir, 'test_result.step.', i, '.csv'))
      if ('y.0' %in% colnames(test.result)) {
        if ('y.2' %in% colnames(test.result)) {
          # 3-dim logits
          test.result <- test.result[!is.na(test.result$y.0) & !is.na(test.result$y.1) & !is.na(test.result$y.2),]
          test.logits <- test.result[, c("y.0", "y.1", "y.2")]
          test.logits <- t(apply(as.matrix(test.logits), 1, soft.max))
          # check whether clinvar or gof/lof
          if (-1 %in% test.result$score) {
            test.logits <- test.logits[,3] / (test.logits[,2] + test.logits[,3])
          } else {
            test.logits <- 1 - test.logits[,1]
          }
        } else if ('y.1' %in% colnames(test.result)) {
          test.result <- test.result[!is.na(test.result$y.1),]
          test.logits <- test.result$y.1
        } else {
          test.result <- test.result[!is.na(test.result$y.0),]
          test.logits <- test.result$y.0
        }
      } else {
        test.result <- test.result[!is.na(test.result$y),]
        test.logits <- test.result$y
      }
      result <- plot.AUC(test.result$score, test.logits)
      # split AUC by pLDDT region; assumes the NA filters above dropped no rows,
      # so test.file rows still align with test.result rows
      result.1 <- plot.AUC(test.result$score[test.file$pLDDT.region >= 70],
                           test.logits[test.file$pLDDT.region >= 70])
      result.2 <- plot.AUC(test.result$score[test.file$pLDDT.region < 70],
                           test.logits[test.file$pLDDT.region < 70])
      J_stats <- result$curve[,2] - result$curve[,1]
      optimal.cutoff <- result$curve[which(J_stats==max(J_stats))[1],3]
    } else {
      result <- list(auc=NA)
      result.1 <- list(auc=NA)
      result.2 <- list(auc=NA)
      optimal.cutoff <- NA
    }
    if (file.exists(paste0(log.dir, 'result_dict.batch.', i, '.ddp_rank.', 0, '.json'))) {
      val_losses <- c()
      train_losses <- c()
      if (configs$ngpus > 1) {
        for (rank in 0:(configs$ngpus-1)) {
          val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.batch.', i,
                                                '.ddp_rank.', rank, '.json'))
          if (!is.null(val_dic$val_loss_y)) {
            val_losses <- c(val_losses, val_dic$val_loss_y)
            train_losses <- c(train_losses, val_dic$train_loss_y)
          } else {
            val_losses <- c(val_losses, val_dic$val_loss)
            train_losses <- c(train_losses, val_dic$train_loss)
          }
        }
      } else {
        rank <- configs$gpu_id
        if (is.null(rank)) {
          rank <- 0
        }
        val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.batch.', i,
                                              '.ddp_rank.', rank, '.json'))
        if (!is.null(val_dic$val_loss_y)) {
          val_losses <- c(val_losses, val_dic$val_loss_y)
          train_losses <- c(train_losses, val_dic$train_loss_y)
        } else {
          val_losses <- c(val_losses, val_dic$val_loss)
          train_losses <- c(train_losses, val_dic$train_loss)
        }
      }
    } else {
      val_losses <- NA
      train_losses <- NA
    }
    if (file.exists(paste0(log.dir, 'test_result.step.', i, '.txt'))) {
      test_dic <- readLines(paste0(log.dir, 'test_result.step.', i, '.txt'), warn = F)
      test_dic <- gsub("'", '"', test_dic)
      test_dic <- jsonlite::fromJSON(test_dic)
      if (!is.null(test_dic$test_loss_y)) {
        test_losses <- test_dic$test_loss_y
      } else {
        test_losses <- test_dic$test_loss
      }
    } else {
      test_losses <- NA
    }
    res <- data.frame(train=mean(train_losses), val=mean(val_losses), test=mean(test_losses),
                      aucs=result$auc,
                      aucs.low_pLDDT=result.2$auc,
                      aucs.high_pLDDT=result.1$auc,
                      optimal.cutoffs=optimal.cutoff)
    res
  }
  stopCluster(cl)
  res$steps <- steps
  res <- res[!is.na(res$train),]
  val <- res$val
  aucs <- res$aucs
  optimal.cutoffs <- res$optimal.cutoffs
  steps <- res$steps
  train <- res$train
  # negative/positive counts in each pLDDT bin, shown in the plot legend
  result.1.neg <- sum(test.file$pLDDT.region >= 70 & test.file$score == -1)
  result.1.pos <- sum(test.file$pLDDT.region >= 70 & test.file$score == 1)
  result.2.neg <- sum(test.file$pLDDT.region < 70 & test.file$score == -1)
  result.2.pos <- sum(test.file$pLDDT.region < 70 & test.file$score == 1)
  to.plot <- data.frame(step=rep(steps, 2),
                        loss=c(train, val),
                        auc=rep(aucs, 2),
                        auc.pLDDT=c(res$aucs.high_pLDDT, res$aucs.low_pLDDT),
                        auc.name=c(rep(paste0("pLDDT >= 70 (", result.1.neg, "/", result.1.pos, ")"), length(steps)),
                                   rep(paste0("pLDDT < 70 (", result.2.neg, "/", result.2.pos, ")"), length(steps))),
                        metric_name=c(rep("train_loss", length(steps)),
                                      rep("val_loss", length(steps))))
  # calculate baseline
  if (base.line == "uniprotID") {
    baseline.uniprotID <- system(
      paste0("/share/vault/Users/gz2294/miniconda3/envs/r4-base/bin/python ",
             "/share/pascal/Users/gz2294/PreMode.final/analysis/random.forest.process.classifier.py ",
             configs$data_file_train, " ", configs$data_file_test),
      intern = T
    )
    baseline.auc <- as.numeric(strsplit(baseline.uniprotID, ": ")[[1]][2])
    if (dim(res)[1] > 0) {
      res$baseline.auc <- baseline.auc
    }
  } else if (base.line == "esm") {
    alphabet <- c('', '', '', '', 'L', 'A', 'G', 'V', 'S', 'E', 'R', 'T', 'I', 'D', 'P', 'K',
                  'Q', 'N', 'F', 'Y', 'M', 'H', 'W', 'C', 'X', 'B', 'U', 'Z', 'O', '.', '-', '', '')
    data.file.name <- configs$data_type
    fold <- strsplit(configs$data_file_train, "pfams.0.8.seed.")[[1]][2]
    fold <- as.numeric(substr(fold, 1, 1))
    if (is.na(fold)) {
      fold <- 0
    }
    baseline.file <- paste0('/share/pascal/Users/gz2294/PreMode.final/analysis/esm2.inference/',
                            data.file.name, "/testing.fold.", fold, ".logits.csv")
    test.result <- read.csv(configs$data_file_test, row.names = 1)
    if (file.exists(baseline.file)) {
      baseline.res <- read.csv(baseline.file)
      logits <- baseline.res[,2:34]
      colnames(logits) <- alphabet
      score <- c()
      for (k in 1:dim(logits)[1]) {
        score <- c(score, logits[k, test.result$alt[k]] - logits[k, test.result$ref[k]])
      }
      result <- plot.AUC(test.result$score, score)
      if (dim(res)[1] > 0) {
        res$baseline.auc <- result$auc
      }
    }
  }
  library(ggplot2)
  if (is.na(to.plot$auc[1])) {
    p <- ggplot(to.plot, aes(x=step)) +
      geom_line(aes(y=loss, col=metric_name)) +
      scale_x_continuous(breaks = seq(1*configs$num_save_batches,
                                      (num_saved_batches - 1)*configs$num_save_batches,
                                      by = configs$num_save_batches)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  } else {
    p <- ggplot(to.plot, aes(x=step)) +
      geom_line(aes(y=loss, col=metric_name)) +
      geom_line(aes(y=auc)) +
      geom_line(aes(y=auc.pLDDT, col=auc.name)) +
      scale_y_continuous(
        # Features of the first axis
        name = "Loss",
        breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05),
        limits = c(0, max(1.1, max(to.plot$loss))),
        # Add a second axis and specify its features
        sec.axis = sec_axis(~ ., name="AUC",
                            breaks = seq(0, max(1.1, max(to.plot$loss)), by = 0.05))
      ) +
      scale_x_continuous(breaks = seq(1*configs$num_save_batches,
                                      (num_saved_batches - 1)*configs$num_save_batches,
                                      by = configs$num_save_batches)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  }
  # note: writes the same filename as get.auc.by.step
  ggsave('Loss.AUC.by.step.pdf', p, width = max(9, min(9 * length(steps) / 50, 20)), height = 6)
  print(paste0("min val step (", steps[which(val==min(val))[1]],
               ") AUC: ", round(aucs[which(val==min(val))[1]], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(val==min(val))[1]], digits = 2)))
  print(paste0("end step (", steps[length(val)],
               ") AUC: ", round(aucs[length(aucs)], digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[length(aucs)], digits = 2)))
  print(paste0("max AUC step (", steps[which(aucs==max(aucs))[1]],
               "): ", round(max(aucs), digits = 2),
               " Optimal cutoff: ", round(optimal.cutoffs[which(aucs==max(aucs))[1]], digits = 2)))
  res
}
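# Example call (a sketch only: the `configs` fields below are assumptions
# inferred from how these functions read them, with hypothetical values):
# configs <- list(log_dir = "logs/my.run/",
#                 data_file_train = "my.task/pfams.0.8.seed.0/training.csv",
#                 data_file_test = "my.task/pfams.0.8.seed.0/testing.csv",
#                 data_type = "my.task",
#                 train_size = 0.95, ngpus = 1, gpu_id = 0,
#                 batch_size = 32, num_epochs = 10, num_save_batches = 100)
# res <- get.auc.by.step.split.pLDDT(configs, base.line = "esm")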
# Plot train/val loss and per-assay correlation (R) for every epoch; for regression tasks.
get.R.by.epoch <- function(configs, bin=FALSE) {
  log.dir <- configs$log_dir
  epochs <- c(1:configs$num_epochs)
  library(doParallel)
  cl <- makeCluster(72)
  registerDoParallel(cl)
  res <- foreach (i = 1:length(epochs), .combine=dplyr::bind_rows) %dopar% {
    source('/share/pascal/Users/gz2294/Pipeline/AUROC.R')
    i <- epochs[i]
    if (file.exists(paste0(log.dir, 'test_result.epoch.', i, '.csv'))) {
      test.result <- read.csv(paste0(log.dir, 'test_result.epoch.', i, '.csv'))
      # match score columns to prediction columns by sorted name
      score.columns <- colnames(test.result)[grep("^score", colnames(test.result))]
      score.columns <- score.columns[order(score.columns)]
      result.columns <- colnames(test.result)[grep("^y.", colnames(test.result))]
      result.columns <- result.columns[order(result.columns)]
      test.result <- test.result[!is.na(test.result[,result.columns[1]]),]
      result <- plot.R2(test.result[,score.columns], test.result[,result.columns],
                        bin=bin, filename=paste0(log.dir, 'test_result.epoch.', i, '.pdf'))
      # val_losses <- c()
      # train_losses <- c()
      rank <- 0
      val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.epoch.', i-1, '.ddp_rank.', rank, '.json'))
      # val_losses <- c(val_losses, val_dic$val_loss)
      # train_losses <- c(train_losses, val_dic$train_loss)
      res <- data.frame(epochs = i,
                        train_loss = val_dic$train_loss,
                        val_loss = val_dic$val_loss,
                        R2s=t(as.data.frame(result$R2)))
    } else {
      res <- data.frame(epochs = NA, train_loss = NA, val_loss = NA)
    }
    res
  }
  stopCluster(cl)
  res <- res[!is.na(res$train_loss),]
  epochs <- res$epochs
  train <- res$train_loss
  val <- res$val_loss
  R2s <- as.data.frame(res[,startsWith(colnames(res), "R2s")])
  if (dim(R2s)[1] > 0) {
    to.plot <- data.frame(epoch=rep(epochs, 2),
                          loss=c(train, val),
                          R2=c(rowMeans(R2s), rowMeans(R2s)),
                          Rs=rbind(R2s, R2s),
                          metric_name=c(rep("train_loss", length(epochs)),
                                        rep("val_loss", length(epochs))))
    # long format: one (R2, epoch) series per assay
    to.plot.2 <- data.frame()
    for (i in 1:dim(R2s)[2]) {
      to.plot.2 <- rbind(to.plot.2, data.frame(R2=R2s[,i], epoch=epochs, label=paste0('assay.', i)))
    }
    library(ggplot2)
    p <- ggplot(to.plot, aes(x=epoch)) +
      geom_line(aes(y=loss, col=metric_name)) +
      geom_line(data = to.plot.2, aes(y=R2, col=label)) +
      scale_y_continuous(
        # Features of the first axis
        name = "Loss",
        breaks = round(seq(min(to.plot$loss)-0.5, max(to.plot$loss)+0.5, by = 0.1), 1),
        # Add a second axis and specify its features
        sec.axis = sec_axis(~ ., name="R",
                            breaks = round(seq(min(R2s), max(R2s), by = 0.1), 1))
      ) +
      scale_x_continuous(breaks = seq(1, configs$num_epochs, by = 1)) +
      theme_bw() +
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
    print(paste0("min val epoch R: ", round(R2s[which(val==min(val))[1],], digits = 3)))
    print(paste0("end epoch R: ", round(R2s[dim(R2s)[1],], digits = 3)))
    ggsave('Loss.R.by.epoch.pdf', p, width = min(49.9, 9 * length(epochs) / 10), height = 6)
  }
  res
}
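# Example call (a sketch; `configs` as in the AUC helpers above, and `bin` is
# passed through to plot.R2 in AUROC.R):
# res <- get.R.by.epoch(configs, bin = FALSE)
# res[which.min(res$val_loss), ]  # epoch with the lowest validation loss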
# Per-step counterpart of get.R.by.epoch.
get.R.by.step <- function(configs, bin=FALSE) {
  log.dir <- configs$log_dir
  data.train <- as.numeric(strsplit(system(paste0("wc -l ", configs$data_file_train), intern = T),
                                    split = " ")[[1]][1])
  num_saved_batches <- floor(ceiling(floor(data.train * configs$train_size / configs$ngpus) / configs$batch_size) *
                               configs$num_epochs / configs$num_save_batches) + 1
  print(paste0("num batches: ", num_saved_batches))
  steps <- c(1:(num_saved_batches-1)) * configs$num_save_batches
  R2s <- data.frame()
  train <- c()
  val <- c()
  library(doParallel)
  cl <- makeCluster(72)
  registerDoParallel(cl)
  res <- foreach (i = 1:length(steps), .combine = dplyr::bind_rows) %dopar% {
    source('/share/pascal/Users/gz2294/Pipeline/AUROC.R')
    i <- steps[i]
    if (file.exists(paste0(log.dir, 'test_result.step.', i, '.csv'))) {
      test.result <- read.csv(paste0(log.dir, 'test_result.step.', i, '.csv'))
      score.columns <- colnames(test.result)[grep("^score", colnames(test.result))]
      score.columns <- score.columns[order(score.columns)]
      result.columns <- colnames(test.result)[grep("^y.", colnames(test.result))]
      result.columns <- result.columns[order(result.columns)]
      test.result <- test.result[!is.na(test.result[,result.columns[1]]),]
      result <- plot.R2(test.result[,score.columns], test.result[,result.columns],
                        bin=bin, filename=paste0(log.dir, 'test_result.step.', i, '.pdf'))
      val_losses <- c()
      train_losses <- c()
      rank <- 0
      val_dic <- jsonlite::read_json(paste0(log.dir, 'result_dict.batch.', i, '.ddp_rank.', rank, '.json'))
      val_losses <- c(val_losses, val_dic$val_loss)
      train_losses <- c(train_losses, val_dic$train_loss)
      res <- data.frame(train=mean(train_losses), val=mean(val_losses),
                        R2s=t(as.data.frame(result$R2)))
    } else {
      res <- data.frame(train=NA, val=NA, R2s=NA)
    }
  }
  stopCluster(cl)
  res$steps <- steps
  res <- res[!is.na(res$train),]
  train <- res$train
  val <- res$val
  steps <- res$steps
  R2s <- res[,colnames(res)[grep("^R2s", colnames(res))]]
  if (is.null(dim(R2s))) {
    R2s <- as.matrix(R2s)
  }
  to.plot <- data.frame(step=rep(steps, 2),
                        loss=c(train, val),
                        R2=c(rowMeans(R2s), rowMeans(R2s)),
                        Rs=rbind(R2s, R2s),
                        metric_name=c(rep("train_loss", length(steps)),
                                      rep("val_loss", length(steps))))
  to.plot.2 <- data.frame()
  for (i in 1:dim(R2s)[2]) {
    to.plot.2 <- rbind(to.plot.2, data.frame(R2=R2s[,i], step=steps, label=paste0('assay.', i)))
  }
  library(ggplot2)
  p <- ggplot(to.plot, aes(x=step)) +
    geom_line(aes(y=loss, col=metric_name)) +
    geom_line(data = to.plot.2, aes(y=R2, col=label)) +
    scale_y_continuous(
      # Features of the first axis
      name = "Loss",
      breaks = round(seq(min(to.plot$loss)-0.5, max(to.plot$loss)+0.5, by = 0.1), 1),
      # Add a second axis and specify its features
      sec.axis = sec_axis(~ ., name="R",
                          breaks = round(seq(min(R2s), max(R2s), by = 0.1), 1))
    ) +
    scale_x_continuous(breaks = seq(1*configs$num_save_batches,
                                    (num_saved_batches - 1)*configs$num_save_batches,
                                    by = configs$num_save_batches)) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  ggsave('Loss.R.by.step.pdf', p, width = min(max(9 * length(steps) / 100, 9), 49.9), height = 6)
  R2s <- as.data.frame(R2s)
  print(paste0("min val step (", steps[which(val==min(val))[1]], ") R: ", R2s[which(val==min(val))[1],]))
  print(paste0("end step (", steps[length(val)], ") R: ", R2s[length(steps),]))
  print(paste0("max R step (", steps[which(R2s==max(R2s))[1]], "): ", max(R2s)))
  # print(paste0("min val step (", steps[which(val==min(val))[1]],
  #              ") R: ", round(R2s[which(val==min(val))[1],], digits = 3)))
  # print(paste0("end step (", steps[length(val)],
  #              ") R: ", round(R2s[length(steps),], digits = 3)))
  # print(paste0("max R step (", steps[which(R2s==max(R2s))[1]],
  #              "): ", round(max(R2s), digits = 3)))
  res
}
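# Example call (a sketch; `configs` as above, paths and values hypothetical):
# res <- get.R.by.step(configs, bin = TRUE)
# res[which.min(res$val), ]  # saved step with the lowest validation loss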