import os
import subprocess
import sys
import shutil

import numpy as np
import pandas as pd
from Bio import SeqIO

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.data.config import CLUSTER

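# NOTE: make_fasta, run_mmseqs_clustering, analyze_clustering_result, and
# cluster_summary are presumably provided by the package's clustering utilities.
# Minimal sketches are included below as assumptions so the script reads end to
# end; swap in the package's own implementations where they exist.

def make_fasta(sequences, fasta_path):
    """Sketch: write a {seq_id: aa_seq} dict to a FASTA file and return its path."""
    with open(fasta_path, "w") as f:
        for seq_id, seq in sequences.items():
            f.write(f">{seq_id}\n{seq}\n")
    return fasta_path
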
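# Sketch of the MMseqs2 wrapper (assumption): shells out to `mmseqs easy-cluster`,
# which writes <output_dir>/mmseqs_cluster.tsv among its outputs.
def run_mmseqs_clustering(fasta_path, output_dir, min_seq_id, c, cov_mode, path_to_mmseqs):
    """Sketch: cluster the FASTA with MMseqs2 easy-cluster at the given identity,
    coverage, and coverage-mode settings."""
    os.makedirs(output_dir, exist_ok=True)
    tmp_dir = os.path.join(output_dir, "tmp")
    cmd = [
        path_to_mmseqs, "easy-cluster",
        fasta_path,
        os.path.join(output_dir, "mmseqs"),  # output prefix -> mmseqs_cluster.tsv, ...
        tmp_dir,
        "--min-seq-id", str(min_seq_id),
        "-c", str(c),
        "--cov-mode", str(cov_mode),
    ]
    log_update(f"\nRunning MMseqs2 clustering:\n\t{' '.join(cmd)}")
    subprocess.run(cmd, check=True)
    shutil.rmtree(tmp_dir, ignore_errors=True)  # remove MMseqs2 scratch directory
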
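# Sketch of the post-processing step (assumption): joins the MMseqs2 cluster TSV
# (representative, member) with the input sequences into one DataFrame. Column
# names here are illustrative, not the package's canonical schema.
def analyze_clustering_result(fasta_path, cluster_tsv_path):
    """Sketch: return a DataFrame mapping each member sequence to its cluster representative."""
    seqs = {record.id: str(record.seq) for record in SeqIO.parse(fasta_path, "fasta")}
    clusters = pd.read_csv(
        cluster_tsv_path, sep="\t", header=None,
        names=["representative_seq_id", "member_seq_id"],
    )
    clusters["member_seq"] = clusters["member_seq_id"].map(seqs)
    clusters["member_length"] = clusters["member_seq"].str.len()
    return clusters
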
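# Sketch of the summary step (assumption): logs basic statistics about cluster
# sizes, using the illustrative column names from analyze_clustering_result above.
def cluster_summary(clusters):
    """Sketch: log the number of clusters, their mean/max size, and singleton count."""
    sizes = clusters.groupby("representative_seq_id").size()
    log_update(
        "\nCluster summary:"
        f"\n\tTotal sequences clustered: {len(clusters)}"
        f"\n\tNumber of clusters: {len(sizes)}"
        f"\n\tMean cluster size: {sizes.mean():.2f}"
        f"\n\tLargest cluster: {sizes.max()}"
        f"\n\tSingleton clusters: {int((sizes == 1).sum())}"
    )
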

def main():
    # Unpack clustering parameters from the config
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
    MAX_SEQ_LENGTH = CLUSTER.MAX_SEQ_LENGTH

    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # Set up the output directory structure for clustering results
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"

        # Load the input sequences and drop anything longer than the length cutoff
        sequences = pd.read_csv(INPUT_PATH)
        log_update(f"\nPreparing input data...\n\tInitial dataset size: {len(sequences)} sequences")

        sequences = sequences.loc[sequences['aa_seq'].str.len() <= MAX_SEQ_LENGTH].reset_index(drop=True)
        log_update(f"\tApplied length cutoff of {MAX_SEQ_LENGTH}AAs. New dataset size: {len(sequences)} sequences")

        # Write the filtered sequences to FASTA for MMseqs2 (see make_fasta sketch above)
        sequences = dict(zip(sequences['seq_id'], sequences['aa_seq']))
        fasta_path = make_fasta(sequences, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        # Cluster with MMseqs2 (see run_mmseqs_clustering sketch above)
        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, path_to_mmseqs=PATH_TO_MMSEQS)

        # Combine the raw MMseqs2 output with the input sequences and summarize
        clusters = analyze_clustering_result(fasta_path, os.path.join(output_dir, "mmseqs_cluster.tsv"))
        clusters.to_csv('clustering/mmseqs_full_results.csv', index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)


if __name__ == "__main__":
    main()