import pandas as pd
import os
import subprocess
import sys
import shutil
from Bio import SeqIO

from fuson_plm.utils.logging import open_logfile, log_update
|
def ensure_mmseqs_in_path(mmseqs_dir):
    """
    Checks whether MMseqs2 is on the PATH; if it is not, prepends mmseqs_dir. MMseqs2 will not run if this is not done correctly.

    Args:
        mmseqs_dir (str): Directory containing MMseqs2 binaries
    """
    if shutil.which('mmseqs') is None:
        os.environ['PATH'] = f"{mmseqs_dir}:{os.environ['PATH']}"
        log_update(f"\tAdded {mmseqs_dir} to PATH")
|
def process_fasta(fasta_path):
    """
    Parses a fasta file into a dictionary mapping each sequence ID to its sequence.

    Args:
        fasta_path (str): path to the fasta file

    Returns:
        dict: A dictionary where the key is the sequence ID and the value is the sequence.
    """
    d = {}
    # Use a context manager so the file handle is closed after parsing
    with open(fasta_path) as handle:
        for fasta in SeqIO.parse(handle, 'fasta'):
            seq_id, sequence = fasta.id, str(fasta.seq)
            d[seq_id] = sequence
    return d
|
def analyze_clustering_result(input_fasta: str, tsv_path: str):
    """
    Joins the MMseqs2 cluster assignments with the sequences from the input fasta file.

    Args:
        input_fasta (str): path to input fasta file
        tsv_path (str): path to the cluster .tsv file written by MMseqs2 easy-cluster

    Returns:
        pd.DataFrame: one row per cluster member, with the representative and member sequence IDs and their sequences.
    """
    # Map each sequence ID in the input fasta to its sequence
    input_d = process_fasta(input_fasta)

    # The cluster .tsv has two unnamed columns: representative sequence ID, member sequence ID
    clusters = pd.read_csv(tsv_path, sep='\t', header=None)
    clusters = clusters.rename(columns={
        0: 'representative seq_id',
        1: 'member seq_id'
    })

    # Attach the actual sequences for both the representative and the member
    clusters['representative seq'] = clusters['representative seq_id'].apply(lambda seq_id: input_d[seq_id])
    clusters['member seq'] = clusters['member seq_id'].apply(lambda seq_id: input_d[seq_id])

    clusters = clusters.sort_values(by=['representative seq_id', 'member seq_id'], ascending=True).reset_index(drop=True)

    return clusters
|
def make_fasta(sequences: dict, fasta_path: str):
    """
    Makes a fasta file from sequences, where the key is the header and the value is the sequence.

    Args:
        sequences (dict): A dictionary where the key is the header and the value is the sequence.
        fasta_path (str): Path where the fasta file will be written.

    Returns:
        str: The path to the fasta file.
    """
    with open(fasta_path, 'w') as f:
        for header, sequence in sequences.items():
            f.write(f'>{header}\n{sequence}\n')

    return fasta_path
|
def run_mmseqs_clustering(input_fasta, output_dir, min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0, path_to_mmseqs='fuson_plm/mmseqs'):
    """
    Runs MMseqs2 clustering using the easy-cluster module.

    Args:
        input_fasta (str): path to input fasta file, formatted >header\nsequence\n>header\nsequence...
        output_dir (str): path to output directory for clustering results
        min_seq_id (float): number in [0,1] representing --min-seq-id in the cluster command
        c (float): number in [0,1] representing -c in the cluster command
        cov_mode (int): 0, 1, 2, or 3, representing --cov-mode in the cluster command
        cluster_mode (int): 0, 1, or 2, representing --cluster-mode in the cluster command
        path_to_mmseqs (str): path to the MMseqs2 installation directory; its bin/ subdirectory holds the binaries
    """
    log_update("\nRunning MMSeqs clustering...")

    # Locate the MMseqs2 bin/ directory inside the installation and make sure it is on PATH
    mmseqs_dir = os.path.join(path_to_mmseqs[0:path_to_mmseqs.index('/mmseqs')], 'mmseqs/bin')
    ensure_mmseqs_in_path(mmseqs_dir)
    mmseqs_bin = "mmseqs"

    os.makedirs(output_dir, exist_ok=True)

    # easy-cluster arguments: input fasta, output prefix, tmp directory, then clustering options
    cmd_easy_cluster = [
        mmseqs_bin, "easy-cluster", input_fasta, os.path.join(output_dir, "mmseqs"), output_dir,
        "--min-seq-id", str(min_seq_id),
        "-c", str(c),
        "--cov-mode", str(cov_mode),
        "--cluster-mode", str(cluster_mode),
        "--dbtype", "1"
    ]

    log_update("\n\tCommand entered to MMSeqs2:")
    log_update("\t" + " ".join(cmd_easy_cluster) + "\n")

    subprocess.run(cmd_easy_cluster, check=True)

    log_update(f"Clustering completed. Results are in {output_dir}")
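
# Example (illustrative; the fasta path and output directory below are hypothetical):
#   run_mmseqs_clustering("clustering_input.fasta", "clustering_output",
#                         min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0,
#                         path_to_mmseqs="fuson_plm/mmseqs")
# would assemble and run a command along the lines of:
#   mmseqs easy-cluster clustering_input.fasta clustering_output/mmseqs clustering_output \
#       --min-seq-id 0.3 -c 0.8 --cov-mode 0 --cluster-mode 0 --dbtype 1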
|
|
|
def cluster_summary(clusters: pd.DataFrame):
    """
    Summarizes the clustering results: how many clusters were formed, how large they are, and what fraction of sequences fall into singleton vs. multi-member clusters.
    """
    # One row per cluster, with the number of members in that cluster
    grouped_clusters = clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id': 'member count'})
    assert len(grouped_clusters) == len(clusters['representative seq_id'].unique())

    total_seqs = sum(grouped_clusters['member count'])
    log_update(f"Created {len(grouped_clusters)} clusters of {total_seqs} sequences")
    log_update(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']==1])} clusters of size 1")
    csize1_seqs = sum(grouped_clusters[grouped_clusters['member count']==1]['member count'])
    log_update(f"\t\tsequences: {csize1_seqs} ({round(100*csize1_seqs/total_seqs, 2)}%)")

    log_update(f"\t{len(grouped_clusters.loc[grouped_clusters['member count']>1])} clusters of size > 1")
    csizeg1_seqs = sum(grouped_clusters[grouped_clusters['member count']>1]['member count'])
    log_update(f"\t\tsequences: {csizeg1_seqs} ({round(100*csizeg1_seqs/total_seqs, 2)}%)")
    log_update(f"\tlargest cluster: {max(grouped_clusters['member count'])}")

    log_update("\nCluster size breakdown below...")

    # value_counts() gives (cluster size, number of clusters of that size); rename the columns
    # positionally so this works regardless of the column names pandas assigns after reset_index()
    value_counts = grouped_clusters['member count'].value_counts().reset_index()
    value_counts.columns = ['cluster size (n_members)', 'n_clusters']
    log_update(value_counts.sort_values(by='cluster size (n_members)', ascending=True).to_string(index=False))
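

# A minimal end-to-end sketch of how these helpers fit together. It is illustrative only:
# the sequences, file names, and output directory are hypothetical placeholders, and it
# assumes a local MMseqs2 installation at the default path_to_mmseqs ('fuson_plm/mmseqs').
# Depending on how fuson_plm.utils.logging is set up, you may also want to wrap these calls
# in open_logfile(...) so that log_update writes to a log file.
if __name__ == "__main__":
    # Hypothetical input: a few toy sequences keyed by sequence ID
    example_sequences = {
        "seq1": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ",
        "seq2": "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEK",
        "seq3": "MNPLLILTFVAAALA",
    }

    # Write the sequences to a fasta file and cluster them
    input_fasta = make_fasta(example_sequences, "example_input.fasta")
    output_dir = "example_clustering_output"
    run_mmseqs_clustering(input_fasta, output_dir, min_seq_id=0.3, c=0.8, cov_mode=0, cluster_mode=0)

    # easy-cluster writes its cluster assignments to <output prefix>_cluster.tsv,
    # i.e. mmseqs_cluster.tsv inside output_dir given the prefix used above
    clusters = analyze_clustering_result(input_fasta, os.path.join(output_dir, "mmseqs_cluster.tsv"))
    cluster_summary(clusters)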