import os
import subprocess
import sys
import shutil

import numpy as np
import pandas as pd
from Bio import SeqIO

from fuson_plm.utils.logging import open_logfile, log_update
from fuson_plm.data.config import CLUSTER

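# NOTE: make_fasta, run_mmseqs_clustering, analyze_clustering_result, and
# cluster_summary are presumably provided by the package's clustering utilities.
# Minimal sketches are included below as assumptions so the script reads end to
# end; swap in the package's own implementations where they exist.

def make_fasta(sequences, fasta_path):
    """Sketch: write a {seq_id: aa_seq} dict to a FASTA file and return its path."""
    with open(fasta_path, "w") as f:
        for seq_id, seq in sequences.items():
            f.write(f">{seq_id}\n{seq}\n")
    return fasta_path
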
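# Sketch of the MMseqs2 wrapper (assumption): shells out to `mmseqs easy-cluster`,
# which writes <output_dir>/mmseqs_cluster.tsv among its outputs.
def run_mmseqs_clustering(fasta_path, output_dir, min_seq_id, c, cov_mode, path_to_mmseqs):
    """Sketch: cluster the FASTA with MMseqs2 easy-cluster at the given identity,
    coverage, and coverage-mode settings."""
    os.makedirs(output_dir, exist_ok=True)
    tmp_dir = os.path.join(output_dir, "tmp")
    cmd = [
        path_to_mmseqs, "easy-cluster",
        fasta_path,
        os.path.join(output_dir, "mmseqs"),  # output prefix -> mmseqs_cluster.tsv, ...
        tmp_dir,
        "--min-seq-id", str(min_seq_id),
        "-c", str(c),
        "--cov-mode", str(cov_mode),
    ]
    log_update(f"\nRunning MMseqs2 clustering:\n\t{' '.join(cmd)}")
    subprocess.run(cmd, check=True)
    shutil.rmtree(tmp_dir, ignore_errors=True)  # remove MMseqs2 scratch directory
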
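# Sketch of the post-processing step (assumption): joins the MMseqs2 cluster TSV
# (representative, member) with the input sequences into one DataFrame. Column
# names here are illustrative, not the package's canonical schema.
def analyze_clustering_result(fasta_path, cluster_tsv_path):
    """Sketch: return a DataFrame mapping each member sequence to its cluster representative."""
    seqs = {record.id: str(record.seq) for record in SeqIO.parse(fasta_path, "fasta")}
    clusters = pd.read_csv(
        cluster_tsv_path, sep="\t", header=None,
        names=["representative_seq_id", "member_seq_id"],
    )
    clusters["member_seq"] = clusters["member_seq_id"].map(seqs)
    clusters["member_length"] = clusters["member_seq"].str.len()
    return clusters
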
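# Sketch of the summary step (assumption): logs basic statistics about cluster
# sizes, using the illustrative column names from analyze_clustering_result above.
def cluster_summary(clusters):
    """Sketch: log the number of clusters, their mean/max size, and singleton count."""
    sizes = clusters.groupby("representative_seq_id").size()
    log_update(
        "\nCluster summary:"
        f"\n\tTotal sequences clustered: {len(clusters)}"
        f"\n\tNumber of clusters: {len(sizes)}"
        f"\n\tMean cluster size: {sizes.mean():.2f}"
        f"\n\tLargest cluster: {sizes.max()}"
        f"\n\tSingleton clusters: {int((sizes == 1).sum())}"
    )
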

def main():
    # Unpack clustering parameters from the config
    LOG_PATH = "clustering_log.txt"
    INPUT_PATH = CLUSTER.INPUT_PATH
    MIN_SEQ_ID = CLUSTER.MIN_SEQ_ID
    C = CLUSTER.C
    COV_MODE = CLUSTER.COV_MODE
    PATH_TO_MMSEQS = CLUSTER.PATH_TO_MMSEQS
    MAX_SEQ_LENGTH = CLUSTER.MAX_SEQ_LENGTH

    with open_logfile(LOG_PATH):
        log_update("Input params from config.py:")
        CLUSTER.print_config(indent='\t')

        # Set up the output directory structure for clustering results
        os.makedirs("clustering", exist_ok=True)
        output_dir = "clustering/raw_output"

        # Load the input sequences and drop anything longer than the length cutoff
        sequences = pd.read_csv(INPUT_PATH)
        log_update(f"\nPreparing input data...\n\tInitial dataset size: {len(sequences)} sequences")

        sequences = sequences.loc[sequences['aa_seq'].str.len() <= MAX_SEQ_LENGTH].reset_index(drop=True)
        log_update(f"\tApplied length cutoff of {MAX_SEQ_LENGTH}AAs. New dataset size: {len(sequences)} sequences")

        # Write the filtered sequences to FASTA for MMseqs2 (see make_fasta sketch above)
        sequences = dict(zip(sequences['seq_id'], sequences['aa_seq']))
        fasta_path = make_fasta(sequences, "clustering/input.fasta")
        log_update(f"\tMade fasta of input sequences, saved at {fasta_path}")

        # Cluster with MMseqs2 (see run_mmseqs_clustering sketch above)
        run_mmseqs_clustering(fasta_path, output_dir, min_seq_id=MIN_SEQ_ID, c=C, cov_mode=COV_MODE, path_to_mmseqs=PATH_TO_MMSEQS)

        # Combine the raw MMseqs2 output with the input sequences and summarize
        clusters = analyze_clustering_result(fasta_path, os.path.join(output_dir, "mmseqs_cluster.tsv"))
        clusters.to_csv('clustering/mmseqs_full_results.csv', index=False)
        log_update("Processed and combined mmseqs output. Wrote comprehensive results to clustering/mmseqs_full_results.csv")
        cluster_summary(clusters)


if __name__ == "__main__":
    main()