| import hashlib
|
| import os
|
| import sys
|
| from io import StringIO
|
| from pathlib import Path
|
|
|
| import numpy as np
|
| import pandas as pd
|
| import rpy2.robjects as ro
|
| from rpy2.robjects import pandas2ri
|
| from rpy2.robjects.conversion import localconverter
|
|
|
| from r_functions import export_matrix_to_newick_r, export_similarity_network_r
|
| from usalign_runner import USalignRunner
|
|
|
|
|
def get_TM_mat_from_df(df):
    """Build a pairwise TM-score matrix (as a DataFrame) from USalign output rows.

    The diagonal is initialized to 1.0 (self-alignment). For each result row,
    TM1 is written at [chain2, chain1] and TM2 at [chain1, chain2], so the
    matrix is not necessarily symmetric. Labels are chain names with "/" and
    the ".pdb:A" suffix stripped.
    """
    chains = sorted(set(df["#PDBchain1"]) | set(df["PDBchain2"]))
    index_of = {name: pos for pos, name in enumerate(chains)}
    mat = np.eye(len(chains))

    for _, record in df.iterrows():
        first = record["#PDBchain1"]
        second = record["PDBchain2"]
        # Skip rows referencing chains outside the collected set.
        if first in index_of and second in index_of:
            fi = index_of[first]
            si = index_of[second]
            mat[si, fi] = record["TM1"]
            mat[fi, si] = record["TM2"]

    labels = [name.replace("/", "").replace(".pdb:A", "") for name in chains]
    return pd.DataFrame(mat, columns=labels, index=labels)
|
|
|
|
|
def calculate_md5(files):
    """Return the MD5 hex digest of the concatenated contents of *files*.

    Each entry must expose a ``.name`` path. Entries are processed in a
    stable order (sorted by ``.name``) so the digest does not depend on
    upload order; contents are streamed in 4 KiB chunks.
    """
    digest = hashlib.md5()
    for entry in sorted(files, key=lambda item: item.name):
        with open(entry.name, "rb") as handle:
            while chunk := handle.read(4096):
                digest.update(chunk)
    return digest.hexdigest()
|
|
|
|
|
def save_pdb_files(files, data_dir="./data"):
    """Save uploaded PDB files under ``<data_dir>/<md5>/pdb`` and write a list file.

    The MD5 of the combined file contents names the run directory, so the same
    upload set always maps to the same location and repeated uploads are
    idempotent.

    Args:
        files: uploaded file objects exposing a ``.name`` path (e.g. Gradio
            temp files); may be empty/None.
        data_dir: root directory for run data.

    Returns:
        A newline-joined, human-readable status report.
    """
    if not files:
        return "No files uploaded"

    data_path = Path(data_dir)
    md5_hash = calculate_md5(files)

    # Create <data_dir>/<md5>/pdb in one call. exist_ok/parents replaces the
    # previous try/except-pass pairs, which silently swallowed *every* mkdir
    # failure (including permission errors), not just "already exists".
    pdb_dir = data_path / md5_hash / "pdb"
    pdb_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {pdb_dir}")

    list_file = data_path / md5_hash / "pdb_list"

    filenames = []
    results = []
    for file in files:
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)

        target_path = pdb_dir / original_filename
        print(f"Saving to: {target_path}")

        # Paired context managers ensure the source handle is closed; the old
        # code leaked the handle from open(file.name, "rb").read().
        with open(file.name, "rb") as src, open(target_path, "wb") as dst:
            dst.write(src.read())
        results.append(f"Saved {original_filename}")

    # Record the original filenames so downstream steps (USalign) know which
    # structures belong to this run.
    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
|
|
|
|
|
def run_usalign(md5_hash):
    """Run USalign on the uploaded PDB files and return results as a DataFrame.

    Args:
        md5_hash: run identifier; inputs are read from ``./data/<md5>/pdb``
            with the file list at ``./data/<md5>/pdb_list``.

    Returns:
        A DataFrame parsed from USalign's tab-separated stdout on success,
        otherwise a single-column ``{"Error": [...]}`` DataFrame.
    """
    # Pre-initialize so the except handler below can always reference it.
    # Previously, an exception raised before run_alignment() returned caused
    # a NameError on `stderr` inside the handler, masking the real error.
    stderr = ""
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(pdb_dir)
        print(list_file)

        return_code, stdout, stderr = runner.run_alignment(
            target_dir=str(pdb_dir), pdb_list_file=str(list_file)
        )
        print(stdout)
        print(stderr)

        if return_code != 0:
            return pd.DataFrame({"Error": [stderr]})

        df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
        # USalign pads column headers with whitespace; normalize them.
        df.columns = [col.strip() for col in df.columns]
        return df
    except Exception as e:
        return pd.DataFrame({"Error": [e, stderr]})
|
|
|
|
|
def run_community_analysis(results_df, data_dir, md5_hash, threshold):
    """Run the community-analysis pipeline and return its artifacts.

    Builds the TM-score matrix from the USalign results, exports a Newick
    tree and a similarity network through the R helpers, and writes the
    matrix to CSV.

    Args:
        results_df: USalign output table (columns used by get_TM_mat_from_df).
        data_dir: root data directory; output goes to ``<data_dir>/<md5_hash>``.
            (This parameter is now honored — paths were previously hard-coded
            to "data".)
        md5_hash: run identifier naming the output subdirectory.
        threshold: similarity cutoff forwarded to the R network export.

    Returns:
        dict with "tm_matrix" (DataFrame), "newick_str" (str) and "files"
        (list of written paths), or {"Error": message} on failure.
    """
    try:
        tm_matrix = get_TM_mat_from_df(results_df)

        run_dir = os.path.join(data_dir, md5_hash)
        tm_file = os.path.join(run_dir, "tm_matrix.csv")
        newick_file = os.path.join(run_dir, "clustering.newick")
        network_edges_file = os.path.join(run_dir, "network_cytoscape_export.xlsx")

        # Convert the pandas matrix into an R data frame for the R helpers.
        with localconverter(ro.default_converter + pandas2ri.converter):
            r_tm_matrix = ro.conversion.py2rpy(tm_matrix)

        result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
        newick_str = result[0]  # R returns a character vector; take the scalar.

        export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)

        tm_matrix.to_csv(tm_file)

        return {
            "tm_matrix": tm_matrix,
            "newick_str": newick_str,
            "files": [
                tm_file,
                newick_file,
                network_edges_file,
            ],
        }
    except Exception as e:
        print("Error", str(e))
        return {"Error": str(e)}
|
|
|