Xue-Jun committed on
Commit
9b9c66d
·
1 Parent(s): 06fdf1f

first commit

Browse files
Files changed (6) hide show
  1. Dockerfile +66 -0
  2. app.py +102 -0
  3. r_functions.py +152 -0
  4. requirements.txt +9 -0
  5. usalign_runner.py +68 -0
  6. utils.py +280 -0
Dockerfile ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM docker.io/ubuntu:22.04

ENV TZ=Asia/Shanghai
ENV R_REMOTE_ERR=1
# Suppress interactive prompts (tzdata etc.) during apt installs.
ENV DEBIAN_FRONTEND=noninteractive

# Build tooling and helpers needed to compile USalign and add the CRAN repo.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        software-properties-common \
        dirmngr \
        wget \
        unzip \
        make \
        lsb-release && \
    rm -rf /var/lib/apt/lists/*

# Runtime utilities + Python toolchain.
# BUG FIX: Ubuntu 22.04 (jammy) has no "python3.9" package in its default
# repositories (jammy ships python3.10), so the original install and the
# /usr/bin/python3.9 symlinks failed. Use the distro's python3.10 instead.
RUN apt-get update -y \
    && apt-get -y install iputils-ping \
    && apt-get -y install wget \
    && apt-get -y install net-tools \
    && apt-get -y install vim \
    && apt-get -y install openssh-server \
    && apt-get -y install python3.10 \
    && apt-get -y install python3-pip \
    && apt-get -y install git \
    && cd /usr/local/bin \
    && rm -f python \
    && rm -f python3 \
    && rm -f pip \
    && rm -f pip3 \
    && ln -s /usr/bin/python3.10 python \
    && ln -s /usr/bin/python3.10 python3 \
    && ln -s /usr/bin/pip3 pip \
    && ln -s /usr/bin/pip3 pip3 \
    && apt-get clean

# Native libraries required to build R and the pinned R packages.
RUN apt install -y libpcre2-dev libdeflate-dev liblzma-dev libbz2-dev libblas-dev gfortran libicu-dev liblapack-dev libxml2-dev
RUN apt install --no-install-recommends software-properties-common dirmngr
# Register the CRAN apt repository to get an up-to-date R 4.x build.
RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
RUN apt install --no-install-recommends r-base -y
# Pin the R packages used by r_functions.py (ape / igraph / openxlsx).
RUN R -e "install.packages('remotes', repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('ape', version = '5.8.1', dependencies = TRUE, repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('igraph', version = '2.1.4', dependencies = TRUE, repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('openxlsx', version = '4.2.8', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \
    rm -rf /tmp/Rtmp*

# Run as an unprivileged uid-1000 user (commonly required by hosted Spaces
# platforms -- confirm against the deployment target).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app
# Build USalign from source; usalign_runner.py resolves the binary path via config.yaml.
RUN git clone https://github.com/pylelab/USalign.git
WORKDIR /app/USalign
RUN make
WORKDIR /app
RUN chmod -R 777 /app
COPY --chmod=777 ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# NOTE(review): the pins below override versions from requirements.txt
# (gradio 4.44.1 -> 5.44.1, pandas 2.2.3 -> 1.5.3) -- confirm this is intended.
RUN pip install rpy2==3.2.0
RUN pip install pandas==1.5.3
RUN pip install numpy==1.25.0
RUN pip install gradio==5.44.1
COPY --chmod=777 . /app
ENV MPLCONFIGDIR="/home/user/.config/matplotlib"
CMD ["python3", "app.py"]
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import os
import sys
from io import StringIO
from pathlib import Path

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd

from usalign_runner import USalignRunner
from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files

# Opt out of Gradio telemetry before the UI is built.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

# ---------------------------------------------------------------------------
# UI definition
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# This is a Temp Title")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDB Files",
            file_count="multiple",
            file_types=[".pdb"],
        )

    # Shows 5 rows; scrolls rather than growing past max_lines.
    output = gr.Textbox(label="Upload Results", lines=5, max_lines=5, container=True)

    threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Threshold")

    with gr.Row():
        submit_btn = gr.Button("Upload Files")
        run_usalign_btn = gr.Button("Run USalign")
        community_btn = gr.Button("Run Community")

    # The md5 of the current upload batch identifies its data directory.
    md5_hash = gr.State("")

    with gr.Tab("USalign Results"):
        results_df = gr.DataFrame(
            label="USalign Results",
            wrap=True,
        )
    with gr.Tab("TM Matrix"):
        tm_matrix_output = gr.DataFrame(label="TM Matrix", wrap=True, show_label=True)
    with gr.Tab("Newick Tree"):
        newick_output = gr.Textbox(label="Newick Tree", lines=5, max_lines=10, container=True)
    # with gr.Tab("Structure Similarity Network"):
    #     network_plot = gr.Plot(label="Structure Similarity Network")

    # Single row hosting all downloadable result files.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Download Results")
            download_tm = gr.File(label="Download Files")

    submit_btn.click(fn=save_pdb_files, inputs=[file_input], outputs=output)

    def update_md5_hash(files):
        """Recompute the batch hash whenever the upload selection changes."""
        return calculate_md5(files) if files else ""

    file_input.change(fn=update_md5_hash, inputs=[file_input], outputs=[md5_hash])

    run_usalign_btn.click(fn=run_usalign, inputs=[md5_hash], outputs=[results_df])

    def process_community_analysis(results_df, md5_hash, threshold):
        """Run the R-backed community pipeline; yield a None-triple on any failure."""
        if results_df.empty:
            return None, None, None

        results = run_community_analysis(results_df, "./data", md5_hash, threshold)
        if "Error" in results:
            return None, None, None

        return (
            results["tm_matrix"],
            results["newick_str"],
            # results["network_fig"],
            results["files"],
        )

    community_btn.click(
        fn=process_community_analysis,
        inputs=[results_df, md5_hash, threshold],
        outputs=[
            tm_matrix_output,
            newick_output,
            # network_plot,
            download_tm,
        ],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
r_functions.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from rpy2.robjects import pandas2ri, r, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
import os


# Globally enable automatic pandas <-> R data.frame conversion.
pandas2ri.activate()

# Import the required R packages
stats = importr('stats')
ape = importr('ape')
# igraph exports a '.env' symbol that is not a valid Python name; rename it.
igraph = importr('igraph', robject_translations={'.env': '_env_'})
openxlsx = importr('openxlsx')
# dplyr = importr('dplyr')

def get_r_matrix(df):
    """Convert a pandas DataFrame into the equivalent R object (data.frame)."""
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_tm_matrix = ro.conversion.py2rpy(df)
    return r_tm_matrix


# R function handle: hierarchically clusters the TM matrix (Ward.D2 on
# euclidean distances), writes the tree to output_file and returns the
# Newick string. (R source kept verbatim; its inline comments are in Chinese.)
export_matrix_to_newick_r = ro.r("""
convert_to_newick <- function(tm_matrix, output_file) {
# 导入 ape 包
if (!require(ape, quietly = TRUE)) {
install.packages("ape", repos = "https://cran.r-project.org")
library(ape)
}

# 计算距离矩阵
dist_matrix <- dist(tm_matrix)

# 层次聚类
hclust_tree <- hclust(dist_matrix, method = "ward.D2")

# 转为 phylo 对象
phylo_tree <- as.phylo(hclust_tree)

# 导出为 Newick 格式
write.tree(phylo_tree, file = output_file)

newick_str <- write.tree(phylo_tree)

return(newick_str)
}
""")

# R function handle: builds an undirected similarity graph from entries of
# tm_matrix >= threshold (self-loops removed), runs fastgreedy community
# detection, and writes a Cytoscape edge workbook (excel_path, plus a second
# fixed-name copy in the working directory) and a cluster-assignment CSV
# (csv_path). Returns list(cluster_data=..., graph=...).
export_similarity_network_r = ro.r("""
create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
# 导入必要的包
if (!require(igraph, quietly = TRUE)) {
install.packages("igraph", repos = "https://cran.r-project.org")
library(igraph)
}
if (!require(openxlsx, quietly = TRUE)) {
install.packages("openxlsx", repos = "https://cran.r-project.org")
library(openxlsx)
}

# 根据相似性阈值创建边缘列表,并过滤掉自环
overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]

# 创建空的图形对象
graph <- graph.empty()

# 添加节点
nodes <- rownames(tm_matrix)
graph <- add_vertices(graph, nv = length(nodes), name = nodes)

# 添加边
for (i in 1:nrow(overthresholdedges)) {
graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
}

# 转换为无向图
graph <- as.undirected(graph, mode = "collapse")

# 计算聚类
clusters <- fastgreedy.community(graph)

# 获取每个聚类的大小
cluster_sizes <- sizes(clusters)

# 按聚类大小降序排序
sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]

# 获取每个聚类的成员
cluster_members <- membership(clusters)

# 找到孤立节点
singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])

# 创建Cytoscape导出文件
cytoscape_export <- createWorkbook()

# 创建边Sheet
addWorksheet(cytoscape_export, sheetName = "Edges")
writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)

# 获取边列表
edges <- get.edgelist(graph)

# 填充边Sheet数据
if (nrow(edges) > 0) {
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
}

# 找到当前边Sheet的最后一行
last_edge_row <- nrow(edges) + 1

# 添加孤立节点
if (length(singleton_nodes) > 0) {
writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
}

# 保存Excel文件
saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)

saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)

# 创建一个空的数据框用于储存节点和聚类信息
export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)

# 遍历 sorted_clusters
cluster_index <- 1 # 初始化簇索引
for (cluster_name in names(sorted_clusters)) {
proteins <- sorted_clusters[[cluster_name]]
# 将每个 protein 和对应的 cluster_name 添加到数据框
for (protein in proteins) {
# 检查 protein 是否在 singleton_nodes 中
if (protein %in% singleton_nodes) {
current_cluster_name <- "singleton" # 修改为 "singleton"
} else {
current_cluster_name <- as.character(cluster_index) # 使用簇索引
}
export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
}
cluster_index <- cluster_index + 1 # 索引加1
}

write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
# 返回聚类结果
return(list(cluster_data = export_clusters, graph = graph))
}
""")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ # fastcluster==1.1.28
3
+ networkx==2.7
4
+ python-louvain==0.16
5
+ pandas==2.2.3
6
+ matplotlib==3.9.4
7
+ scipy==1.13.1
8
+ biopython==1.79
9
+ httpx[socks]
usalign_runner.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import os
from typing import List, Optional
from pathlib import Path
import yaml

class USalignRunner:
    """Thin wrapper around the USalign executable, configured via a YAML file."""

    def __init__(self, config_path: str = "config.yaml"):
        """
        Initialize USalignRunner with parameters from config file.

        Args:
            config_path (str): Path to the configuration file. Must contain a
                'USalign' mapping with 'path', 'tmscore' and 'outfmt' keys.

        Raises:
            FileNotFoundError: if the configured USalign binary does not exist.
        """
        with open(config_path, 'r', encoding="utf-8") as f:
            config = yaml.safe_load(f)

        self.usalign_path = Path(config['USalign']['path'])
        self.default_params = {
            'tmscore': config['USalign']['tmscore'],
            'outfmt': config['USalign']['outfmt'],
            'mol': 'protein'  # Default to protein alignment
        }

        if not self.usalign_path.exists():
            raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")

    def run_alignment(
        self,
        target_dir: str,
        pdb_list_file: str,
        tmscore: Optional[float] = None,
        outfmt: Optional[int] = None,
    ) -> tuple[int, str, str]:
        """Run all-vs-all alignment over the structures listed in pdb_list_file.

        Args:
            target_dir: directory containing the PDB files (passed to -dir).
            pdb_list_file: file listing the PDB file names, one per line.
            tmscore: override for the configured -TMscore value.
            outfmt: override for the configured -outfmt value.

        Returns:
            (returncode, stdout, stderr); (-1, "", message) if launching failed.
        """
        tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
        outfmt = outfmt if outfmt is not None else self.default_params['outfmt']

        cmd = [
            str(self.usalign_path),
            "-mol", self.default_params['mol'],
            "-dir", str(target_dir),
            pdb_list_file,
            "-TMscore", str(tmscore),
            "-outfmt", str(outfmt),
        ]

        try:
            # BUG FIX: the original joined the argv list into one string and ran
            # it with shell=True, which breaks on paths containing spaces and
            # allows shell metacharacters through. An argv list with the default
            # shell=False passes every argument verbatim.
            completed = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            return completed.returncode, completed.stdout, completed.stderr
        except Exception as e:
            # Keep the original best-effort contract: never raise, signal via -1.
            return -1, "", str(e)
utils.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ # import fastcluster
4
+ import networkx as nx
5
+ from community import community_louvain
6
+ from scipy.spatial.distance import pdist, squareform
7
+ from scipy.cluster.hierarchy import linkage, to_tree
8
+ from networkx.algorithms.community import greedy_modularity_communities
9
+ from Bio import Phylo
10
+ from Bio.Phylo.BaseTree import Tree, Clade
11
+ import matplotlib.pyplot as plt
12
+ import sys
13
+ import gradio as gr
14
+ import os
15
+ import hashlib
16
+ from pathlib import Path
17
+ import pandas as pd
18
+ from io import StringIO
19
+ from usalign_runner import USalignRunner
20
+ import pandas as pd
21
+ import numpy as np
22
+ from rpy2.robjects import pandas2ri, r, Formula
23
+ from rpy2.robjects.packages import importr
24
+ from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
25
+ from rpy2.robjects.conversion import localconverter
26
+ import rpy2.robjects as ro
27
+ import os
28
+
29
+ from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
30
+
31
def get_TM_mat_from_df(df):
    """Build a pairwise TM-score matrix DataFrame from USalign tabular output.

    Args:
        df: DataFrame with columns '#PDBchain1', 'PDBchain2', 'TM1', 'TM2'
            (one row per aligned pair).

    Returns:
        Square DataFrame whose rows/columns are the cleaned chain names
        (path separators and the '.pdb:A' suffix stripped). The diagonal is
        1.0; for each pair, TM1 is stored at [chain2, chain1] and TM2 at
        [chain1, chain2] -- presumably the two per-chain-normalised scores;
        confirm against the USalign output format.
    """
    # Removed: unused chain1_unique / chain2_unique locals from the original.
    unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
    chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
    n = len(unique_chains)
    matrix = np.eye(n)
    for _, row in df.iterrows():
        chain1 = row['#PDBchain1']
        chain2 = row['PDBchain2']
        if chain1 in chain_to_idx and chain2 in chain_to_idx:
            i = chain_to_idx[chain1]
            j = chain_to_idx[chain2]
            matrix[j, i] = row['TM1']
            matrix[i, j] = row['TM2']

    columns_names = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
    return pd.DataFrame(matrix, columns=columns_names, index=columns_names)
52
+
53
+
54
+ # def get_cluster_z_from_df(df):
55
+ # dist_matrix = pdist(df, metric='euclidean')
56
+ # Z = fastcluster.linkage(dist_matrix, method='ward')
57
+ # return Z
58
+
59
def scipy_to_biopython(Z, labels):
    """Convert a scipy linkage matrix into a Bio.Phylo tree.

    Args:
        Z: linkage matrix as produced by scipy.cluster.hierarchy.linkage.
        labels: sequence mapping leaf ids to display names.

    Returns:
        Bio.Phylo.BaseTree.Tree mirroring the clustering hierarchy.
    """
    root_node = to_tree(Z, rd=False)

    def to_clade(node):
        # Leaves carry a label; internal nodes carry exactly two children.
        if node.is_leaf():
            return Clade(branch_length=node.dist, name=labels[node.id])
        children = [to_clade(node.left), to_clade(node.right)]
        return Clade(branch_length=node.dist, clades=children)

    return Tree(to_clade(root_node))
73
+
74
def write_str_to_file(s: str, file_path: str):
    """Write *s* to *file_path* as UTF-8 text, replacing any existing content."""
    with open(file_path, 'w', encoding="utf8") as out:
        out.write(s)
77
+
78
+
79
def build_graph_from_mat_df(TM_score_matrix, threshold=0.75):
    """Build an undirected similarity graph from a TM-score matrix.

    One node per row label; an edge wherever the score between two distinct
    chains is >= threshold (self-comparisons are skipped explicitly).
    """
    G = nx.Graph()
    labels = TM_score_matrix.index
    G.add_nodes_from(labels)

    values = TM_score_matrix.values
    # np.fill_diagonal(values, 0)  # alternative way to exclude self-loops
    rows, cols = np.where(values >= threshold)
    G.add_edges_from(
        (labels[i], labels[j]) for i, j in zip(rows, cols) if i != j
    )
    return G
90
+
91
def fill_community_to_graph(G):
    """Run Louvain community detection on G, tag every node with its community
    id under the 'cluster' attribute, and return the node -> community mapping."""
    node_to_community = community_louvain.best_partition(G)
    nx.set_node_attributes(G, node_to_community, 'cluster')
    return node_to_community
95
+
96
+
97
def get_graph_fig(G, partition):
    """Draw the similarity network coloured by community and return the figure.

    Args:
        G: the similarity graph.
        partition: node -> community-id mapping; its values drive node colours.
    """
    plt.figure(figsize=(12, 10))
    layout = nx.spring_layout(G)
    node_colors = list(partition.values())
    nx.draw_networkx_nodes(G, layout, node_size=50,
                           cmap=plt.cm.tab20, node_color=node_colors)
    nx.draw_networkx_edges(G, layout, alpha=0.3)
    plt.title("Structure Similarity Network")
    plt.axis('off')
    return plt.gcf()
107
+
108
+
109
+
110
def calculate_md5(files):
    """
    Calculate MD5 hash for a list of files.

    The digest is taken over the concatenated contents of all files, visited
    in file-name order, so the result is independent of upload order.

    Args:
        files: List of file objects from Gradio upload

    Returns:
        str: MD5 hash of the combined file contents
    """
    digest = hashlib.md5()
    for file in sorted(files, key=lambda f: f.name):
        with open(file.name, "rb") as fh:
            # Stream in 4 KiB chunks to avoid loading whole files into memory.
            while chunk := fh.read(4096):
                digest.update(chunk)
    return digest.hexdigest()
132
+
133
def save_pdb_files(files, data_dir='./data'):
    """Save uploaded PDB files under <data_dir>/<md5>/pdb and write a list file.

    Args:
        files: list of Gradio upload file objects (each exposes a .name path).
        data_dir: root directory for saved batches.

    Returns:
        str: human-readable summary, one line per saved file plus the list file.
    """
    if not files:
        return "No files uploaded"

    # Create data directory if it doesn't exist
    data_path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)

    # One sub-directory per upload batch, keyed by the combined content hash,
    # so re-uploading the same files lands in the same location.
    md5_hash = calculate_md5(files)

    file_dir = os.path.join(data_path, md5_hash, "pdb")
    # FIX: replaces two os.mkdir calls wrapped in bare `except: pass` (which
    # silently swallowed *any* error). makedirs creates intermediates and
    # only tolerates "already exists".
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    # Create list file
    list_file = os.path.join(data_path, md5_hash, "pdb_list")

    filenames = []
    results = []
    for file in files:
        # Get original filename
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)
        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")

        # FIX: the original leaked the source handle via open(...).read()
        # inside the destination's `with`; both handles now close reliably.
        with open(file.name, "rb") as src, open(target_path, "wb") as dst:
            dst.write(src.read())
        results.append(f"Saved {original_filename}")

    # Write list file
    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
183
+
184
def run_usalign(md5_hash):
    """Run USalign on the uploaded PDB files and return results as DataFrame.

    Args:
        md5_hash: batch identifier; inputs are read from ./data/<md5_hash>/.

    Returns:
        DataFrame with the parsed USalign table on success, or a one-column
        {'Error': [message]} DataFrame on failure (never raises).
    """
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(str(pdb_dir))
        print(str(list_file))
        return_code, stdout, stderr = runner.run_alignment(
            target_dir=str(pdb_dir),
            pdb_list_file=str(list_file)
        )
        print(stdout)
        print(stderr)
        if return_code == 0:
            # stdout is parsed as a tab-separated table (per the configured
            # -outfmt); handle potential encoding issues explicitly.
            df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
            # Clean up any potential encoding artifacts in column names
            df.columns = [col.strip() for col in df.columns]
            return df
        return pd.DataFrame({"Error": [stderr]})
    except Exception as e:
        # BUG FIX: the original returned pd.DataFrame({"Error": [stderr]}) here,
        # which raised NameError whenever the failure happened before
        # run_alignment assigned stderr (e.g. USalignRunner() itself raised).
        return pd.DataFrame({"Error": [str(e)]})
210
+
211
def run_community_analysis(results_df, data_dir, md5_hash,threshold):
    """Run community analysis pipeline and return results.

    Args:
        results_df: USalign pairwise results table (as produced by run_usalign).
        data_dir: accepted but unused -- output paths below are built from the
            literal "data"; NOTE(review): confirm this is intended.
        md5_hash: batch identifier; outputs are written under data/<md5_hash>/.
        threshold: TM-score cutoff for edges in the similarity network.

    Returns:
        dict with 'tm_matrix' (DataFrame), 'newick_str' (str) and 'files'
        (list of output file paths), or {'Error': message} on any failure.
    """
    try:
        # Generate TM matrix
        tm_matrix = get_TM_mat_from_df(results_df)

        tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
        newick_file = os.path.join("data",md5_hash,"clustering.newick")
        # network_file = os.path.join("data",md5_hash,"network.svg")
        network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
        cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")

        # Convert the pandas matrix into an R object for the rpy2-wrapped
        # R functions defined in r_functions.py.
        with localconverter(ro.default_converter + pandas2ri.converter):
            r_tm_matrix = ro.conversion.py2rpy(tm_matrix)

        # Hierarchical clustering in R; also writes the tree to newick_file.
        result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
        newick_str = result[0]

        # Builds the similarity network in R and writes the Excel/CSV exports.
        export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)


        # cluster_df.to_csv(cluster_file,index=False)
        # combined_df.to_csv(network_edges_file,index=False)
        tm_matrix.to_csv(tm_file)
        # with open(newick_file, "w") as f:
        #     f.write(newick_str)
        # Phylo.write(tree, newick_file, "newick")
        # fig.savefig(network_file, format="svg", bbox_inches="tight")
        # plt.close(fig)

        return {
            "tm_matrix": tm_matrix,
            "newick_str": newick_str,
            # "network_fig": fig,
            "files":[
                tm_file,
                newick_file,
                # network_file,
                network_edges_file,
                cluster_file
            ]
        }
    except Exception as e:
        print("Error", str(e))
        return {"Error": str(e)}
256
+
257
+
258
+
259
def get_dataframe_from_network(G, partition):
    """Flatten a similarity network into a Cytoscape-style Source/Target table.

    Args:
        G: the similarity graph.
        partition: iterable of node collections, one per community (note this
            differs from the node->id dict fill_community_to_graph returns).

    Returns:
        DataFrame with one row per edge, plus one row per degree-0 node with
        an empty Target column.
    """
    edges_df = pd.DataFrame(
        [list(edge) for edge in G.edges()], columns=["Source", "Target"]
    )

    # Label each node with its 1-based community index (computed but, as in
    # the original, not part of the returned table).
    cluster_membership = {}
    for idx, comm in enumerate(partition):
        for node in comm:
            cluster_membership[node] = f"cluster_{idx+1}"

    isolated = [n for n in G.nodes if G.degree[n] == 0]
    for node in isolated:
        cluster_membership[node] = "singleton"

    # Isolated nodes become rows with an empty Target cell.
    singleton_df = pd.DataFrame(
        [[node, ""] for node in isolated], columns=["Source", "Target"]
    )

    return pd.concat([edges_df, singleton_df], ignore_index=True)
278
+
279
+ # # 导出为 CSV 文件
280
+ # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)