Xue-Jun committed on
Commit
9b9c66d
·
1 Parent(s): 06fdf1f

first commit

Browse files
Files changed (6) hide show
  1. Dockerfile +66 -0
  2. app.py +102 -0
  3. r_functions.py +152 -0
  4. requirements.txt +9 -0
  5. usalign_runner.py +68 -0
  6. utils.py +280 -0
Dockerfile ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM docker.io/ubuntu:22.04

ENV TZ=Asia/Shanghai
ENV R_REMOTE_ERR=1
# Suppress interactive prompts (tzdata etc.) during apt installs.
ENV DEBIAN_FRONTEND=noninteractive

# Build tooling and helpers needed to compile USalign and add the CRAN repo.
RUN apt-get update -y && \
    apt-get install -y --no-install-recommends \
        build-essential \
        software-properties-common \
        dirmngr \
        wget \
        unzip \
        make \
        lsb-release && \
    rm -rf /var/lib/apt/lists/*

# Runtime utilities + Python toolchain.
# BUG FIX: Ubuntu 22.04 (jammy) has no "python3.9" package in its default
# repositories (jammy ships python3.10), so the original install and the
# /usr/bin/python3.9 symlinks failed. Use the distro's python3.10 instead.
RUN apt-get update -y \
    && apt-get -y install iputils-ping \
    && apt-get -y install wget \
    && apt-get -y install net-tools \
    && apt-get -y install vim \
    && apt-get -y install openssh-server \
    && apt-get -y install python3.10 \
    && apt-get -y install python3-pip \
    && apt-get -y install git \
    && cd /usr/local/bin \
    && rm -f python \
    && rm -f python3 \
    && rm -f pip \
    && rm -f pip3 \
    && ln -s /usr/bin/python3.10 python \
    && ln -s /usr/bin/python3.10 python3 \
    && ln -s /usr/bin/pip3 pip \
    && ln -s /usr/bin/pip3 pip3 \
    && apt-get clean

# Native libraries required to build R and the pinned R packages.
RUN apt install -y libpcre2-dev libdeflate-dev liblzma-dev libbz2-dev libblas-dev gfortran libicu-dev liblapack-dev libxml2-dev
RUN apt install --no-install-recommends software-properties-common dirmngr
# Register the CRAN apt repository to get an up-to-date R 4.x build.
RUN wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
RUN add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu $(lsb_release -cs)-cran40/"
RUN apt install --no-install-recommends r-base -y
# Pin the R packages used by r_functions.py (ape / igraph / openxlsx).
RUN R -e "install.packages('remotes', repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('ape', version = '5.8.1', dependencies = TRUE, repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('igraph', version = '2.1.4', dependencies = TRUE, repos = 'https://cloud.r-project.org/'); \
    remotes::install_version('openxlsx', version = '4.2.8', dependencies = TRUE, repos = 'https://cloud.r-project.org/')" && \
    rm -rf /tmp/Rtmp*

# Run as an unprivileged uid-1000 user (commonly required by hosted Spaces
# platforms -- confirm against the deployment target).
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"
WORKDIR /app
# Build USalign from source; usalign_runner.py resolves the binary path via config.yaml.
RUN git clone https://github.com/pylelab/USalign.git
WORKDIR /app/USalign
RUN make
WORKDIR /app
RUN chmod -R 777 /app
COPY --chmod=777 ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt
# NOTE(review): the pins below override versions from requirements.txt
# (gradio 4.44.1 -> 5.44.1, pandas 2.2.3 -> 1.5.3) -- confirm this is intended.
RUN pip install rpy2==3.2.0
RUN pip install pandas==1.5.3
RUN pip install numpy==1.25.0
RUN pip install gradio==5.44.1
COPY --chmod=777 . /app
ENV MPLCONFIGDIR="/home/user/.config/matplotlib"
CMD ["python3", "app.py"]
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import hashlib
import os
import sys
from io import StringIO
from pathlib import Path

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd

from usalign_runner import USalignRunner
from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files

# Opt out of Gradio telemetry before the UI is built.
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"

# ---------------------------------------------------------------------------
# UI definition
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# This is a Temp Title")

    with gr.Row():
        file_input = gr.File(
            label="Upload PDB Files",
            file_count="multiple",
            file_types=[".pdb"],
        )

    # Shows 5 rows; scrolls rather than growing past max_lines.
    output = gr.Textbox(label="Upload Results", lines=5, max_lines=5, container=True)

    threshold = gr.Slider(minimum=0, maximum=1, value=0.75, label="Threshold")

    with gr.Row():
        submit_btn = gr.Button("Upload Files")
        run_usalign_btn = gr.Button("Run USalign")
        community_btn = gr.Button("Run Community")

    # The md5 of the current upload batch identifies its data directory.
    md5_hash = gr.State("")

    with gr.Tab("USalign Results"):
        results_df = gr.DataFrame(
            label="USalign Results",
            wrap=True,
        )
    with gr.Tab("TM Matrix"):
        tm_matrix_output = gr.DataFrame(label="TM Matrix", wrap=True, show_label=True)
    with gr.Tab("Newick Tree"):
        newick_output = gr.Textbox(label="Newick Tree", lines=5, max_lines=10, container=True)
    # with gr.Tab("Structure Similarity Network"):
    #     network_plot = gr.Plot(label="Structure Similarity Network")

    # Single row hosting all downloadable result files.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Download Results")
            download_tm = gr.File(label="Download Files")

    submit_btn.click(fn=save_pdb_files, inputs=[file_input], outputs=output)

    def update_md5_hash(files):
        """Recompute the batch hash whenever the upload selection changes."""
        return calculate_md5(files) if files else ""

    file_input.change(fn=update_md5_hash, inputs=[file_input], outputs=[md5_hash])

    run_usalign_btn.click(fn=run_usalign, inputs=[md5_hash], outputs=[results_df])

    def process_community_analysis(results_df, md5_hash, threshold):
        """Run the R-backed community pipeline; yield a None-triple on any failure."""
        if results_df.empty:
            return None, None, None

        results = run_community_analysis(results_df, "./data", md5_hash, threshold)
        if "Error" in results:
            return None, None, None

        return (
            results["tm_matrix"],
            results["newick_str"],
            # results["network_fig"],
            results["files"],
        )

    community_btn.click(
        fn=process_community_analysis,
        inputs=[results_df, md5_hash, threshold],
        outputs=[
            tm_matrix_output,
            newick_output,
            # network_plot,
            download_tm,
        ],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0")
r_functions.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from rpy2.robjects import pandas2ri, r, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
import os


# Globally enable automatic pandas <-> R data.frame conversion.
pandas2ri.activate()

# Import the required R packages
stats = importr('stats')
ape = importr('ape')
# igraph exports a '.env' symbol that is not a valid Python name; rename it.
igraph = importr('igraph', robject_translations={'.env': '_env_'})
openxlsx = importr('openxlsx')
# dplyr = importr('dplyr')

def get_r_matrix(df):
    """Convert a pandas DataFrame into the equivalent R object (data.frame)."""
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_tm_matrix = ro.conversion.py2rpy(df)
    return r_tm_matrix


# R function handle: hierarchically clusters the TM matrix (Ward.D2 on
# euclidean distances), writes the tree to output_file and returns the
# Newick string. (R source kept verbatim; its inline comments are in Chinese.)
export_matrix_to_newick_r = ro.r("""
convert_to_newick <- function(tm_matrix, output_file) {
# 导入 ape 包
if (!require(ape, quietly = TRUE)) {
install.packages("ape", repos = "https://cran.r-project.org")
library(ape)
}

# 计算距离矩阵
dist_matrix <- dist(tm_matrix)

# 层次聚类
hclust_tree <- hclust(dist_matrix, method = "ward.D2")

# 转为 phylo 对象
phylo_tree <- as.phylo(hclust_tree)

# 导出为 Newick 格式
write.tree(phylo_tree, file = output_file)

newick_str <- write.tree(phylo_tree)

return(newick_str)
}
""")

# R function handle: builds an undirected similarity graph from entries of
# tm_matrix >= threshold (self-loops removed), runs fastgreedy community
# detection, and writes a Cytoscape edge workbook (excel_path, plus a second
# fixed-name copy in the working directory) and a cluster-assignment CSV
# (csv_path). Returns list(cluster_data=..., graph=...).
export_similarity_network_r = ro.r("""
create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
# 导入必要的包
if (!require(igraph, quietly = TRUE)) {
install.packages("igraph", repos = "https://cran.r-project.org")
library(igraph)
}
if (!require(openxlsx, quietly = TRUE)) {
install.packages("openxlsx", repos = "https://cran.r-project.org")
library(openxlsx)
}

# 根据相似性阈值创建边缘列表,并过滤掉自环
overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]

# 创建空的图形对象
graph <- graph.empty()

# 添加节点
nodes <- rownames(tm_matrix)
graph <- add_vertices(graph, nv = length(nodes), name = nodes)

# 添加边
for (i in 1:nrow(overthresholdedges)) {
graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
}

# 转换为无向图
graph <- as.undirected(graph, mode = "collapse")

# 计算聚类
clusters <- fastgreedy.community(graph)

# 获取每个聚类的大小
cluster_sizes <- sizes(clusters)

# 按聚类大小降序排序
sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]

# 获取每个聚类的成员
cluster_members <- membership(clusters)

# 找到孤立节点
singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])

# 创建Cytoscape导出文件
cytoscape_export <- createWorkbook()

# 创建边Sheet
addWorksheet(cytoscape_export, sheetName = "Edges")
writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)

# 获取边列表
edges <- get.edgelist(graph)

# 填充边Sheet数据
if (nrow(edges) > 0) {
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
}

# 找到当前边Sheet的最后一行
last_edge_row <- nrow(edges) + 1

# 添加孤立节点
if (length(singleton_nodes) > 0) {
writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
}

# 保存Excel文件
saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)

saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)

# 创建一个空的数据框用于储存节点和聚类信息
export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)

# 遍历 sorted_clusters
cluster_index <- 1 # 初始化簇索引
for (cluster_name in names(sorted_clusters)) {
proteins <- sorted_clusters[[cluster_name]]
# 将每个 protein 和对应的 cluster_name 添加到数据框
for (protein in proteins) {
# 检查 protein 是否在 singleton_nodes 中
if (protein %in% singleton_nodes) {
current_cluster_name <- "singleton" # 修改为 "singleton"
} else {
current_cluster_name <- as.character(cluster_index) # 使用簇索引
}
export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
}
cluster_index <- cluster_index + 1 # 索引加1
}

write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
# 返回聚类结果
return(list(cluster_data = export_clusters, graph = graph))
}
""")
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.1
2
+ # fastcluster==1.1.28
3
+ networkx==2.7
4
+ python-louvain==0.16
5
+ pandas==2.2.3
6
+ matplotlib==3.9.4
7
+ scipy==1.13.1
8
+ biopython==1.79
9
+ httpx[socks]
usalign_runner.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess
import os
from typing import List, Optional
from pathlib import Path
import yaml

class USalignRunner:
    """Thin wrapper around the USalign executable, configured via a YAML file."""

    def __init__(self, config_path: str = "config.yaml"):
        """
        Initialize USalignRunner with parameters from config file.

        Args:
            config_path (str): Path to the configuration file. Must contain a
                'USalign' mapping with 'path', 'tmscore' and 'outfmt' keys.

        Raises:
            FileNotFoundError: if the configured USalign binary does not exist.
        """
        with open(config_path, 'r', encoding="utf-8") as f:
            config = yaml.safe_load(f)

        self.usalign_path = Path(config['USalign']['path'])
        self.default_params = {
            'tmscore': config['USalign']['tmscore'],
            'outfmt': config['USalign']['outfmt'],
            'mol': 'protein'  # Default to protein alignment
        }

        if not self.usalign_path.exists():
            raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")

    def run_alignment(
        self,
        target_dir: str,
        pdb_list_file: str,
        tmscore: Optional[float] = None,
        outfmt: Optional[int] = None,
    ) -> tuple[int, str, str]:
        """Run all-vs-all alignment over the structures listed in pdb_list_file.

        Args:
            target_dir: directory containing the PDB files (passed to -dir).
            pdb_list_file: file listing the PDB file names, one per line.
            tmscore: override for the configured -TMscore value.
            outfmt: override for the configured -outfmt value.

        Returns:
            (returncode, stdout, stderr); (-1, "", message) if launching failed.
        """
        tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
        outfmt = outfmt if outfmt is not None else self.default_params['outfmt']

        cmd = [
            str(self.usalign_path),
            "-mol", self.default_params['mol'],
            "-dir", str(target_dir),
            pdb_list_file,
            "-TMscore", str(tmscore),
            "-outfmt", str(outfmt),
        ]

        try:
            # BUG FIX: the original joined the argv list into one string and ran
            # it with shell=True, which breaks on paths containing spaces and
            # allows shell metacharacters through. An argv list with the default
            # shell=False passes every argument verbatim.
            completed = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
            return completed.returncode, completed.stdout, completed.stderr
        except Exception as e:
            # Keep the original best-effort contract: never raise, signal via -1.
            return -1, "", str(e)
utils.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ # import fastcluster
4
+ import networkx as nx
5
+ from community import community_louvain
6
+ from scipy.spatial.distance import pdist, squareform
7
+ from scipy.cluster.hierarchy import linkage, to_tree
8
+ from networkx.algorithms.community import greedy_modularity_communities
9
+ from Bio import Phylo
10
+ from Bio.Phylo.BaseTree import Tree, Clade
11
+ import matplotlib.pyplot as plt
12
+ import sys
13
+ import gradio as gr
14
+ import os
15
+ import hashlib
16
+ from pathlib import Path
17
+ import pandas as pd
18
+ from io import StringIO
19
+ from usalign_runner import USalignRunner
20
+ import pandas as pd
21
+ import numpy as np
22
+ from rpy2.robjects import pandas2ri, r, Formula
23
+ from rpy2.robjects.packages import importr
24
+ from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
25
+ from rpy2.robjects.conversion import localconverter
26
+ import rpy2.robjects as ro
27
+ import os
28
+
29
+ from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
30
+
31
def get_TM_mat_from_df(df):
    """Build a pairwise TM-score matrix DataFrame from USalign tabular output.

    Args:
        df: DataFrame with columns '#PDBchain1', 'PDBchain2', 'TM1', 'TM2'
            (one row per aligned pair).

    Returns:
        Square DataFrame whose rows/columns are the cleaned chain names
        (path separators and the '.pdb:A' suffix stripped). The diagonal is
        1.0; for each pair, TM1 is stored at [chain2, chain1] and TM2 at
        [chain1, chain2] -- presumably the two per-chain-normalised scores;
        confirm against the USalign output format.
    """
    # Removed: unused chain1_unique / chain2_unique locals from the original.
    unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
    chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
    n = len(unique_chains)
    matrix = np.eye(n)
    for _, row in df.iterrows():
        chain1 = row['#PDBchain1']
        chain2 = row['PDBchain2']
        if chain1 in chain_to_idx and chain2 in chain_to_idx:
            i = chain_to_idx[chain1]
            j = chain_to_idx[chain2]
            matrix[j, i] = row['TM1']
            matrix[i, j] = row['TM2']

    columns_names = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
    return pd.DataFrame(matrix, columns=columns_names, index=columns_names)
52
+
53
+
54
+ # def get_cluster_z_from_df(df):
55
+ # dist_matrix = pdist(df, metric='euclidean')
56
+ # Z = fastcluster.linkage(dist_matrix, method='ward')
57
+ # return Z
58
+
59
def scipy_to_biopython(Z, labels):
    """Convert a scipy linkage matrix into a Bio.Phylo tree.

    Args:
        Z: linkage matrix as produced by scipy.cluster.hierarchy.linkage.
        labels: sequence mapping leaf ids to display names.

    Returns:
        Bio.Phylo.BaseTree.Tree mirroring the clustering hierarchy.
    """
    root_node = to_tree(Z, rd=False)

    def to_clade(node):
        # Leaves carry a label; internal nodes carry exactly two children.
        if node.is_leaf():
            return Clade(branch_length=node.dist, name=labels[node.id])
        children = [to_clade(node.left), to_clade(node.right)]
        return Clade(branch_length=node.dist, clades=children)

    return Tree(to_clade(root_node))
73
+
74
def write_str_to_file(s: str, file_path: str):
    """Write *s* to *file_path* as UTF-8 text, replacing any existing content."""
    with open(file_path, 'w', encoding="utf8") as out:
        out.write(s)
77
+
78
+
79
def build_graph_from_mat_df(TM_score_matrix, threshold=0.75):
    """Build an undirected similarity graph from a TM-score matrix.

    One node per row label; an edge wherever the score between two distinct
    chains is >= threshold (self-comparisons are skipped explicitly).
    """
    G = nx.Graph()
    labels = TM_score_matrix.index
    G.add_nodes_from(labels)

    values = TM_score_matrix.values
    # np.fill_diagonal(values, 0)  # alternative way to exclude self-loops
    rows, cols = np.where(values >= threshold)
    G.add_edges_from(
        (labels[i], labels[j]) for i, j in zip(rows, cols) if i != j
    )
    return G
90
+
91
def fill_community_to_graph(G):
    """Run Louvain community detection on G, tag every node with its community
    id under the 'cluster' attribute, and return the node -> community mapping."""
    node_to_community = community_louvain.best_partition(G)
    nx.set_node_attributes(G, node_to_community, 'cluster')
    return node_to_community
95
+
96
+
97
def get_graph_fig(G, partition):
    """Draw the similarity network coloured by community and return the figure.

    Args:
        G: the similarity graph.
        partition: node -> community-id mapping; its values drive node colours.
    """
    plt.figure(figsize=(12, 10))
    layout = nx.spring_layout(G)
    node_colors = list(partition.values())
    nx.draw_networkx_nodes(G, layout, node_size=50,
                           cmap=plt.cm.tab20, node_color=node_colors)
    nx.draw_networkx_edges(G, layout, alpha=0.3)
    plt.title("Structure Similarity Network")
    plt.axis('off')
    return plt.gcf()
107
+
108
+
109
+
110
def calculate_md5(files):
    """
    Calculate MD5 hash for a list of files.

    The digest is taken over the concatenated contents of all files, visited
    in file-name order, so the result is independent of upload order.

    Args:
        files: List of file objects from Gradio upload

    Returns:
        str: MD5 hash of the combined file contents
    """
    digest = hashlib.md5()
    for file in sorted(files, key=lambda f: f.name):
        with open(file.name, "rb") as fh:
            # Stream in 4 KiB chunks to avoid loading whole files into memory.
            while chunk := fh.read(4096):
                digest.update(chunk)
    return digest.hexdigest()
132
+
133
def save_pdb_files(files, data_dir='./data'):
    """Save uploaded PDB files under <data_dir>/<md5>/pdb and write a list file.

    Args:
        files: list of Gradio upload file objects (each exposes a .name path).
        data_dir: root directory for saved batches.

    Returns:
        str: human-readable summary, one line per saved file plus the list file.
    """
    if not files:
        return "No files uploaded"

    # Create data directory if it doesn't exist
    data_path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)

    # One sub-directory per upload batch, keyed by the combined content hash,
    # so re-uploading the same files lands in the same location.
    md5_hash = calculate_md5(files)

    file_dir = os.path.join(data_path, md5_hash, "pdb")
    # FIX: replaces two os.mkdir calls wrapped in bare `except: pass` (which
    # silently swallowed *any* error). makedirs creates intermediates and
    # only tolerates "already exists".
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    # Create list file
    list_file = os.path.join(data_path, md5_hash, "pdb_list")

    filenames = []
    results = []
    for file in files:
        # Get original filename
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)
        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")

        # FIX: the original leaked the source handle via open(...).read()
        # inside the destination's `with`; both handles now close reliably.
        with open(file.name, "rb") as src, open(target_path, "wb") as dst:
            dst.write(src.read())
        results.append(f"Saved {original_filename}")

    # Write list file
    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
183
+
184
def run_usalign(md5_hash):
    """Run USalign on the uploaded PDB files and return results as DataFrame.

    Args:
        md5_hash: batch identifier; inputs are read from ./data/<md5_hash>/.

    Returns:
        DataFrame with the parsed USalign table on success, or a one-column
        {'Error': [message]} DataFrame on failure (never raises).
    """
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(str(pdb_dir))
        print(str(list_file))
        return_code, stdout, stderr = runner.run_alignment(
            target_dir=str(pdb_dir),
            pdb_list_file=str(list_file)
        )
        print(stdout)
        print(stderr)
        if return_code == 0:
            # stdout is parsed as a tab-separated table (per the configured
            # -outfmt); handle potential encoding issues explicitly.
            df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
            # Clean up any potential encoding artifacts in column names
            df.columns = [col.strip() for col in df.columns]
            return df
        return pd.DataFrame({"Error": [stderr]})
    except Exception as e:
        # BUG FIX: the original returned pd.DataFrame({"Error": [stderr]}) here,
        # which raised NameError whenever the failure happened before
        # run_alignment assigned stderr (e.g. USalignRunner() itself raised).
        return pd.DataFrame({"Error": [str(e)]})
210
+
211
def run_community_analysis(results_df, data_dir, md5_hash,threshold):
    """Run community analysis pipeline and return results.

    Args:
        results_df: USalign pairwise results table (as produced by run_usalign).
        data_dir: accepted but unused -- output paths below are built from the
            literal "data"; NOTE(review): confirm this is intended.
        md5_hash: batch identifier; outputs are written under data/<md5_hash>/.
        threshold: TM-score cutoff for edges in the similarity network.

    Returns:
        dict with 'tm_matrix' (DataFrame), 'newick_str' (str) and 'files'
        (list of output file paths), or {'Error': message} on any failure.
    """
    try:
        # Generate TM matrix
        tm_matrix = get_TM_mat_from_df(results_df)

        tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
        newick_file = os.path.join("data",md5_hash,"clustering.newick")
        # network_file = os.path.join("data",md5_hash,"network.svg")
        network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
        cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")

        # Convert the pandas matrix into an R object for the rpy2-wrapped
        # R functions defined in r_functions.py.
        with localconverter(ro.default_converter + pandas2ri.converter):
            r_tm_matrix = ro.conversion.py2rpy(tm_matrix)

        # Hierarchical clustering in R; also writes the tree to newick_file.
        result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
        newick_str = result[0]

        # Builds the similarity network in R and writes the Excel/CSV exports.
        export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)


        # cluster_df.to_csv(cluster_file,index=False)
        # combined_df.to_csv(network_edges_file,index=False)
        tm_matrix.to_csv(tm_file)
        # with open(newick_file, "w") as f:
        #     f.write(newick_str)
        # Phylo.write(tree, newick_file, "newick")
        # fig.savefig(network_file, format="svg", bbox_inches="tight")
        # plt.close(fig)

        return {
            "tm_matrix": tm_matrix,
            "newick_str": newick_str,
            # "network_fig": fig,
            "files":[
                tm_file,
                newick_file,
                # network_file,
                network_edges_file,
                cluster_file
            ]
        }
    except Exception as e:
        print("Error", str(e))
        return {"Error": str(e)}
256
+
257
+
258
+
259
def get_dataframe_from_network(G, partition):
    """Flatten a similarity network into a Cytoscape-style Source/Target table.

    Args:
        G: the similarity graph.
        partition: iterable of node collections, one per community (note this
            differs from the node->id dict fill_community_to_graph returns).

    Returns:
        DataFrame with one row per edge, plus one row per degree-0 node with
        an empty Target column.
    """
    edges_df = pd.DataFrame(
        [list(edge) for edge in G.edges()], columns=["Source", "Target"]
    )

    # Label each node with its 1-based community index (computed but, as in
    # the original, not part of the returned table).
    cluster_membership = {}
    for idx, comm in enumerate(partition):
        for node in comm:
            cluster_membership[node] = f"cluster_{idx+1}"

    isolated = [n for n in G.nodes if G.degree[n] == 0]
    for node in isolated:
        cluster_membership[node] = "singleton"

    # Isolated nodes become rows with an empty Target cell.
    singleton_df = pd.DataFrame(
        [[node, ""] for node in isolated], columns=["Source", "Target"]
    )

    return pd.concat([edges_df, singleton_df], ignore_index=True)
278
+
279
+ # # 导出为 CSV 文件
280
+ # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)