# Xue-Jun's picture
# Upload 6 files
# 4a4b152 verified
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
# Register the pandas <-> R conversion rules globally for this process.
# NOTE(review): get_r_matrix() below also opts in explicitly through
# localconverter(); recent rpy2 releases appear to deprecate activate() --
# confirm against the pinned rpy2 version before removing either.
pandas2ri.activate()
# Import the required R packages
stats = importr("stats")
ape = importr("ape")
# The R symbol `.env` is exposed to Python as `_env_`; presumably this avoids
# a name clash when rpy2 translates igraph's symbols -- verify if changed.
igraph = importr("igraph", robject_translations={".env": "_env_"})
openxlsx = importr("openxlsx")
def get_r_matrix(df):
    """Convert ``df`` to its R counterpart using rpy2's pandas converter.

    Args:
        df: The Python object to convert (presumably a pandas DataFrame
            holding the TM-score matrix -- confirm against callers).

    Returns:
        The object produced by ``ro.conversion.py2rpy`` under a converter
        that combines rpy2's default rules with the pandas rules.
    """
    pandas_rules = ro.default_converter + pandas2ri.converter
    # The conversion rules only apply while the context manager is active.
    with localconverter(pandas_rules):
        return ro.conversion.py2rpy(df)
# R helper `convert_to_newick(tm_matrix, output_file)`.
#
# The R source below is evaluated once at import time; the value of that
# evaluation (the R function it defines) is bound to
# `export_matrix_to_newick_r` so it can be called from Python.
#
# Steps performed by the R function:
#   1. Attach the `ape` package, installing it from CRAN if it is missing.
#   2. Compute pairwise distances between the rows of `tm_matrix` with
#      `dist()` (default settings).
#   3. Cluster hierarchically with `hclust(method = "ward.D2")`.
#   4. Convert the dendrogram to a `phylo` tree with `as.phylo()`.
#   5. Write the tree in Newick format to `output_file`.
#
# Returns (in R): the Newick representation of the tree as a string.
# The comments inside the string literal are part of the R source and are
# intentionally left untouched.
export_matrix_to_newick_r = ro.r(
"""
convert_to_newick <- function(tm_matrix, output_file) {
# 导入 ape 包
if (!require(ape, quietly = TRUE)) {
install.packages("ape", repos = "https://cran.r-project.org")
library(ape)
}
# 计算距离矩阵
dist_matrix <- dist(tm_matrix)
# 层次聚类
hclust_tree <- hclust(dist_matrix, method = "ward.D2")
# 转为 phylo 对象
phylo_tree <- as.phylo(hclust_tree)
# 导出为 Newick 格式
write.tree(phylo_tree, file = output_file)
newick_str <- write.tree(phylo_tree)
return(newick_str)
}
"""
)
# R helper `create_similarity_network_r(threshold, tm_matrix, excel_path)`.
#
# The R source below is evaluated once at import time; the resulting R
# function is bound to `export_similarity_network_r` for use from Python.
#
# Arguments (R side):
#   threshold   Similarity cut-off; a pair (i, j), i != j, becomes an edge
#               when tm_matrix[i, j] >= threshold.
#   tm_matrix   Square similarity matrix / data frame whose row names are
#               the node (protein) labels.
#   excel_path  Destination .xlsx file (overwritten if it exists).
#
# Side effect: writes a workbook with one sheet "Edges" holding a
# Source/Target header, one row per undirected edge, and then every
# singleton node appended to the Source column.
#
# Returns (R list):
#   cluster_data  data.frame(protein, cluster_name); clusters are numbered
#                 "1", "2", ... by decreasing size, and nodes that form a
#                 cluster of their own are labelled "singleton".
#   graph         The undirected igraph object.
#
# Fixes relative to the previous revision:
#   * one surviving edge no longer collapses the index matrix to a vector
#     (`drop = FALSE`);
#   * zero surviving edges no longer crash the `1:nrow(...)` loop;
#   * edges and result rows are built in one vectorised step instead of
#     growing the graph / data frame inside loops;
#   * deprecated igraph aliases were replaced by their current names.
export_similarity_network_r = ro.r(
    """
create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
  # Attach the required packages, installing from CRAN when one is missing.
  # library() (unlike require()) fails loudly if the install did not work.
  for (pkg in c("igraph", "openxlsx")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg, repos = "https://cran.r-project.org")
    }
    library(pkg, character.only = TRUE, quietly = TRUE)
  }

  # Node labels come from the row names; fall back to row indices so the
  # graph always has as many vertices as the matrix has rows.
  node_names <- rownames(tm_matrix)
  if (is.null(node_names)) {
    node_names <- as.character(seq_len(nrow(tm_matrix)))
  }

  # (row, col) index pairs at or above the threshold, self-loops removed.
  # drop = FALSE keeps a single remaining pair as a 1 x 2 matrix.
  edge_idx <- which(tm_matrix >= threshold, arr.ind = TRUE)
  edge_idx <- edge_idx[edge_idx[, 1] != edge_idx[, 2], , drop = FALSE]

  # Build the graph: all vertices first, then every edge in a single call.
  graph <- make_empty_graph()
  graph <- add_vertices(graph, nv = length(node_names), name = node_names)
  if (nrow(edge_idx) > 0) {
    # add_edges() expects a flat vector: from1, to1, from2, to2, ...
    graph <- add_edges(graph, as.vector(t(edge_idx)))
  }

  # Collapse (i, j) / (j, i) duplicates into one undirected edge.
  # NOTE(review): newer igraph releases rename this to as_undirected();
  # the dotted name is kept so older installations keep working.
  graph <- as.undirected(graph, mode = "collapse")

  # Community id of every vertex. Without edges there is nothing to
  # cluster, so every vertex is its own community.
  if (ecount(graph) > 0) {
    cluster_of <- as.integer(membership(cluster_fast_greedy(graph)))
  } else {
    cluster_of <- seq_len(vcount(graph))
  }
  cluster_sizes <- tabulate(cluster_of)
  is_singleton <- cluster_sizes[cluster_of] == 1
  singleton_nodes <- node_names[is_singleton]

  # Cytoscape export: header row, edge rows, then the singleton nodes.
  cytoscape_export <- createWorkbook()
  addWorksheet(cytoscape_export, sheetName = "Edges")
  writeData(cytoscape_export, sheet = "Edges", x = "Source",
            startCol = 1, startRow = 1)
  writeData(cytoscape_export, sheet = "Edges", x = "Target",
            startCol = 2, startRow = 1)

  # With named vertices the edge list already holds the vertex names.
  edges <- as_edgelist(graph, names = TRUE)
  if (nrow(edges) > 0) {
    writeData(cytoscape_export, sheet = "Edges", x = edges[, 1],
              startCol = 1, startRow = 2)
    writeData(cytoscape_export, sheet = "Edges", x = edges[, 2],
              startCol = 2, startRow = 2)
  }
  if (length(singleton_nodes) > 0) {
    # Row 1 is the header and rows 2..(n + 1) are edges, so start at n + 2.
    writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes,
              startCol = 1, startRow = nrow(edges) + 2)
  }
  saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)

  # Rank communities by decreasing size (rank 1 = largest) and list the
  # nodes cluster by cluster; order() is stable, so nodes keep their
  # vertex order inside each cluster.
  size_order <- order(cluster_sizes, decreasing = TRUE)
  cluster_rank <- match(seq_along(cluster_sizes), size_order)
  node_rank <- cluster_rank[cluster_of]
  row_order <- order(node_rank)

  cluster_name <- as.character(node_rank)
  cluster_name[is_singleton] <- "singleton"

  export_clusters <- data.frame(
    protein = node_names[row_order],
    cluster_name = cluster_name[row_order],
    stringsAsFactors = FALSE
  )

  list(cluster_data = export_clusters, graph = graph)
}
"""
)