Spaces:

happyGPT
/

TCGA-Name-Query

Sleeping

App Files Files Community

TCGA-Name-Query / app.py

happyGPT

Lucky

9a6b9b4 verified about 1 year ago

raw

history blame contribute delete

10.6 kB

	"""
	作者：曾浩龙（独立开发）
	创建时间：2024 年 10 月 25 日
	第三方依赖库：Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python)
	其他说明：本项目声明仅供学习和研究使用。
	"""
	import os
	import platform
	print(platform.python_version())

	import Levenshtein
	import gradio as gr
	from openai import OpenAI


	def find_closest_string(user_input, valid_strings):
	    """Return the candidate closest to ``user_input`` by Levenshtein distance.

	    Args:
	        user_input: The string to match.
	        valid_strings: Iterable of candidate strings.

	    Returns:
	        The candidate with the smallest edit distance to ``user_input``, or
	        ``None`` when ``valid_strings`` is empty. When several candidates are
	        equally close, the lexicographically smallest one is returned, so the
	        result does not depend on the iteration order of ``valid_strings``
	        (which is arbitrary when a set is passed).
	    """
	    return min(
	        valid_strings,
	        # Sort key: distance first, then the candidate itself as a stable tie-breaker.
	        key=lambda candidate: (
	            Levenshtein.distance(user_input, candidate),
	            candidate,
	        ),
	        default=None,
	    )


	def process_input(user_input):
	    """Resolve a user-supplied project name to a member of ``valid_strings``.

	    Resolution order:
	      1. the input exactly as given, if it is already a valid name;
	      2. the input with surrounding whitespace removed and letters
	         upper-cased, if that is a valid name (the valid names in this
	         file are all upper-case, so this makes matching case-insensitive);
	      3. otherwise the valid name with the smallest edit distance to the
	         normalized input.

	    ``None`` is treated as an empty string so that a missing value is
	    resolved through step 3 instead of raising in the distance computation.

	    Args:
	        user_input: The raw project name, or ``None``.

	    Returns:
	        A member of ``valid_strings`` (or ``None`` if that set is empty).
	    """
	    if user_input in valid_strings:
	        return user_input  # Exact hit: nothing to correct.

	    normalized = "" if user_input is None else str(user_input).strip().upper()
	    if normalized in valid_strings:
	        return normalized

	    # No exact match even after normalization: fall back to the nearest name.
	    return find_closest_string(normalized, valid_strings)


	def demo(project_TCGA, output_language="Chinese"):
	    """Describe a TCGA project and request a model-written summary of its cancer type.

	    Args:
	        project_TCGA: Project name entered by the user; it is resolved with
	            ``process_input`` before being looked up in ``project_name_TCGA``.
	        output_language: ``"Chinese"`` selects the Chinese texts and prompt;
	            any other value selects the English ones.

	    Returns:
	        A pair ``(header, summary)``. ``header`` lists the abbreviation, the
	        Chinese and English full names and the GDC portal link. ``summary`` is
	        the stripped text of the first completion choice, or the string
	        ``"Response Error"`` if creating the client or the request raised.
	    """
	    project_TCGA = process_input(project_TCGA)
	    name_English, name_Chinese = project_name_TCGA[project_TCGA]
	    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_TCGA}"
	    use_chinese = output_language == "Chinese"

	    if use_chinese:
	        header = f"✍️ 简称：{project_TCGA}\n❤️ 中文全称：{name_Chinese}\n💛 英文全称：{name_English}\n🔗 链接：{tcga_link}"
	        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家，对{name_Chinese}有着深入的理解。"
	        prompt_template = f"""
	您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要，内容必须准确、详实、逻辑清晰、可读性强，这对普通公众了解这种复杂疾病非常重要。
	具体内容需要包括：1 - {name_Chinese}的基本定义和概述，临床病理特征；2 - {name_Chinese}的病因和风险因素；3 - {name_Chinese}的流行病学调查结果，患病率和死亡率；4 - {name_Chinese}的临床症状与早期识别；5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变；6 - {name_Chinese}的生存率与预后；7 - {name_Chinese}的诊断、治疗方法和未来研究。
	""".strip()
	    else:
	        header = f"✍️ Abbreviation: {project_TCGA}\n❤️ Full name in Chinese: {name_Chinese}\n💛 Full Name in English: {name_English}\n🔗 Link: {tcga_link}"
	        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
	        prompt_template = f"""
	Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.
	Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}.
	""".strip()

	    try:
	        # The API key and base URL are read from the environment; a missing
	        # variable raises KeyError, which is handled like any other failure below.
	        client = OpenAI(
	            api_key=os.environ["OPENAI_API_KEY"],
	            base_url=os.environ["API_BASE"],
	            max_retries=3,
	            timeout=60,
	        )

	        conversation = [
	            {"role": "system", "content": system_instruction},
	            {"role": "user", "content": prompt_template},
	        ]
	        # The Chinese branch is given a larger completion budget than the English one.
	        token_budget = 3600 if use_chinese else 2048

	        completion = client.chat.completions.create(
	            model="gpt-4o-mini-2024-07-18",
	            messages=conversation,
	            n=1,
	            seed=42,
	            temperature=0.50,
	            max_tokens=token_budget,
	            logprobs=False,
	            presence_penalty=0.20,
	            frequency_penalty=0.20,
	        )

	        # The reply is returned unmodified apart from stripping: Markdown
	        # markers such as "**" or "#" are not removed even though the output
	        # is shown in a plain text box, and no disclaimer prefix is added.
	        summary = completion.choices[0].message.content.strip()
	    except Exception as e:
	        print(str(e), "Response Error")
	        return header, "Response Error"

	    return header, summary


	# The 33 TCGA cancer types: project id -> [English full name, Chinese full name].
	project_name_TCGA = {
	    "TCGA-ACC": ["adrenocortical carcinoma", "肾上腺皮质癌"],
	    "TCGA-BLCA": ["bladder urothelial carcinoma", "膀胱尿路上皮癌"],
	    "TCGA-BRCA": ["breast invasive carcinoma", "浸润性乳腺癌"],
	    "TCGA-CESC": [
	        "cervical squamous cell carcinoma and endocervical adenocarcinoma",
	        "宫颈鳞状细胞癌与宫颈内膜腺癌",
	    ],
	    "TCGA-CHOL": ["cholangiocarcinoma", "胆管癌"],
	    "TCGA-COAD": ["colon adenocarcinoma", "结肠腺癌"],
	    "TCGA-DLBC": [
	        "lymphoid neoplasm diffuse large B-cell lymphoma",
	        "弥漫性大 B 细胞淋巴瘤",
	    ],
	    "TCGA-ESCA": ["esophageal carcinoma", "食道癌"],
	    "TCGA-GBM": ["glioblastoma multiforme", "多形性胶质母细胞瘤"],
	    "TCGA-HNSC": ["head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"],
	    "TCGA-KICH": ["kidney chromophobe", "肾嫌色细胞癌"],
	    "TCGA-KIRC": ["kidney renal clear cell carcinoma", "肾透明细胞癌"],
	    "TCGA-KIRP": ["kidney renal papillary cell carcinoma", "乳头状肾细胞癌"],
	    "TCGA-LAML": ["acute myeloid leukemia", "急性髓系白血病"],
	    "TCGA-LGG": ["brain lower grade glioma", "低级别脑胶质瘤"],
	    "TCGA-LIHC": ["liver hepatocellular carcinoma", "肝细胞癌"],
	    "TCGA-LUAD": ["lung adenocarcinoma", "肺腺癌"],
	    "TCGA-LUSC": ["lung squamous cell carcinoma", "肺鳞状细胞癌"],
	    "TCGA-MESO": ["mesothelioma", "间皮瘤"],
	    "TCGA-OV": ["ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"],
	    "TCGA-PAAD": ["pancreatic adenocarcinoma", "胰腺腺癌"],
	    "TCGA-PCPG": ["pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"],
	    "TCGA-PRAD": ["prostate adenocarcinoma", "前列腺腺癌"],
	    "TCGA-READ": ["rectum adenocarcinoma", "直肠腺癌"],
	    "TCGA-SARC": ["sarcoma", "肉瘤"],
	    "TCGA-SKCM": ["skin cutaneous melanoma", "皮肤黑色素瘤"],
	    "TCGA-STAD": ["stomach adenocarcinoma", "胃腺癌"],
	    "TCGA-TGCT": ["testicular germ cell tumors", "睾丸生殖细胞肿瘤"],
	    "TCGA-THCA": ["thyroid carcinoma", "甲状腺癌"],
	    "TCGA-THYM": ["thymoma", "胸腺瘤"],
	    "TCGA-UCEC": ["uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"],
	    "TCGA-UCS": ["uterine carcinosarcoma", "子宫癌肉瘤"],
	    "TCGA-UVM": ["uveal melanoma", "眼内（葡萄膜）黑色素瘤"],
	}
	# Set of accepted project ids, built once from the table's keys.
	valid_strings = set(project_name_TCGA)
	# print(len(project_name_TCGA.keys()))
	# input_query = input("请输入您要查询的 TCGA 项目名称：")
	# print(project_name_TCGA[input_query])
	# print([k for k in project_name_TCGA.keys()])

	# Page heading passed as the interface description (Markdown and HTML are supported).
	# English alternative, kept for reference:
	# desc = """<h1 align="center" style="font-family: Latin Modern Math, sans-serif; font-size: 22px; color: #00FF7F;">🎉 Abbreviations, Full Names and Descriptions of All Cancer Types Covered by TCGA Project 🧬</h1>"""
	desc = """<h1 align="center" style="font-family: KaiTi, sans-serif; font-size: 22px; color: #00FF7F;">🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬</h1>"""

	# Output widgets, in the order of the pair returned by `demo`.
	outputs = [
	    # 1. The full name of the cancer type queried.
	    gr.Textbox(
	        show_copy_button=True,
	        label="🔎 1. 您查询的 TCGA 项目的癌症类型",
	    ),
	    # 2. A quick look at the cancer type being queried.
	    gr.Textbox(
	        show_copy_button=True,
	        label="👩‍⚕️ 2. 迅速了解这种癌症类型的信息",
	    ),
	]

	# Input widgets, in the order of `demo`'s parameters.
	query_inputs = [
	    # Project name: offers every known id but also accepts free text.
	    # Label: "Please enter the name of the TCGA project you want to query, such as TCGA-READ."
	    gr.Dropdown(
	        label="⌨️ 请输入您要查询的 TCGA 项目名称，如 TCGA-READ",
	        choices=list(project_name_TCGA),
	        value="TCGA-READ",
	        allow_custom_value=True,
	        filterable=True,
	        interactive=True,
	    ),
	    # Output language: restricted to the two listed choices.
	    gr.Dropdown(
	        label="👨‍💻 输出语言目前仅支持中文和英文",
	        choices=["Chinese", "English"],
	        value="Chinese",
	        allow_custom_value=False,
	    ),
	]

	my_demo = gr.Interface(
	    fn=demo,
	    inputs=query_inputs,
	    outputs=outputs,
	    submit_btn=gr.Button("提交", variant="primary"),
	    clear_btn=gr.Button("清除", variant="secondary"),
	    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
	    cache_examples=True,
	    description=desc,
	    theme="JohnSmith9982/small_and_pretty",
	)
	my_demo.launch(show_error=True, show_api=False)