# TCGA-Name-Query / app.py
# happyGPT's picture
# Lucky
# 9a6b9b4 verified
"""
作者:曾浩龙(独立开发)
创建时间:2024 年 10 月 25 日
第三方依赖库:Gradio (https://www.gradio.app/) 与 OpenAI Python API library (https://github.com/openai/openai-python)
其他说明:本项目声明仅供学习和研究使用。
"""
import os
import platform
print(platform.python_version())
import Levenshtein
import gradio as gr
from openai import OpenAI
def find_closest_string(user_input, valid_strings):
    """Return the candidate closest to ``user_input`` by Levenshtein distance.

    Args:
        user_input: The string to match.
        valid_strings: Iterable of candidate strings.

    Returns:
        The candidate with the smallest edit distance to ``user_input``. When
        several candidates share that distance, the one met first while
        iterating ``valid_strings`` wins. ``None`` if there are no candidates.
    """
    return min(
        valid_strings,
        key=lambda candidate: Levenshtein.distance(user_input, candidate),
        default=None,
    )
def process_input(user_input):
    """Resolve user input to one of the identifiers in ``valid_strings``.

    Resolution order:
      1. The raw input, if it is already a valid identifier.
      2. The input with surrounding whitespace removed and letters
         upper-cased, if that is a valid identifier (the identifiers in this
         file are all upper-case, so ``" tcga-read "`` resolves directly).
      3. Otherwise the valid identifier with the smallest edit distance to
         the normalised input.

    Args:
        user_input: Text typed or selected by the user. ``None`` is treated
            as an empty string instead of failing inside the distance call.

    Returns:
        A member of ``valid_strings`` (or ``None`` if that collection is
        empty, as reported by ``find_closest_string``).
    """
    if user_input in valid_strings:
        return user_input

    normalized = "" if user_input is None else str(user_input).strip().upper()
    if normalized in valid_strings:
        return normalized

    # A set of strings has no stable iteration order between interpreter
    # runs, so equally distant candidates would otherwise be picked at
    # random. Sorting makes ties resolve to the alphabetically first one.
    return find_closest_string(normalized, sorted(valid_strings))
def demo(project_TCGA, output_language="Chinese"):
    """Describe a TCGA project and request a disease summary from the chat model.

    Args:
        project_TCGA: Project identifier supplied by the user. It is resolved
            through ``process_input`` before being used as the lookup key.
        output_language: ``"Chinese"`` selects the Chinese texts and prompt;
            any other value selects the English ones.

    Returns:
        A 2-tuple ``(info, summary)``. ``info`` lists the abbreviation, both
        full names and the portal.gdc.cancer.gov link. ``summary`` is the
        stripped model reply, or the literal ``"Response Error"`` when any
        step of the request raises.
    """
    project_id = process_input(project_TCGA)
    name_English, name_Chinese = project_name_TCGA[project_id]
    tcga_link = f"https://portal.gdc.cancer.gov/projects/{project_id}"
    wants_chinese = output_language == "Chinese"

    if wants_chinese:
        info = (
            f"✍️ 简称:{project_id}\n"
            f"❤️ 中文全称:{name_Chinese}\n"
            f"💛 英文全称:{name_English}\n"
            f"🔗 链接:{tcga_link}"
        )
        system_instruction = f"您是公共卫生、流行病学、癌症研究和精准医学领域的专家,对{name_Chinese}有着深入的理解。"
        prompt_lines = (
            f"您的任务是深入分析并撰写关于{name_Chinese}这种复杂疾病的摘要,内容必须准确、详实、逻辑清晰、可读性强,这对普通公众了解这种复杂疾病非常重要。",
            f"具体内容需要包括:1 - {name_Chinese}的基本定义和概述,临床病理特征;2 - {name_Chinese}的病因和风险因素;3 - {name_Chinese}的流行病学调查结果,患病率和死亡率;4 - {name_Chinese}的临床症状与早期识别;5 - {name_Chinese}的疾病进展与转移及其密切相关的生物标志物和异常基因改变;6 - {name_Chinese}的生存率与预后;7 - {name_Chinese}的诊断、治疗方法和未来研究。",
        )
    else:
        info = (
            f"✍️ Abbreviation: {project_id}\n"
            f"❤️ Full name in Chinese: {name_Chinese}\n"
            f"💛 Full Name in English: {name_English}\n"
            f"🔗 Link: {tcga_link}"
        )
        system_instruction = f"You are an expert in the fields of public health, epidemiology, cancer research, and precision medicine, with a deep comprehension of {name_English}."
        prompt_lines = (
            f"Your task is to analyze and write an in-depth summary about the complex disease of {name_English} that must be accurate, informative, logical, and readable, which is very important for the general public to understand this complex disease.",
            f"Specific content needs to include: 1 - Basic definition and overview of {name_English}, clinicopathologic features; 2 - Etiology and risk factors of {name_English}; 3 - Epidemiologic findings, prevalence, and mortality rates of {name_English}; 4 - Clinical signs and early recognition of {name_English}; 5 - Disease progression and metastasis of {name_English} and its closely related biomarkers and aberrant gene alterations; 6 - Survival and prognosis of {name_English}; and 7 - Diagnostics, therapeutic approaches, and future research of {name_English}.",
        )
    prompt_template = "\n".join(prompt_lines)

    try:
        # The client needs the API key and base URL from the environment; a
        # missing variable raises here and is reported like any other failure.
        client = OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],
            base_url=os.environ["API_BASE"],
            max_retries=3,
            timeout=60,
        )
        chat_completion = client.chat.completions.create(
            model="gpt-4o-mini-2024-07-18",
            messages=[
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt_template},
            ],
            n=1,
            seed=42,
            temperature=0.50,
            # The Chinese request is given a larger completion budget.
            max_tokens=3600 if wants_chinese else 2048,
            logprobs=False,
            presence_penalty=0.20,
            frequency_penalty=0.20,
        )
        # Markdown markers such as "**" or "#" in the reply are kept as-is.
        summary = chat_completion.choices[0].message.content.strip()
    except Exception as e:
        print(str(e), "Response Error")
        return info, "Response Error"

    return info, summary
# The 33 TCGA cancer projects, keyed by project identifier.
# Each value is a two-item list: [English full name, Chinese full name].
project_name_TCGA = {
    abbreviation: [english_name, chinese_name]
    for abbreviation, english_name, chinese_name in (
        ("TCGA-ACC", "adrenocortical carcinoma", "肾上腺皮质癌"),
        ("TCGA-BLCA", "bladder urothelial carcinoma", "膀胱尿路上皮癌"),
        ("TCGA-BRCA", "breast invasive carcinoma", "浸润性乳腺癌"),
        (
            "TCGA-CESC",
            "cervical squamous cell carcinoma and endocervical adenocarcinoma",
            "宫颈鳞状细胞癌与宫颈内膜腺癌",
        ),
        ("TCGA-CHOL", "cholangiocarcinoma", "胆管癌"),
        ("TCGA-COAD", "colon adenocarcinoma", "结肠腺癌"),
        (
            "TCGA-DLBC",
            "lymphoid neoplasm diffuse large B-cell lymphoma",
            "弥漫性大 B 细胞淋巴瘤",
        ),
        ("TCGA-ESCA", "esophageal carcinoma", "食道癌"),
        ("TCGA-GBM", "glioblastoma multiforme", "多形性胶质母细胞瘤"),
        ("TCGA-HNSC", "head and neck squamous cell carcinoma", "头颈部鳞状细胞癌"),
        ("TCGA-KICH", "kidney chromophobe", "肾嫌色细胞癌"),
        ("TCGA-KIRC", "kidney renal clear cell carcinoma", "肾透明细胞癌"),
        ("TCGA-KIRP", "kidney renal papillary cell carcinoma", "乳头状肾细胞癌"),
        ("TCGA-LAML", "acute myeloid leukemia", "急性髓系白血病"),
        ("TCGA-LGG", "brain lower grade glioma", "低级别脑胶质瘤"),
        ("TCGA-LIHC", "liver hepatocellular carcinoma", "肝细胞癌"),
        ("TCGA-LUAD", "lung adenocarcinoma", "肺腺癌"),
        ("TCGA-LUSC", "lung squamous cell carcinoma", "肺鳞状细胞癌"),
        ("TCGA-MESO", "mesothelioma", "间皮瘤"),
        ("TCGA-OV", "ovarian serous cystadenocarcinoma", "卵巢浆液性囊腺癌"),
        ("TCGA-PAAD", "pancreatic adenocarcinoma", "胰腺腺癌"),
        ("TCGA-PCPG", "pheochromocytoma and paraganglioma", "嗜铬细胞瘤和副神经节瘤"),
        ("TCGA-PRAD", "prostate adenocarcinoma", "前列腺腺癌"),
        ("TCGA-READ", "rectum adenocarcinoma", "直肠腺癌"),
        ("TCGA-SARC", "sarcoma", "肉瘤"),
        ("TCGA-SKCM", "skin cutaneous melanoma", "皮肤黑色素瘤"),
        ("TCGA-STAD", "stomach adenocarcinoma", "胃腺癌"),
        ("TCGA-TGCT", "testicular germ cell tumors", "睾丸生殖细胞肿瘤"),
        ("TCGA-THCA", "thyroid carcinoma", "甲状腺癌"),
        ("TCGA-THYM", "thymoma", "胸腺瘤"),
        ("TCGA-UCEC", "uterine corpus endometrial carcinoma", "子宫体子宫内膜癌"),
        ("TCGA-UCS", "uterine carcinosarcoma", "子宫癌肉瘤"),
        ("TCGA-UVM", "uveal melanoma", "眼内(葡萄膜)黑色素瘤"),
    )
}
# The set of accepted project identifiers (the keys of the table above).
valid_strings = set(project_name_TCGA)
# print(len(project_name_TCGA.keys()))
# input_query = input("请输入您要查询的 TCGA 项目名称:")
# print(project_name_TCGA[input_query])
# print([k for k in project_name_TCGA.keys()])
# 支持 Markdown 和 HTML 内容格式:
# Abbreviations, Full Names and Descriptions of All Cancer Types Covered by TCGA Project.
# desc = """<h1 align="center" style="font-family: Latin Modern Math, sans-serif; font-size: 22px; color: #00FF7F;">🎉 Abbreviations, Full Names and Descriptions of All Cancer Types Covered by TCGA Project 🧬</h1>"""
# HTML heading handed to gr.Interface as its description.
desc = (
    '<h1 align="center" style="font-family: KaiTi, sans-serif; '
    'font-size: 22px; color: #00FF7F;">'
    "🎉 TCGA 项目涉及的所有癌症类型的缩写、中英文全称和描述 🧬"
    "</h1>"
)

# Output 1: the full name of the cancer type that was queried.
project_info_box = gr.Textbox(
    label="🔎 1. 您查询的 TCGA 项目的癌症类型",
    show_copy_button=True,
)
# Output 2: a quick look at the cancer type that was queried.
summary_box = gr.Textbox(
    label="👩‍⚕️ 2. 迅速了解这种癌症类型的信息",
    show_copy_button=True,
)
outputs = [project_info_box, summary_box]

# Input 1: project name; free text is allowed in addition to the listed keys.
# Label: "Please enter the name of the TCGA project you want to query, such as TCGA-READ."
project_dropdown = gr.Dropdown(
    choices=list(project_name_TCGA),
    value="TCGA-READ",
    allow_custom_value=True,
    filterable=True,
    interactive=True,
    label="⌨️ 请输入您要查询的 TCGA 项目名称,如 TCGA-READ",
)
# Input 2: output language, restricted to the two listed choices.
language_dropdown = gr.Dropdown(
    choices=["Chinese", "English"],
    value="Chinese",
    allow_custom_value=False,
    label="👨‍💻 输出语言目前仅支持中文和英文",
)

my_demo = gr.Interface(
    fn=demo,
    inputs=[project_dropdown, language_dropdown],
    outputs=outputs,
    submit_btn=gr.Button("提交", variant="primary"),
    clear_btn=gr.Button("清除", variant="secondary"),
    cache_examples=True,
    examples=[["TCGA-READ", "Chinese"], ["TCGA-COAD", "English"]],
    description=desc,
    theme="JohnSmith9982/small_and_pretty",
)

my_demo.launch(show_api=False, show_error=True)