Mood_Anxiety_Disorder_Classify / dataset /process_scripts_transfer.py
Tokymin's picture
编写了数据处理已经预训练的基础代码
1f4f3bd
raw
history blame contribute delete
No virus
2.33 kB
# Reload the dataset now that we have the exact English column names and the specific rules for handling the SAS and SDS columns.
import pandas as pd
# English names applied to the CSV columns, in file order.
columns_english = [
    "Gender",
    "Education",
    "Hemoglobin",
    "Total_Protein",
    "Albumin",
    "HDL",
    "AST",
    "Direct_Bilirubin",
    "Sleep",
    "SF-36",
    "SAS",
    "SDS",
]

# Input CSV, looked up relative to the current working directory.
file_path = "new_data.csv"

# The file itself is unchanged, so it is simply re-read with the new names:
# header=0 consumes the file's own header row and names= replaces its labels.
data = pd.read_csv(
    file_path,
    names=columns_english,
    header=0,
    encoding="ISO-8859-1",
)
# English description for each education-level code.
# The position of a label in the list is the code it belongs to (0-3).
education_levels = dict(
    enumerate(
        [
            "elementary school",
            "junior high school",
            "high school/diploma",
            "bachelor/master/phd",
        ]
    )
)
# English description for each gender code (0 and 1).
gender_dict = dict(zip((0, 1), ("male", "female")))
# Classification of SAS and SDS scores into severity classes.
def classify_sas_sds(score, threshold, mild, moderate, severe):
    """Return the severity class (0-3) of a SAS or SDS score.

    The callers in this file pass cut-offs such as ``(50, 59, 69, 70)``:
    ``mild`` and ``moderate`` are the highest scores that still belong to
    their band, and ``severe`` is the lowest score of the severe band.  The
    band upper bounds are therefore treated as inclusive, and every score
    from ``threshold`` upwards lands in exactly one class.

    Args:
        score: The score to classify.
        threshold: Lowest score that is no longer normal.
        mild: Highest score still classified as mild (inclusive).
        moderate: Highest score still classified as moderate (inclusive).
        severe: Lowest score classified as severe.

    Returns:
        0 (normal), 1 (mild), 2 (moderate) or 3 (severe).  ``None`` is
        returned when the score compares false against every cut-off, which
        is what happens for NaN (a missing value).
    """
    if score < threshold:
        return 0  # Normal
    if score <= mild:
        return 1  # Mild
    # The second test catches scores strictly between ``moderate`` and
    # ``severe`` (e.g. 69.5), which would otherwise match no band at all.
    if score <= moderate or score < severe:
        return 2  # Moderate
    if score >= severe:
        return 3  # Severe
    # Only reachable when every comparison above is false (NaN): keep the
    # value unlabelled instead of silently assigning it a class.
    return None
# Placeholder for the natural-language description of each row; the empty
# string is broadcast to every row when the frame below is built.
descriptions = ""

# Severity class of every SAS and SDS score, in row order.
sas_classes = [classify_sas_sds(score, 50, 59, 69, 70) for score in data["SAS"]]
sds_classes = [classify_sas_sds(score, 53, 62, 72, 73) for score in data["SDS"]]

# New frame holding the description text plus separate SAS_Class and
# SDS_Class columns.
new_data = pd.DataFrame(
    {
        "Description": descriptions,
        "SAS_Class": sas_classes,
        "SDS_Class": sds_classes,
    }
)
def _describe_patient(patient):
    """Render one row of ``data`` as an English sentence.

    The sentence covers gender, education, the lab values, sleep and the
    SF-36 score; the SAS/SDS classes are intentionally not mentioned.
    Raises ``KeyError`` if the gender or education code has no label.
    """
    gender = gender_dict[patient["Gender"]]
    education = education_levels[patient["Education"]]
    return (
        f"A patient of gender {gender} and educational background {education}, "
        f"has hemoglobin {patient['Hemoglobin']}g/L, "
        f"total protein {patient['Total_Protein']}g/L, "
        f"albumin {patient['Albumin']}g/L, "
        f"HDL {patient['HDL']}mmol/L, "
        f"AST {patient['AST']}U/L, "
        f"direct bilirubin {patient['Direct_Bilirubin']}μmol/L, "
        f"sleeps {patient['Sleep']} hours, "
        f"and has an SF-36 score of {patient['SF-36']}."
    )


# Descriptions that do not contain the sas_class / sds_class labels.
new_descriptions = [_describe_patient(record) for _, record in data.iterrows()]

# Overwrite the description column of the frame with the generated sentences.
new_data["Description"] = new_descriptions

# Save the new frame as a CSV file (without the index) and print its path.
new_csv_path = 'processed_new_data.csv'
new_data.to_csv(new_csv_path, index=False)
print(new_csv_path)