# Reload the dataset now that we have the exact English column names and the
# rules for turning the SAS and SDS scores into severity classes.
import pandas as pd

# English names applied to the CSV columns, in file order.
columns_english = [
    "Gender",
    "Education",
    "Hemoglobin",
    "Total_Protein",
    "Albumin",
    "HDL",
    "AST",
    "Direct_Bilirubin",
    "Sleep",
    "SF-36",
    "SAS",
    "SDS",
]

file_path = "new_data.csv"
new_csv_path = "processed_new_data.csv"

# English wording for the coded education level.
education_levels = {
    0: "elementary school",
    1: "junior high school",
    2: "high school/diploma",
    3: "bachelor/master/phd",
}

# English wording for the coded gender.
gender_dict = {
    0: "male",
    1: "female",
}


def classify_sas_sds(score, threshold, mild, moderate, severe):
    """Map a SAS/SDS score to a severity class.

    The bands are read as inclusive ranges, matching how the script calls
    this function (e.g. ``50, 59, 69, 70`` means 50-59 mild, 60-69 moderate,
    70 and above severe).

    Args:
        score: The score to classify.
        threshold: Lowest score that is no longer considered normal.
        mild: Highest score (inclusive) still classified as mild.
        moderate: Highest score (inclusive) of the moderate band. Kept for
            interface compatibility; every score above ``mild`` and below
            ``severe`` is treated as moderate so no value falls in a gap.
        severe: Lowest score classified as severe.

    Returns:
        0 (normal), 1 (mild), 2 (moderate) or 3 (severe). ``None`` is
        returned only when the score cannot be ordered at all (e.g. NaN).
    """
    if score < threshold:
        return 0  # Normal
    if score <= mild:
        return 1  # Mild
    if score < severe:
        return 2  # Moderate
    if score >= severe:
        return 3  # Severe
    # Every comparison above is False for NaN: leave the class missing.
    return None


def describe_patient(row):
    """Return the natural-language description for one patient record.

    Args:
        row: Mapping (``dict`` or ``pandas.Series``) keyed by the names in
            ``columns_english``. The SAS/SDS values are deliberately left
            out of the text.

    Raises:
        KeyError: If the gender or education code has no English wording.
    """
    gender = gender_dict[row["Gender"]]
    education = education_levels[row["Education"]]
    return (
        f"A patient of gender {gender} and educational background {education}, "
        f"has hemoglobin {row['Hemoglobin']}g/L, "
        f"total protein {row['Total_Protein']}g/L, "
        f"albumin {row['Albumin']}g/L, "
        f"HDL {row['HDL']}mmol/L, "
        f"AST {row['AST']}U/L, "
        f"direct bilirubin {row['Direct_Bilirubin']}μmol/L, "
        f"sleeps {row['Sleep']} hours, "
        f"and has an SF-36 score of {row['SF-36']}."
    )


def build_processed_frame(data):
    """Build the output frame: description text plus SAS/SDS classes.

    Args:
        data: DataFrame whose columns are named as in ``columns_english``.

    Returns:
        A new DataFrame with the columns ``Description``, ``SAS_Class`` and
        ``SDS_Class``, one row per row of ``data``.
    """
    # iterrows() is kept for the text so numbers are rendered exactly as the
    # row-wise Series exposes them.
    descriptions = [describe_patient(row) for _, row in data.iterrows()]
    return pd.DataFrame(
        {
            "Description": descriptions,
            "SAS_Class": [
                classify_sas_sds(score, 50, 59, 69, 70) for score in data["SAS"]
            ],
            "SDS_Class": [
                classify_sas_sds(score, 53, 62, 72, 73) for score in data["SDS"]
            ],
        }
    )


def main():
    """Load the raw CSV, build the processed frame and save it as CSV."""
    # The file itself is unchanged, so simply reload it with the new names
    # replacing the original header row.
    data = pd.read_csv(
        file_path,
        encoding="ISO-8859-1",
        names=columns_english,
        header=0,
    )
    new_data = build_processed_frame(data)
    new_data.to_csv(new_csv_path, index=False)
    print(new_csv_path)
    return new_data


if __name__ == "__main__":
    main()