# Reload the dataset now that we have accurate English column names and specific rules for handling the SAS and SDS columns
import pandas as pd

# English names assigned to the dataset columns, in file order.
columns_english = [
    "Gender", "Education",
    "Hemoglobin", "Total_Protein", "Albumin", "HDL", "AST", "Direct_Bilirubin",
    "Sleep", "SF-36",
    "SAS", "SDS",
]
file_path = "new_data.csv"
# The file itself is unchanged; header=0 discards its own header row and
# `names` supplies the English column names in its place.
data = pd.read_csv(
    file_path,
    names=columns_english,
    header=0,
    encoding="ISO-8859-1",
)
# Education code -> English label.
education_levels = {0: "elementary school", 1: "junior high school",
                    2: "high school/diploma", 3: "bachelor/master/phd"}
# Gender code -> English label.
gender_dict = {0: "male", 1: "female"}


# Classify SAS and SDS scores into severity classes
def classify_sas_sds(score, threshold, mild, moderate, severe):
    """Map a SAS/SDS score to a severity class.

    The cut-offs are interpreted the way the call sites pass them
    (e.g. 50, 59, 69, 70): ``mild`` and ``moderate`` are the *inclusive*
    upper ends of their bands, and ``severe`` is the first severe score.

    Args:
        score: The score to classify.
        threshold: Lowest score that is no longer normal.
        mild: Highest score still classified as mild (inclusive).
        moderate: Highest score still classified as moderate (inclusive).
        severe: Lowest score classified as severe.

    Returns:
        0 (normal), 1 (mild), 2 (moderate) or 3 (severe). ``None`` is
        returned only when the score cannot be ordered against the
        cut-offs (for example NaN for a missing value).
    """
    if score < threshold:
        return 0  # Normal
    if score <= mild:
        return 1  # Mild
    # The second test closes the gap between `moderate` and `severe`, so a
    # fractional score such as 69.5 is not left unclassified.
    if score <= moderate or score < severe:
        return 2  # Moderate
    if score >= severe:
        return 3  # Severe
    # Every comparison above is False for NaN; keep missing values missing.
    return None


# Severity classes, computed directly from the SAS and SDS score columns
sas_classes = [classify_sas_sds(score, 50, 59, 69, 70) for score in data["SAS"]]
sds_classes = [classify_sas_sds(score, 53, 62, 72, 73) for score in data["SDS"]]

# Turn every row into a natural-language description. The SAS/SDS values are
# deliberately left out of the text; they are stored in their own columns.
new_descriptions = []
for _, row in data.iterrows():
    gender = gender_dict[row["Gender"]]
    education = education_levels[row["Education"]]
    description = (
        f"A patient of gender {gender} and educational background {education}, "
        f"has hemoglobin {row['Hemoglobin']}g/L, total protein {row['Total_Protein']}g/L, "
        f"albumin {row['Albumin']}g/L, HDL {row['HDL']}mmol/L, AST {row['AST']}U/L, "
        f"direct bilirubin {row['Direct_Bilirubin']}μmol/L, sleeps {row['Sleep']} hours, "
        f"and has an SF-36 score of {row['SF-36']}."
    )
    new_descriptions.append(description)

# Build the output frame once: the description plus separate class columns
new_data = pd.DataFrame({
    "Description": new_descriptions,
    "SAS_Class": sas_classes,
    "SDS_Class": sds_classes,
})

# Save the new DataFrame as a CSV file
new_csv_path = 'processed_new_data.csv'
new_data.to_csv(new_csv_path, index=False)

print(new_csv_path)