# Import necessary libraries
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Load the dataset
file_path = 'diabetes_prediction_dataset.csv'  # Ensure the dataset file is present in the same directory
df = pd.read_csv(file_path)

# Define the target column and create binary labels
target_column = 'hypertension'  # Replace with your target column name
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset.")

threshold_value = 0
df['label'] = (df[target_column] > threshold_value).astype(int)
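
# Optional: inspect the class balance; a skewed label distribution may warrant
# stratified splitting (used below) or class weighting.
print(df['label'].value_counts(normalize=True))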

# Ensure necessary feature columns exist
feature_columns = ['age', 'bmi', 'HbA1c_level']  # Replace with your dataset's feature names
for col in feature_columns:
    if col not in df.columns:
        raise ValueError(f"Feature column '{col}' not found in the dataset.")

# Handle missing values (optional: drop or fill)
df = df.dropna(subset=feature_columns + [target_column])

# Split the dataset into train and test sets (stratified so both splits keep the same label ratio)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Load the tokenizer and model
model_name = "bert-base-uncased"  # Replace with a suitable model for your task
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define a tokenization function
def preprocess_function(examples):
    # Combine features into a single string representation
    inputs = [
        f"age: {age}, bmi: {bmi}, HbA1c: {hba1c}"
        for age, bmi, hba1c in zip(examples["age"], examples["bmi"], examples["HbA1c_level"])
    ]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=32)

# Apply the tokenization function
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
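
# Optional sanity check: inspect one raw row and its serialized, tokenized form
# to confirm the text template looks as expected before committing to a full
# training run (output depends on your dataset's actual values).
print(train_dataset[0])
print(tokenizer.decode(tokenized_train[0]["input_ids"], skip_special_tokens=True))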

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to `eval_strategy` in newer transformers releases
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # requires a compute_metrics function that reports "accuracy"
)

# Define an accuracy metric; without it, metric_for_best_model="accuracy"
# fails because evaluation only reports the loss.
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,  # lets the data collator pad batches correctly
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)