N-I-M-I committed on
Commit
f4135ce
·
verified ·
1 Parent(s): dce5712

Upload data_loader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data_loader.py +72 -0
data_loader.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ from torch.utils.data import DataLoader, Dataset
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
7
+
8
class CardiovascularDataset(Dataset):
    """Map-style dataset pairing feature tensors with integer class labels.

    Args:
        X: Array-like of features (nested list, NumPy array or tensor),
            indexed by sample along the first axis. Stored as ``float32``.
        y: Array-like of integer class labels, one per sample. Stored as
            ``int64``.

    Raises:
        ValueError: If ``X`` and ``y`` disagree on the number of samples.
    """

    def __init__(self, X, y):
        # torch.as_tensor converts lists, NumPy arrays and tensors of any
        # numeric dtype, so callers need not pre-cast their inputs the way
        # the legacy FloatTensor/LongTensor constructors can require.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)

        # Fail early instead of surfacing an IndexError mid-epoch.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                f"X and y must contain the same number of samples, "
                f"got {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]
18
+
19
def get_data_loaders(csv_path, batch_size=32, *, test_size=0.2, random_state=42):
    """Build train/test ``DataLoader`` objects from a cardiovascular risk CSV.

    The CSV must contain the columns ``smoking_status``,
    ``family_history_heart_disease`` and ``risk_category`` (the target).
    An optional ``Patient_ID`` column is dropped. Every remaining column is
    used as a feature.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the data.
        batch_size: Batch size used by both loaders.
        test_size: Share of the rows held out for the test loader, passed
            straight to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(train_loader, test_loader, num_features, num_classes)``.
        Each batch of features has shape ``(batch, num_features, 1)``.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    required = ('smoking_status', 'family_history_heart_disease', 'risk_category')
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"{csv_path}: missing required column(s): {missing}")

    # Patient_ID is an identifier, not a predictive feature.
    if 'Patient_ID' in df.columns:
        df = df.drop('Patient_ID', axis=1)

    # Integer-encode the categorical feature columns.
    for column in ('smoking_status', 'family_history_heart_disease'):
        df[column] = LabelEncoder().fit_transform(df[column])

    # Integer-encode the target. LabelEncoder numbers classes in sorted
    # order, so string labels High/Low/Medium would become 0/1/2
    # (alphabetical, not ordinal). The fitted encoder is kept because its
    # classes_ gives the class count returned below.
    le_risk = LabelEncoder()
    df['risk_category'] = le_risk.fit_transform(df['risk_category'])

    X = df.drop('risk_category', axis=1).values
    y = df['risk_category'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Standardize every feature column (encoded categoricals included).
    # The scaler is fit on the training split only, so test-set statistics
    # never leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reshape for an RNN as (batch, seq_len, input_size): each feature is
    # treated as one step of a sequence carrying a single value.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    train_dataset = CardiovascularDataset(X_train, y_train)
    test_dataset = CardiovascularDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, X_train.shape[1], len(le_risk.classes_)
64
+
65
if __name__ == "__main__":
    # Smoke test: build the loaders from the default CSV and report the
    # dataset dimensions plus the shape of the first training batch.
    loaders = get_data_loaders('cardiovascular_risk_dataset.csv')
    train_loader, test_loader, num_features, num_classes = loaders

    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")

    # Peek at a single batch only; an empty loader prints nothing.
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        X, y = first_batch
        print(f"Batch X shape: {X.shape}")
        print(f"Batch y shape: {y.shape}")