Spaces:
Sleeping
Sleeping
Upload data_loader.py with huggingface_hub
Browse files- data_loader.py +72 -0
data_loader.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader, Dataset
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 7 |
+
|
| 8 |
+
class CardiovascularDataset(Dataset):
    """In-memory dataset pairing feature tensors with integer class labels.

    Args:
        X: Array-like of features; converted to a ``float32`` tensor. The
            first dimension indexes samples.
        y: Array-like of class indices; converted to an ``int64`` tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of samples.
    """

    def __init__(self, X, y):
        # torch.tensor with an explicit dtype replaces the legacy
        # FloatTensor/LongTensor constructors and always copies the input.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

        # __len__ is driven by y while __getitem__ indexes X, so a mismatch
        # would otherwise drop samples silently or fail mid-iteration.
        if self.X.shape[0] != self.y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {self.X.shape[0]} and {self.y.shape[0]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]
|
| 18 |
+
|
| 19 |
+
def get_data_loaders(csv_path, batch_size=32, *, test_size=0.2, random_state=42):
    """Build train/test ``DataLoader`` objects from a cardiovascular risk CSV.

    The CSV is read with pandas and the optional ``Patient_ID`` column is
    dropped. The ``smoking_status`` and ``family_history_heart_disease``
    columns and the ``risk_category`` target are label-encoded; every other
    column is used as a feature as-is. Features are standardised with
    statistics fitted on the training split only, then reshaped to
    ``(samples, num_features, 1)`` so each feature is one step of a sequence.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        batch_size: Batch size used by both loaders.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed passed to ``train_test_split`` for a reproducible
            split.

    Returns:
        A tuple ``(train_loader, test_loader, num_features, num_classes)``.
        The training loader shuffles; the test loader does not.

    Raises:
        KeyError: If any of the required columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    categorical_columns = ('smoking_status', 'family_history_heart_disease')
    target_column = 'risk_category'

    # Fail early with one message naming every missing column instead of a
    # bare pandas KeyError on the first failed lookup.
    missing = [
        column
        for column in (*categorical_columns, target_column)
        if column not in df.columns
    ]
    if missing:
        raise KeyError(f"{csv_path!r} is missing required column(s): {missing}")

    # Patient_ID is an identifier, not a feature; it may be absent.
    df = df.drop(columns=['Patient_ID'], errors='ignore')

    # Encode categorical features, each with its own encoder.
    for column in categorical_columns:
        df[column] = LabelEncoder().fit_transform(df[column])

    # Encode target. LabelEncoder assigns codes in sorted label order, so
    # labels such as High/Low/Medium would map to 0/1/2 respectively.
    le_risk = LabelEncoder()
    df[target_column] = le_risk.fit_transform(df[target_column])

    X = df.drop(columns=[target_column]).values
    y = df[target_column].values

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only so test statistics do not
    # leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reshape for RNN: (batch, seq_len, input_size). The features are treated
    # as a sequence of length num_features with one value per step.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    train_dataset = CardiovascularDataset(X_train, y_train)
    test_dataset = CardiovascularDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    num_features = X_train.shape[1]
    num_classes = len(le_risk.classes_)
    return train_loader, test_loader, num_features, num_classes
|
| 64 |
+
|
| 65 |
+
if __name__ == "__main__":
    # Smoke check: build the loaders from the default CSV and report the
    # dimensions of the data plus the shape of the first training batch.
    loaders = get_data_loaders('cardiovascular_risk_dataset.csv')
    train_loader, test_loader, num_features, num_classes = loaders

    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")

    # Only the first batch is inspected; an empty loader prints nothing.
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        X, y = first_batch
        print(f"Batch X shape: {X.shape}")
        print(f"Batch y shape: {y.shape}")
|