tinyml-backend / preprocessing.py
IDKHowToCodeFr's picture
Upload folder using huggingface_hub
f7b331f verified
Raw
History Blame Contribute Delete
2.71 kB
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
def resolve_model_dir():
    """Return the directory where model artifacts are stored.

    Candidate locations are probed in order and the first one that exists
    on disk wins: '/app/model' (per the original comment, the Docker
    layout), then '../model' relative to the current working directory.
    If neither exists, the relative name 'model' is returned without
    checking that it exists.
    """
    probe_order = ('/app/model', '../model')
    for candidate in probe_order:
        if os.path.exists(candidate):
            return candidate
    return 'model'
def preprocess_data(df, is_training=True):
    """Convert a raw vitals DataFrame into model-ready features and labels.

    Processing steps, in order:
      1. Drop identifier / alert columns that are present.
      2. Split off the 'Predicted Disease' target (if present) and encode it.
      3. Map 'Fall Detection' from 'Yes'/'No' to 1/0 (anything else -> 0).
      4. Fill missing numeric values with the column mean of this frame.
      5. Add the engineered 'Risk_Severity' feature.
      6. Standardize the continuous features.

    Args:
        df: Input DataFrame. It must contain every column listed in
            ``continuous_features`` except 'Risk_Severity' (which is
            derived here). ``df`` itself is not modified; ``drop`` returns
            a new frame that all later steps operate on.
        is_training: When True, fit a fresh ``LabelEncoder`` and
            ``StandardScaler`` and persist them (plus the class list) in
            the model directory. When False, load previously saved
            artifacts if they exist.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the processed feature DataFrame
        and ``y`` is one of: the integer-encoded labels; the raw label
        Series (inference mode with no saved encoder); or ``None`` when
        ``df`` has no 'Predicted Disease' column.

    Raises:
        KeyError: If a required vitals column is missing from ``df``.
    """
    import warnings

    model_dir = resolve_model_dir()
    scaler_path = f'{model_dir}/scaler.pkl'
    label_encoder_path = f'{model_dir}/label_encoder.pkl'

    if is_training:
        # Create the artifact directory up front so that the scaler can be
        # saved even when the frame carries no target column.
        os.makedirs(model_dir, exist_ok=True)

    columns_to_drop = [
        'Patient Number',
        'Data Accuracy (%)',
        'Heart Rate Alert',
        'SpO2 Level Alert',
        'Blood Pressure Alert',
        'Temperature Alert',
    ]
    cols_drop = [c for c in columns_to_drop if c in df.columns]
    X_raw = df.drop(columns=cols_drop)

    y = None
    if 'Predicted Disease' in X_raw.columns:
        y_raw = X_raw['Predicted Disease']
        X_raw = X_raw.drop(columns=['Predicted Disease'])
        if is_training:
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y_raw)
            joblib.dump(label_encoder, label_encoder_path)
            joblib.dump(label_encoder.classes_, f'{model_dir}/classes.pkl')
        elif os.path.exists(label_encoder_path):
            # NOTE: joblib files are pickles; only load artifacts that this
            # pipeline wrote itself.
            label_encoder = joblib.load(label_encoder_path)
            y = label_encoder.transform(y_raw)
        else:
            # Best effort: no encoder has been saved, hand back raw labels.
            y = y_raw

    if 'Fall Detection' in X_raw.columns:
        X_raw['Fall Detection'] = X_raw['Fall Detection'].map({'Yes': 1, 'No': 0}).fillna(0)

    # numeric_only=True keeps this working on pandas >= 2.0, where mean()
    # raises on non-numeric columns instead of silently skipping them.
    # NOTE(review): at inference time this imputes with the mean of the
    # incoming batch, not the training mean; a single-row batch with a
    # missing value therefore stays NaN.
    X_raw = X_raw.fillna(X_raw.mean(numeric_only=True))

    # Engineered Feature: 0, 1 or 2 depending on how many of the two
    # thresholds (heart rate > 105, SpO2 < 94) are crossed.
    X_raw['Risk_Severity'] = (X_raw['Heart Rate (bpm)'] > 105).astype(int) + (X_raw['SpO2 Level (%)'] < 94).astype(int)

    continuous_features = [
        'Heart Rate (bpm)',
        'SpO2 Level (%)',
        'Systolic Blood Pressure (mmHg)',
        'Diastolic Blood Pressure (mmHg)',
        'Body Temperature (°C)',
        'Risk_Severity',
    ]

    if is_training:
        scaler = StandardScaler()
        X_raw[continuous_features] = scaler.fit_transform(X_raw[continuous_features])
        joblib.dump(scaler, scaler_path)
    elif os.path.exists(scaler_path):
        scaler = joblib.load(scaler_path)
        X_raw[continuous_features] = scaler.transform(X_raw[continuous_features])
    else:
        # Keep the best-effort behavior (return unscaled features) but make
        # it visible: a model trained on scaled data will mispredict here.
        warnings.warn(
            f"No fitted scaler found at {scaler_path}; returning unscaled features.",
            RuntimeWarning,
            stacklevel=2,
        )

    return X_raw, y
def get_train_test_split(df):
    """Preprocess ``df`` in training mode and split it 80/20.

    Runs ``preprocess_data`` with ``is_training=True`` (which fits and
    saves the encoder/scaler), then performs a stratified split on the
    encoded labels with a fixed seed.

    Returns:
        The sequence produced by ``train_test_split`` for ``(X, y)``:
        X_train, X_test, y_train, y_test.
    """
    features, labels = preprocess_data(df, is_training=True)
    split_options = {
        'test_size': 0.2,
        'random_state': 42,
        'stratify': labels,
    }
    return train_test_split(features, labels, **split_options)