Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.preprocessing import LabelEncoder | |
| import joblib | |
# ----------------------------
# 1. Define the symptom keywords you care about (fixed set)
# ----------------------------
# The order of this list defines the order of the model's input features,
# and the list itself is saved next to the model, so keep it a plain list
# and do not reorder entries once a model has been trained.
SYMPTOM_KEYWORDS = [
    "cough",
    "shortness of breath",
    "wheezing",
    "chest pain",
    "fever",
    "sore throat",
    "fatigue",
    "nasal congestion",
]
# ----------------------------
# 2. Load your CSV (labels come ONLY from here)
# ----------------------------
CSV_FILE = "filtered_dataset.csv"  # <- your actual file
TEXT_COL = "Symptoms"  # <- column with symptom descriptions
LABEL_COL = "Disease"  # <- column with disease names

df = pd.read_csv(CSV_FILE)

# Fail early with a readable message when the CSV does not have the expected
# columns, instead of a bare KeyError from dropna() below.
missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f"{CSV_FILE} is missing required column(s) {missing_cols}; "
        f"found columns: {list(df.columns)}"
    )

# Optional: Drop rows with missing symptoms or labels
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).copy()
# Force text so that .lower() works on every row during feature extraction.
df[TEXT_COL] = df[TEXT_COL].astype(str)

# Nothing can be trained from an empty table; stop here rather than failing
# later inside the train/test split.
if df.empty:
    raise ValueError(
        f"{CSV_FILE} has no rows with both '{TEXT_COL}' and '{LABEL_COL}' set."
    )

print(f"Loaded {len(df)} rows from CSV.")
print(f"Unique diseases found: {sorted(df[LABEL_COL].unique())}")
# ----------------------------
# 3. Convert free-text -> binary symptom vector
# ----------------------------
def symptoms_to_binary_vector(text: str, keywords=None) -> list:
    """Encode a free-text symptom description as a 0/1 presence vector.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside a longer word (for example "cough" matches "coughing").

    Args:
        text: Symptom description to scan.
        keywords: Phrases to look for, in output order. When omitted, the
            module-level ``SYMPTOM_KEYWORDS`` list is used.

    Returns:
        A list with one int per keyword, in keyword order: 1 if the keyword
        occurs in ``text``, otherwise 0.
    """
    if keywords is None:
        keywords = SYMPTOM_KEYWORDS
    lowered = text.lower()
    return [1 if keyword.lower() in lowered else 0 for keyword in keywords]
# Apply to every row: one 0/1 list per CSV row, one entry per symptom keyword.
X = df[TEXT_COL].apply(symptoms_to_binary_vector).tolist()
y = df[LABEL_COL].values  # labels directly from CSV
# ----------------------------
# 4. Encode labels (if not already numeric)
# ----------------------------
# The fitted encoder is kept so predictions can be mapped back to the
# original disease names with label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# ----------------------------
# 5. Train model
# ----------------------------
# A stratified split keeps the disease proportions equal in both parts, but
# train_test_split raises ValueError when it cannot be done (for example a
# disease that appears in only one row). Fall back to a plain shuffled split
# in that case instead of aborting the whole training run.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )
except ValueError as split_error:
    print(
        f"Warning: stratified split not possible ({split_error}); "
        "using a non-stratified split instead."
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42
    )

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Report accuracy on the held-out rows so a poor model is noticed before it
# is saved and shipped.
print(f"Test accuracy: {model.score(X_test, y_test):.3f} on {len(y_test)} rows")
# ----------------------------
# 6. Save everything for your app
# ----------------------------
# All three artifacts are needed at prediction time: the classifier, the
# encoder that maps class ids back to disease names, and the keyword list
# that fixes the order of the input features.
joblib.dump(model, "disease_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(SYMPTOM_KEYWORDS, "symptom_keywords.pkl")
print("\n✅ Training complete!")
print("Saved: disease_model.pkl, label_encoder.pkl, symptom_keywords.pkl")