"""Credit-risk analytics and prediction API.

On import this module loads the credit-risk dataset, trains a random-forest
classifier and a logistic-regression model on a fixed set of numeric
features, persists both models under ``models/``, and exposes FastAPI
endpoints for descriptive statistics and for scoring a single loan request.
"""

from pathlib import Path

from fastapi import FastAPI, HTTPException
import pandas as pd
from pydantic import BaseModel
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Initialize FastAPI app
app = FastAPI()

# Load dataset
DATASET_PATH = "credit_risk_dataset.csv"  # Update with actual dataset path
df = pd.read_csv(DATASET_PATH)

# Columns fed to both models, in the order the models expect them.
FEATURES = [
    "loan_amnt",
    "loan_int_rate",
    "person_age",
    "person_income",
    "cb_person_cred_hist_length",
]
TARGET = "loan_status"

CLASSIFIER_PATH = "models/risk_classifier.pkl"
REGRESSOR_PATH = "models/past_due_regressor.pkl"

# LogisticRegression rejects NaN inputs, so rows with a missing feature or
# target are excluded from training. `df` itself is left untouched so the
# analytics endpoints keep operating on the full dataset.
_training_df = df.dropna(subset=FEATURES + [TARGET])
X = _training_df[FEATURES]
y = _training_df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# joblib.dump does not create missing parent directories.
Path(CLASSIFIER_PATH).parent.mkdir(parents=True, exist_ok=True)
Path(REGRESSOR_PATH).parent.mkdir(parents=True, exist_ok=True)

# Train models. The fitted in-memory objects are used directly for serving;
# re-loading the files that were just written would only add a pickle
# round-trip.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)
joblib.dump(classifier, CLASSIFIER_PATH)

# NOTE(review): despite the variable and file name, this is a
# LogisticRegression classifier; it is used for its predict_proba output.
regressor = LogisticRegression()
regressor.fit(X_train, y_train)
joblib.dump(regressor, REGRESSOR_PATH)


# Pydantic models for validation
class LoanRequest(BaseModel):
    """Request body for ``/predict_loan_risk``."""

    loan_amnt: float
    loan_int_rate: float
    person_age: int
    person_income: float
    # Accepted for API compatibility; not part of FEATURES, so it is not
    # passed to either model.
    person_home_ownership: str
    cb_person_cred_hist_length: int


def _default_rate_records(group_columns):
    """Return the mean ``loan_status`` per combination of *group_columns*.

    The result is a list of dicts, one per group, holding each grouping
    column plus a ``default_rate`` entry. A list of records is used instead
    of ``Series.to_dict()`` because a multi-column groupby yields tuple
    keys, which cannot be used as JSON object keys.
    """
    rates = df.groupby(list(group_columns))[TARGET].mean()
    return rates.reset_index(name="default_rate").to_dict(orient="records")


# API Endpoints
@app.get("/loan_status_distribution")
def loan_status_distribution():
    """Return the percentage of rows with ``loan_status`` 1 and 0.

    Raises:
        HTTPException: 400 if the dataset has no ``loan_status`` column.
    """
    if "loan_status" not in df.columns:
        raise HTTPException(status_code=400, detail="Missing 'loan_status' column")
    status_counts = df["loan_status"].value_counts(normalize=True) * 100
    return {
        "default_percentage": float(status_counts.get(1, 0)),
        "non_default_percentage": float(status_counts.get(0, 0)),
    }


@app.get("/payment_timeline_analysis")
def payment_timeline_analysis():
    """Return the mean ``loan_amnt`` for each ``loan_status`` value."""
    grouped = df.groupby("loan_status")["loan_amnt"].mean().to_dict()
    return {"average_loan_amount_by_status": grouped}


@app.get("/principal_amount_patterns")
def principal_amount_patterns():
    """Return the mean ``loan_status`` per (age, income, home ownership) group.

    The value is a list of records; see ``_default_rate_records``.
    """
    demographic_defaults = _default_rate_records(
        ["person_age", "person_income", "person_home_ownership"]
    )
    return {"demographic_default_rates": demographic_defaults}


@app.get("/credit_history_impact")
def credit_history_impact():
    """Return the mean ``loan_status`` per ``cb_person_cred_hist_length``."""
    history_impact = (
        df.groupby("cb_person_cred_hist_length")["loan_status"].mean().to_dict()
    )
    return {"credit_history_default_rates": history_impact}


@app.get("/customer_profile_analysis")
def customer_profile_analysis():
    """Return the mean ``loan_status`` per (age, income, home ownership) group.

    Same grouping as ``/principal_amount_patterns``, returned under a
    different response key. The value is a list of records.
    """
    profile_analysis = _default_rate_records(
        ["person_age", "person_income", "person_home_ownership"]
    )
    return {"customer_profile_default_rates": profile_analysis}


@app.get("/loan_intent_analysis")
def loan_intent_analysis():
    """Return the mean ``loan_status`` for each ``loan_intent`` value."""
    intent_defaults = df.groupby("loan_intent")["loan_status"].mean().to_dict()
    return {"loan_intent_default_rates": intent_defaults}


@app.get("/collection_effectiveness")
def collection_effectiveness():
    """Return the mean ``loan_status`` per ``cb_person_default_on_file`` value.

    NOTE(review): the response key says "success rate", but the figure is
    the mean of ``loan_status`` (labelled "default" elsewhere in this
    module) -- confirm the intended meaning with the API consumers.
    """
    success_rate = (
        df.groupby("cb_person_default_on_file")["loan_status"].mean().to_dict()
    )
    return {"collection_success_rate": success_rate}


@app.get("/risk_score_development")
def risk_score_development():
    """Return the mean ``loan_status`` per distinct combination of FEATURES.

    The value is a list of records; see ``_default_rate_records``.
    """
    risk_factors = _default_rate_records(
        [
            "loan_amnt",
            "loan_int_rate",
            "person_age",
            "person_income",
            "cb_person_cred_hist_length",
        ]
    )
    return {"risk_scores": risk_factors}


@app.post("/predict_loan_risk")
def predict_loan_risk(request: LoanRequest):
    """Score a single loan request with both trained models.

    Returns the random-forest class prediction and the logistic-regression
    probability of class 1.
    """
    # Build a one-row frame with the training column names so scikit-learn
    # sees the same feature names (and order) it was fitted with.
    input_data = pd.DataFrame(
        [
            [
                request.loan_amnt,
                request.loan_int_rate,
                request.person_age,
                request.person_income,
                request.cb_person_cred_hist_length,
            ]
        ],
        columns=FEATURES,
    )
    risk_class = classifier.predict(input_data)[0]
    risk_prob = regressor.predict_proba(input_data)[0][1]
    return {
        "predicted_risk_category": int(risk_class),
        "default_probability": float(risk_prob),
    }