# transexpress_ml_api
#
# Sleeping
#
# File size: 9,284 Bytes

import asyncio
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import requests
import pandas as pd
import json
import os,datetime
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load
import numpy as np


# Application instance that the route decorators in this module attach to.
app = FastAPI()

# CORS policy: every origin, method and header is accepted, and credentials
# are allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; confirm this is intended before exposing the API publicly.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)



# Columns read from each order record; 'status_name' is the prediction target.
_SELECTED_COLUMNS = ['customer_name', 'customer_address', 'customer_phone_no',
                     'weight', 'cod', 'pickup_address', 'client_number', 'destination_city',
                     'status_name']
_TARGET_COLUMN = 'status_name'

# Artifacts written by training (paths are relative to the working directory).
_MODEL_FILENAME = 'transexpress_xgb_model.joblib'
_ENCODERS_FILENAME = 'transexpress_encoders.joblib'

# Hyperparameter grid explored by GridSearchCV, and the number of CV folds.
_PARAM_GRID = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.4],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.3, 0.7],
}
_CV_FOLDS = 50


def _select_and_fill(data):
    """Return a new frame holding only the model columns, with NaNs replaced by 'Missing'."""
    return data[_SELECTED_COLUMNS].fillna('Missing')


def _search_and_score(estimator, frame):
    """Grid-search `estimator` on an encoded frame and score the winner on a hold-out split.

    Returns:
        (best_model, best_params, accuracy, classification_report_text)
    """
    features = frame.drop(_TARGET_COLUMN, axis=1)
    target = frame[_TARGET_COLUMN]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42)

    # param_grid must be passed by keyword here: a positional argument after
    # `estimator=` is a SyntaxError.
    grid_search = GridSearchCV(estimator=estimator, param_grid=_PARAM_GRID,
                               cv=_CV_FOLDS, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # With the default refit=True, best_estimator_ has already been refitted on
    # all of X_train with the best parameters, so no second fit is needed.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    return (best_model,
            grid_search.best_params_,
            accuracy_score(y_test, y_pred),
            classification_report(y_test, y_pred))


def _retrain_saved_model(data):
    """Re-run the grid search starting from the saved model and encoders.

    Raises whatever the load/encode/fit steps raise (for example when no model
    has been saved yet); the caller treats any failure as "train a base model".
    """
    encoders = load(_ENCODERS_FILENAME)
    xgb_model = load(_MODEL_FILENAME)

    frame = _select_and_fill(data)

    for col, encoder in encoders.items():
        if col not in frame.columns:
            continue
        column = frame[col]
        # Categories not known to the saved encoder, in order of first
        # appearance. They are appended as individual entries; appending the
        # raw set would add one set-valued element and make transform() fail.
        unseen = column[~column.isin(encoder.classes_)].unique()
        if len(unseen):
            encoder.classes_ = np.append(encoder.classes_, unseen)
        frame[col] = encoder.transform(column)
    # NOTE(review): the extended encoders are not written back to disk (same as
    # before), so categories first seen here remain unknown to anything that
    # later loads the encoder file. Confirm whether they should be persisted.

    # GridSearchCV fits clones of the estimator, so the saved model supplies
    # its configuration; the winning model is fitted on the new data only.
    best_model, _, accuracy, classification_rep = _search_and_score(xgb_model, frame)
    dump(best_model, _MODEL_FILENAME)

    return accuracy, classification_rep, "Model finetuned with new data."


def _train_base_model(data):
    """Fit fresh label encoders and a fresh XGBoost model, then save both."""
    frame = _select_and_fill(data)

    # One encoder per object-dtype column (the target included when it is text).
    encoders = {col: LabelEncoder() for col in _SELECTED_COLUMNS
                if frame[col].dtype == 'object'}
    for col, encoder in encoders.items():
        frame[col] = encoder.fit_transform(frame[col])

    estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    best_model, best_params, accuracy, classification_rep = _search_and_score(estimator, frame)
    print("Best parameters:", best_params)

    dump(best_model, _MODEL_FILENAME)
    dump(encoders, _ENCODERS_FILENAME)

    return accuracy, classification_rep, "base Model trained"


def train_the_model(data):
    """Train the delivery-status classifier on a frame of order records.

    First tries to retrain using the model and encoders already saved on disk.
    If that fails for any reason (no saved artifacts, encoding or fitting
    errors), a base model is trained from scratch instead and both the model
    and its encoders are saved.

    Args:
        data: pandas DataFrame containing at least the columns listed in
            _SELECTED_COLUMNS.

    Returns:
        Tuple (accuracy, classification_report_text, message) where message is
        "Model finetuned with new data." or "base Model trained".

    Raises:
        Whatever the base-training path raises (e.g. KeyError when a required
        column is missing); only failures of the retrain path are absorbed.
    """
    try:
        return _retrain_saved_model(data)
    except Exception as exc:
        # Deliberate best-effort fallback, but report why retraining failed
        # instead of swallowing the error silently.
        print("Retraining the saved model failed, training a base model instead:", repr(exc))
    return _train_base_model(data)
    
@app.get("/trigger_the_data_fecher")
async def your_continuous_function(page: str,paginate: str):
    """Fetch one page of orders from the reporting API and train the model on it.

    Args:
        page: page number forwarded to the upstream API as `page`.
        paginate: page size forwarded to the upstream API as `per_page`.

    Returns:
        Dict with keys "message", "page_number", "data_count", "accuracy" and
        "classification_rep". When the page holds no records, training is
        skipped and the last two values are None.

    Raises:
        HTTPException(502): the upstream request failed, timed out, or returned
            a body without the expected "return_to_client_orders" -> "data" path.
    """
    # Imported locally so this block needs no new file-level import.
    from fastapi import HTTPException

    print("data fetcher running.....")

    # SECURITY: a session cookie is hard-coded below. It should be rotated and
    # supplied through the environment only; the literal is kept as a fallback
    # so existing deployments keep working.
    session_cookie = os.environ.get(
        "TRANSEXPRESS_SESSION_COOKIE",
        'development_trans_express_session=NaFDGzh5WQCFwiortxA6WEFuBjsAG9GHIQrbKZ8B',
    )

    def fetch_orders():
        # `params` lets requests URL-encode the caller-supplied values instead
        # of splicing them into the query string unescaped.
        response = requests.get(
            "https://report.transexpress.lk/api/orders/delivery-success-rate/return-to-client-orders",
            params={"page": page, "per_page": paginate},
            headers={'Cookie': session_cookie},
            timeout=60,  # never wait on the upstream service indefinitely
        )
        response.raise_for_status()
        return response.json()["return_to_client_orders"]['data']

    # The HTTP call and the training are blocking; running them in the default
    # executor keeps the event loop free to serve other requests.
    loop = asyncio.get_running_loop()

    try:
        data = await loop.run_in_executor(None, fetch_orders)
    except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
        raise HTTPException(
            status_code=502,
            detail="could not fetch orders from the reporting API: " + repr(exc),
        ) from exc

    data_count = len(data)
    if not data:
        # An empty page would produce a frame without a 'status_name' column.
        return {"message":"no data found on this page, model not trained",
                "page_number":page,"data_count":data_count,
                "accuracy":None,"classification_rep":None}

    df = pd.json_normalize(data)

    # Collapse status variants into the two classes used for training.
    df['status_name'] = df['status_name'].replace('Partially Delivered', 'Delivered')
    df['status_name'] = df['status_name'].replace('Received by Client', 'Returned to Client')

    print("data collected from page : "+page)

    accuracy,classification_rep,message = await loop.run_in_executor(None, train_the_model, df)

    return {"message":message,"page_number":page,"data_count":data_count,"accuracy":accuracy,"classification_rep":classification_rep}


    

@app.get("/get_latest_model_updated_time")
async def model_updated_time():
    """Report the modification times of the saved encoder and model files.

    Returns:
        Dict with "base model created time " (mtime of the encoders file) and
        "last model updated time" (mtime of the model file), or a dict with a
        single "message" key when either file cannot be read.
    """
    try:
        m_time_encoder = os.path.getmtime('transexpress_encoders.joblib')
        m_time_model = os.path.getmtime('transexpress_xgb_model.joblib')
    except OSError:
        # Missing or unreadable artifact. Returned as a JSON object; the
        # previous set literal was serialised as a one-element array.
        return {"message": "no model found so first trained the model using data fecther"}
    else:
        return {"base model created time ":datetime.datetime.fromtimestamp(m_time_encoder),
                "last model updated time":datetime.datetime.fromtimestamp(m_time_model)}





# Endpoint for making predictions
@app.post("/predict")
def predict(
    customer_name: str,
    customer_address: str,
    customer_phone: str,
    weight: int,
    cod: int,
    pickup_address: str,
    client_number:str,
    destination_city:str
    ):
    """Predict the delivery outcome for one order using the saved model.

    Returns:
        {"Probability": <percentage rounded to 2 decimals>} on success. When the
        predicted status is "Returned to Client" the value is 100 minus that
        class's probability, so the figure appears to express the chance of the
        order NOT being returned (NOTE(review): confirm intended meaning).
        {"Probability": "Unknown"} if the model yields class -1.
        {"message": ...} when the saved model or encoders cannot be read.
    """
    try:
        # Load the trained model and the encoders fitted alongside it
        xgb_model = load('transexpress_xgb_model.joblib')
        encoders = load('transexpress_encoders.joblib')
    except OSError:
        # Missing or unreadable artifact. Returned as a JSON object; the
        # previous set literal was serialised as a one-element array.
        return {"message": "no model found so first trained the model using data fecther"}

    # Encode values with a fitted encoder, mapping labels it has never seen to -1
    def safe_transform(encoder, column):
        classes = encoder.classes_
        return [encoder.transform([x])[0] if x in classes else -1 for x in column]

    # Single-row frame; column names and order mirror the training features
    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone_no': customer_phone,
        'weight': weight,
        'cod': cod,
        'pickup_address':pickup_address,
        'client_number':client_number,
        'destination_city':destination_city
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables using the same encoders used during training.
    # NOTE(review): weight and cod arrive as int; if those columns were encoded
    # as text during training they will never match and are sent as -1.
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = safe_transform(encoders[col], input_df[col])

    # Predict and obtain probabilities
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)

    predicted_class = int(pred[0])
    if predicted_class == -1:
        # Previously this path reached round("Unknown", 2) and raised TypeError.
        print("Unknown")
        return {"Probability": "Unknown"}

    # inverse_transform expects a 1-D sequence of class indices.
    predicted_status = encoders['status_name'].inverse_transform([predicted_class])[0]
    # Cast to a builtin float so the value is JSON-serialisable whatever NumPy
    # scalar type predict_proba returns.
    probability = float(pred_proba[0][predicted_class]) * 100
    print(predicted_status)

    if predicted_status == "Returned to Client":
        probability = 100 - probability

    return {"Probability": round(probability,2)}