import asyncio
import datetime
import json
import os

import mysql.connector
import numpy as np
import pandas as pd
import requests
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from joblib import dump, load
from mysql.connector import Error
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier
app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def train_the_model():
    # Load and combine the training CSVs into a single DataFrame
    csv_files = ['model/trainer_data.csv', 'model/trainer_data2.csv',
                 'model/trainer_data3.csv', 'model/trainer_data4.csv']
    data_frames = [pd.read_csv(file) for file in csv_files]
    data = pd.concat(data_frames, ignore_index=True)

    # Analyze class distribution
    class_distribution = data['status_name'].value_counts()
    print("Class Distribution before balancing:\n", class_distribution)

    # Get the size of the largest class to match other classes' sizes
    max_class_size = class_distribution.max()

    # Oversample every class up to the majority class size
    oversampled_data = pd.DataFrame()
    for class_name, group in data.groupby('status_name'):
        oversampled_group = resample(group,
                                     replace=True,              # sample with replacement
                                     n_samples=max_class_size,  # to match the majority class
                                     random_state=123)          # for reproducibility
        oversampled_data = pd.concat([oversampled_data, oversampled_group], axis=0)

    # Verify the new class distribution
    print("Class Distribution after oversampling:\n",
          oversampled_data['status_name'].value_counts())
    # Save the balanced dataset if needed:
    # oversampled_data.to_csv('model/trainer_data_balanced.csv', index=False)

    # Print row counts of the earlier datasets for comparison, then train on
    # the freshly oversampled data
    data = pd.read_csv("model/trainer_data_new.csv")
    print(data["customer_name"].count())
    data = pd.read_csv("model/trainer_data_balanced.csv")
    print(data["customer_name"].count())
    data = oversampled_data
    print(data["customer_name"].count())

    # Select the feature and target columns
    selected_columns = ['customer_name', 'customer_address', 'customer_phone_no',
                        'weight', 'cod', 'pickup_address', 'client_number',
                        'destination_city', 'status_name']

    # Handle missing values by dropping incomplete rows
    data_filled = data[selected_columns].dropna()

    # Encode categorical variables
    encoders = {col: LabelEncoder() for col in selected_columns
                if data_filled[col].dtype == 'object'}
    for col, encoder in encoders.items():
        data_filled[col] = encoder.fit_transform(data_filled[col])

    # Split the dataset
    X = data_filled.drop('status_name', axis=1)
    y = data_filled['status_name']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=42)

    # Parameters to use for the model; an earlier, shallower configuration is
    # kept for reference:
    # params = {'colsample_bytree': 0.3, 'learning_rate': 0.6, 'max_depth': 6,
    #           'n_estimators': 100, 'subsample': 0.9,
    #           'use_label_encoder': False, 'eval_metric': 'logloss'}
    params = {
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'max_depth': 30,
        'n_estimators': 600,
        'subsample': 0.9,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    # Initialize the classifier with the specified parameters and train it
    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)

    # Predict on the test set
    y_pred = xgb.predict(X_test)
    y_pred_proba = xgb.predict_proba(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Save the model and the encoders
    model_filename = 'model/transexpress_xgb_model.joblib'
    dump(xgb, model_filename)
    encoders_filename = 'model/transexpress_encoders.joblib'
    dump(encoders, encoders_filename)

    return accuracy, classification_rep, "Model trained with new data"
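# None of the coroutines in this file carry route decorators; a minimal
# sketch of how train_the_model() might be exposed through the app above.
# The "/train" path is an assumption, not taken from the original file:
@app.get("/train")
async def train_endpoint():
    accuracy, report, message = await train_the_model()
    return {"accuracy": accuracy, "classification_report": report, "message": message}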
async def get_data(page: str, paginate: str):
    print("data fetcher running.....")

    # Fetch one page of return-to-client orders from the reporting API
    url = ("https://report.transexpress.lk/api/orders/delivery-success-rate/"
           "return-to-client-orders?page=" + page + "&per_page=" + paginate)
    headers = {
        'Cookie': 'development_trans_express_session=NaFDGzh5WQCFwiortxA6WEFuBjsAG9GHIQrbKZ8B'
    }
    response = requests.get(url, headers=headers)
    json_response = response.json()

    # Extract the 'data' list and flatten it into a DataFrame
    data = json_response["return_to_client_orders"]['data']
    data_count = len(data)
    df = pd.json_normalize(data)

    # Collapse near-equivalent statuses into the classes used for training
    df['status_name'] = df['status_name'].replace('Partially Delivered', 'Delivered')
    df['status_name'] = df['status_name'].replace('Received by Client', 'Returned to Client')
    print("data collected from page : " + page)

    try:
        # Append to the training CSV if it already exists
        file_path = 'model/trainer_data5.csv'
        source_csv = pd.read_csv(file_path)
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv("model/trainer_data5.csv")
        print("data added")
    except FileNotFoundError:
        # First run: create the CSV from this page's data
        df.to_csv("model/trainer_data5.csv")
        print("data created")

    print({"page_number": page, "data_count": data_count})
    return {"page_number": page, "data_count": data_count}
async def get_versions():
    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze
    # freeze.freeze() yields lazily; materialize it so the packages survive
    # being printed below and can still be returned
    pkgs = list(freeze.freeze())
    for pkg in pkgs:
        print(pkg)
    return pkgs
async def model_updated_time():
    try:
        file_size = os.path.getsize("model/transexpress_xgb_model.joblib")
        m_time_encoder = os.path.getmtime('model/transexpress_encoders.joblib')
        m_time_model = os.path.getmtime('model/transexpress_xgb_model.joblib')
        return {"base model created time": datetime.datetime.fromtimestamp(m_time_encoder),
                "last model updated time": datetime.datetime.fromtimestamp(m_time_model),
                "model file size in bytes": file_size}
    except OSError:
        return {"message": "No model found; train the model first using the data fetcher"}
# Database connection parameters
DB_HOST = 'trans-prod-clone-staging.mysql.database.azure.com'
DB_PORT = 3306
DB_DATABASE = 'defaultdb'
DB_USERNAME = 'wwwdata'
DB_PASSWORD = 'fcLa8F3sxgNYQ$K@%'

# Connect to the database; this function is called once per request
def fetch_customer_data(phone_number):
    connection = mysql.connector.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_DATABASE,
        user=DB_USERNAME,
        password=DB_PASSWORD
    )
    try:
        if connection.is_connected():
            print("Connected to the database")

        # Pull the customer's order history joined with its primary status
        query = """
            SELECT
                orders.customer_name AS customer_name,
                orders.address AS customer_address,
                orders.phone_no AS customer_phone_no,
                primary_statuses.name AS status_name
            FROM
                orders
            INNER JOIN
                statuses ON orders.status_id = statuses.id
            INNER JOIN
                primary_statuses ON statuses.name = primary_statuses.key
            WHERE orders.phone_no LIKE %s
        """

        # Execute the query with a parameterized LIKE pattern
        cursor = connection.cursor(dictionary=True)
        cursor.execute(query, (f"%{phone_number}%",))
        results = cursor.fetchall()
        cursor.close()
        return results
    except Error as e:
        print(f"Error: {e}")
        return []
    finally:
        # Always close the connection, even on failure
        connection.close()
        print("Database connection closed")
# Endpoint logic for making predictions
async def predict(
        date: str,
        customer_name: str,
        customer_address: str,
        customer_phone: str,
        weight: float,
        cod: int,
        pickup_address: str,
        client_number: str,
        destination_city: str):
    try:
        # Load the trained model and encoders
        xgb_model = load('model/transexpress_xgb_model.joblib')
        encoders = load('model/transexpress_encoders.joblib')
    except FileNotFoundError:
        return {"message": "No model found; train the model first using the data fetcher"}

    # Handle labels unseen during training by mapping them to -1
    def safe_transform(encoder, column):
        classes = encoder.classes_
        return [encoder.transform([x])[0] if x in classes else -1 for x in column]

    # Convert the input data to a single-row DataFrame
    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone_no': customer_phone,
        'weight': float(weight),
        'cod': int(cod),
        'pickup_address': pickup_address,
        'client_number': client_number,
        'destination_city': destination_city
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables with the same encoders used during training
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = safe_transform(encoders[col], input_df[col])

    # Predict and obtain class probabilities
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)
    from urllib.parse import unquote

    # Split a possibly URL-encoded, slash-separated phone field into a list
    def extract_phone_numbers(customer_phone):
        decoded_phone = unquote(customer_phone)
        # str.split always yields at least one element, so this also covers
        # the single-number case
        return [phone.strip() for phone in decoded_phone.split('/')]

    # Derive a delivery probability from the customer's order history
    def calculate_delivery_factor(phone_number):
        data = fetch_customer_data(phone_number)

        # Keep only the statuses relevant to the success rate
        valid_statuses = ['Failed to Deliver', 'Delivered', 'Returned to Client']
        relevant_orders = [order for order in data if order['status_name'] in valid_statuses]

        if not relevant_orders:
            # No usable history: fall back to an even prior
            base_probability = 0.50
        else:
            delivered_count = sum(1 for order in relevant_orders
                                  if order['status_name'] == 'Delivered')
            base_probability = delivered_count / len(relevant_orders)
            base_probability = max(0.05, min(base_probability, 0.95))

        # Add a narrow random component
        random_component = np.random.uniform(-0.05, 0.05)
        return base_probability + random_component

    try:
        print(customer_phone)
        phone_numbers = extract_phone_numbers(customer_phone)
        print(phone_numbers, "api calling ......")
        probability = calculate_delivery_factor(phone_numbers[0])
        probability = round(probability * 100, 2)
        print(f"new model probability: {probability}")
        predicted_status = "delivered"
    except Exception as e:
        # Fall back to the XGBoost prediction when the history lookup fails
        print(f"Error: {e}")
        predicted_status = ("Unknown" if pred[0] == -1
                            else encoders['status_name'].inverse_transform(pred)[0])
        probability = pred_proba[0][pred[0]] * 100 if pred[0] != -1 else "Unknown"

    print(str(predicted_status), probability)
    if probability != "Unknown":
        if probability > 98:
            probability = probability - 1
        if predicted_status == "Returned to Client":
            probability = 100 - probability
        probability = round(probability, 2)

    return {"Probability": probability, "predicted_status": predicted_status}