File size: 3,941 Bytes
9739a70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import streamlit as st
import numpy as np
import pandas as pd
import mlflow
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import streamlit as st
import numpy as np
import joblib
# Location of the salaries dataset, relative to the working directory.
url = './archive/ds_salaries.csv'  # replace with the actual path

# Load the dataset and discard the 'Unnamed: 0' column.
data = pd.read_csv(url)
data = data.drop(columns='Unnamed: 0')

# Forward-fill missing values, if any. DataFrame.ffill() is the supported
# spelling; fillna(method='ffill') is deprecated in recent pandas releases.
data = data.ffill()

# Encode categorical variables as integer codes. Every fitted encoder is kept
# in label_encoders (keyed by column name) so the same label-to-code mapping
# can be reused later in the script.
label_encoders = {}
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'salary_currency',
    'employee_residence',
    'company_location',
    'company_size',
]
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le
from sklearn.model_selection import train_test_split
# Separate the prediction target from the remaining (feature) columns.
y = data['salary_in_usd']
X = data.drop(columns='salary_in_usd')

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Load the best model from the local joblib file.
# Alternative source kept for reference (not executed):
#   mlflow.sklearn.load_model("runs:/2f24d11653334bfc8611ef5edbe52bfd/model")
best_model = joblib.load('best_model.pkl')
# Streamlit app
st.title("Salary Prediction App")


def _choices(column):
    """Return the labels that the encoder for *column* was fitted on."""
    return label_encoders[column].classes_


# Input features: one widget per model input, rendered in this order.
work_year = st.number_input('Work Year', min_value=2020, max_value=2024, step=1)
experience_level = st.selectbox("Experience Level", _choices('experience_level'))
employment_type = st.selectbox("Employment Type", _choices('employment_type'))
job_title = st.selectbox("Job Title", _choices('job_title'))
salary = st.number_input('Salary', min_value=0)
salary_currency = st.selectbox("Salary Currency", _choices('salary_currency'))
employee_residence = st.selectbox("Employee Residence", _choices('employee_residence'))
remote_ratio = st.slider("Remote Ratio", min_value=0, max_value=100)
company_location = st.selectbox("Company Location", _choices('company_location'))
company_size = st.selectbox("Company Size", _choices('company_size'))
def predict_salary():
    """Predict a salary from the current widget selections.

    Reads the module-level widget values, converts each categorical
    selection to its integer code with the matching fitted encoder from
    ``label_encoders``, and passes a single-row array to ``best_model``.

    Returns:
        The first (and only) element of ``best_model.predict`` for that row.
    """
    # Categorical selections keyed by the encoder that handles them.
    selections = {
        'experience_level': experience_level,
        'employment_type': employment_type,
        'job_title': job_title,
        'salary_currency': salary_currency,
        'employee_residence': employee_residence,
        'company_location': company_location,
        'company_size': company_size,
    }
    codes = {
        column: label_encoders[column].transform([choice])[0]
        for column, choice in selections.items()
    }

    # One row of features. NOTE(review): the order is presumably the column
    # order the model was trained on -- confirm against the training code.
    row = np.array([
        work_year,
        codes['experience_level'],
        codes['employment_type'],
        codes['job_title'],
        salary,
        codes['salary_currency'],
        codes['employee_residence'],
        remote_ratio,
        codes['company_location'],
        codes['company_size'],
    ]).reshape(1, -1)

    return best_model.predict(row)[0]
# Run the model only when the user clicks the button, then show the result.
if st.button("Predict Salary"):
    st.write(f"Predicted Salary (in USD): {predict_salary()}")
# Display model performance metrics computed on the held-out test split.
# Predict once and reuse the result for all three metrics.
test_predictions = best_model.predict(X_test)

# RMSE is taken as the square root of the MSE rather than through
# mean_squared_error(squared=False): that keyword is deprecated in newer
# scikit-learn releases, while this form works regardless of version.
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

st.write(f"RMSE: {rmse}")
st.write(f"MAE: {mean_absolute_error(y_test, test_predictions)}")
st.write(f"R²: {r2_score(y_test, test_predictions)}")
|