|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
import mlflow |
|
from sklearn.preprocessing import LabelEncoder |
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score |
|
import streamlit as st |
|
import numpy as np |
|
import joblib |
|
|
|
# Location of the raw salary dataset, relative to the working directory.
url = './archive/ds_salaries.csv'

data = pd.read_csv(url)

# 'Unnamed: 0' is presumably a row index that was saved with the CSV.
# errors='ignore' keeps the script working when the file does not carry
# that column, instead of failing with a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling and produces the same result.
data = data.ffill()
|
|
|
|
|
# Text-valued columns that are replaced by integer codes.
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'salary_currency',
    'employee_residence',
    'company_location',
    'company_size',
]

# One encoder per column; each is kept so later code can list the known
# classes and encode new values the same way.
label_encoders = {column: LabelEncoder() for column in categorical_columns}

for column, encoder in label_encoders.items():
    # Fit on the column and overwrite it in place with its integer codes.
    data[column] = encoder.fit_transform(data[column])
|
|
|
from sklearn.model_selection import train_test_split

# Target is the USD salary; every remaining column is used as a feature.
y = data['salary_in_usd']
X = data.drop(columns=['salary_in_usd'])

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
# Previously trained estimator, loaded from disk.
# NOTE(review): joblib files are pickle-based; only load files you trust.
best_model = joblib.load('best_model.pkl')

st.title("Salary Prediction App")


def _category_select(label, column):
    """Render a dropdown offering the classes the encoder for *column* learned."""
    return st.selectbox(label, label_encoders[column].classes_)


# Input widgets, declared in the order they appear on the page.
work_year = st.number_input('Work Year', min_value=2020, max_value=2024, step=1)
experience_level = _category_select("Experience Level", 'experience_level')
employment_type = _category_select("Employment Type", 'employment_type')
job_title = _category_select("Job Title", 'job_title')
salary = st.number_input('Salary', min_value=0)
salary_currency = _category_select("Salary Currency", 'salary_currency')
employee_residence = _category_select("Employee Residence", 'employee_residence')
remote_ratio = st.slider("Remote Ratio", 0, 100)
company_location = _category_select("Company Location", 'company_location')
company_size = _category_select("Company Size", 'company_size')
|
|
|
|
|
def predict_salary():
    """Predict a salary from the values currently selected in the widgets.

    Reads the module-level widget values, converts each categorical choice
    to its integer code with the matching entry of ``label_encoders``, and
    asks ``best_model`` for a prediction on the resulting single-row array.

    Returns:
        The first (and only) element of ``best_model.predict`` for that row.
    """
    selections = {
        'experience_level': experience_level,
        'employment_type': employment_type,
        'job_title': job_title,
        'salary_currency': salary_currency,
        'employee_residence': employee_residence,
        'company_location': company_location,
        'company_size': company_size,
    }

    # transform() works on sequences, so wrap each choice in a list and
    # take the single resulting code back out.
    codes = {
        column: label_encoders[column].transform([choice])[0]
        for column, choice in selections.items()
    }

    # One row, ten features. The order is positional and presumably has to
    # match the column order the model was trained on -- verify against the
    # training script before reordering.
    feature_row = np.array([[
        work_year,
        codes['experience_level'],
        codes['employment_type'],
        codes['job_title'],
        salary,
        codes['salary_currency'],
        codes['employee_residence'],
        remote_ratio,
        codes['company_location'],
        codes['company_size'],
    ]])

    return best_model.predict(feature_row)[0]
|
|
|
|
|
# Run a prediction only when the user presses the button.
if st.button("Predict Salary"):
    predicted_salary = predict_salary()
    st.write(f"Predicted Salary (in USD): {predicted_salary}")

# Hold-out metrics for the loaded model.
# NOTE(review): shown at top level (on every run, not only after a button
# press); the original nesting could not be recovered -- confirm.
# Predict once and reuse the result for all three metrics.
test_predictions = best_model.predict(X_test)

# mean_squared_error(..., squared=False) is deprecated in scikit-learn 1.4
# and removed in 1.6; the square root of the MSE is the same value and
# works on every version.
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

st.write(f"RMSE: {rmse}")
st.write(f"MAE: {mean_absolute_error(y_test, test_predictions)}")
st.write(f"R²: {r2_score(y_test, test_predictions)}")
|
|