File size: 3,941 Bytes
9739a70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import streamlit as st
import numpy as np
import pandas as pd
import mlflow
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import streamlit as st
import numpy as np
import joblib

# Location of the salaries CSV, relative to the working directory the app is
# launched from.
url = './archive/ds_salaries.csv'  # replace with the actual path

# Load the dataset
data = pd.read_csv(url)

# Drop the leftover index column. errors='ignore' keeps the script working
# when the CSV was exported without that column instead of raising KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')


# Forward-fill missing values, if any. DataFrame.ffill() is the supported
# spelling; fillna(method='ffill') is deprecated in recent pandas releases.
data = data.ffill()

# Encode categorical variables as integer codes. Each fitted encoder is kept
# so the UI can list the original labels and encode the user's selections.
# NOTE(review): the encoders are refit from the CSV on every run; this assumes
# best_model.pkl was trained with the same label-to-code mapping -- confirm.
label_encoders = {}
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location', 'company_size']

for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

from sklearn.model_selection import train_test_split

# Define features and target variable: every remaining column except the
# target is a feature.
X = data.drop('salary_in_usd', axis=1)
y = data['salary_in_usd']

# Split the data into training and testing sets (80/20, fixed seed so the
# held-out rows are the same on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")


# Restore the fitted model from best_model.pkl.
# For reference, the same model was previously fetched from MLflow with
# mlflow.sklearn.load_model("runs:/2f24d11653334bfc8611ef5edbe52bfd/model").
best_model = joblib.load('best_model.pkl')

# Streamlit app
st.title("Salary Prediction App")

# Input widgets. They are declared in the same order as the feature columns
# assembled in predict_salary(); selectboxes offer the labels seen by each
# fitted LabelEncoder.
work_year = st.number_input(
    'Work Year',
    min_value=2020,
    max_value=2024,
    step=1,
)
experience_level = st.selectbox(
    "Experience Level",
    options=label_encoders['experience_level'].classes_,
)
employment_type = st.selectbox(
    "Employment Type",
    options=label_encoders['employment_type'].classes_,
)
job_title = st.selectbox(
    "Job Title",
    options=label_encoders['job_title'].classes_,
)
salary = st.number_input(
    'Salary',
    min_value=0,
)
salary_currency = st.selectbox(
    "Salary Currency",
    options=label_encoders['salary_currency'].classes_,
)
employee_residence = st.selectbox(
    "Employee Residence",
    options=label_encoders['employee_residence'].classes_,
)
remote_ratio = st.slider(
    "Remote Ratio",
    min_value=0,
    max_value=100,
)
company_location = st.selectbox(
    "Company Location",
    options=label_encoders['company_location'].classes_,
)
company_size = st.selectbox(
    "Company Size",
    options=label_encoders['company_size'].classes_,
)


def predict_salary():
    """Predict a salary from the current widget values.

    Reads the module-level widget values, encodes the categorical ones with
    the matching entry of ``label_encoders``, and passes a single-row array
    to ``best_model``.

    Returns:
        The first element of ``best_model.predict`` for that row.
    """
    # Raw categorical selections, keyed by the column whose encoder applies.
    selections = {
        'experience_level': experience_level,
        'employment_type': employment_type,
        'job_title': job_title,
        'salary_currency': salary_currency,
        'employee_residence': employee_residence,
        'company_location': company_location,
        'company_size': company_size,
    }
    # transform() expects a sequence, so wrap the single label and unwrap
    # the single resulting code.
    codes = {
        column: label_encoders[column].transform([choice])[0]
        for column, choice in selections.items()
    }

    # Feature order must stay exactly as listed here: numeric inputs are
    # interleaved with the encoded categoricals.
    feature_row = [
        work_year,
        codes['experience_level'],
        codes['employment_type'],
        codes['job_title'],
        salary,
        codes['salary_currency'],
        codes['employee_residence'],
        remote_ratio,
        codes['company_location'],
        codes['company_size'],
    ]
    model_input = np.array(feature_row).reshape(1, -1)

    # predict() returns one value per row; there is only one row.
    return best_model.predict(model_input)[0]

# Button to trigger prediction
if st.button("Predict Salary"):
    predicted_salary = predict_salary()
    st.write(f"Predicted Salary (in USD): {predicted_salary}")

    # Display model performance metrics on the held-out split. The test-set
    # predictions are computed once and shared by all three metrics.
    test_predictions = best_model.predict(X_test)

    # RMSE is computed as sqrt(MSE): the `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, where passing it raises TypeError.
    rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

    st.write(f"RMSE: {rmse}")
    st.write(f"MAE: {mean_absolute_error(y_test, test_predictions)}")
    st.write(f"R²: {r2_score(y_test, test_predictions)}")