# Streamlit app: predicts `salary_in_usd` from the fields of ds_salaries.csv
# using a pre-trained model loaded from disk, and reports the model's
# RMSE / MAE / R² on a held-out 20% split of the same dataset.

import joblib
import mlflow
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

url = './archive/ds_salaries.csv'  # replace with the actual path

# Load the dataset
data = pd.read_csv(url)
# Drop the saved index column. errors='ignore' keeps the app working with
# CSV exports that do not contain it; the remaining columns are the same.
data = data.drop(columns='Unnamed: 0', errors='ignore')

# Fill missing values if any (forward fill). `fillna(method='ffill')` is
# deprecated in recent pandas releases; `ffill()` is the equivalent call.
data = data.ffill()

# Encode categorical variables. The fitted encoders are kept so that the UI
# can offer the original class labels and map a selection back to its code.
label_encoders = {}
categorical_columns = [
    'experience_level',
    'employment_type',
    'job_title',
    'salary_currency',
    'employee_residence',
    'company_location',
    'company_size',
]
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Define features and target variable
X = data.drop('salary_in_usd', axis=1)
y = data['salary_in_usd']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Load the best model.
# Alternative source (MLflow run), kept for reference:
# best_model = mlflow.sklearn.load_model("runs:/2f24d11653334bfc8611ef5edbe52bfd/model")
# NOTE(review): joblib.load unpickles the file — only load trusted artifacts.
best_model = joblib.load('best_model.pkl')

# Streamlit app
st.title("Salary Prediction App")

# Input features
work_year = st.number_input('Work Year', min_value=2020, max_value=2024, step=1)
experience_level = st.selectbox(
    "Experience Level", label_encoders['experience_level'].classes_
)
employment_type = st.selectbox(
    "Employment Type", label_encoders['employment_type'].classes_
)
job_title = st.selectbox("Job Title", label_encoders['job_title'].classes_)
salary = st.number_input('Salary', min_value=0)
salary_currency = st.selectbox(
    "Salary Currency", label_encoders['salary_currency'].classes_
)
employee_residence = st.selectbox(
    "Employee Residence", label_encoders['employee_residence'].classes_
)
remote_ratio = st.slider("Remote Ratio", 0, 100)
company_location = st.selectbox(
    "Company Location", label_encoders['company_location'].classes_
)
company_size = st.selectbox("Company Size", label_encoders['company_size'].classes_)


def predict_salary():
    """Predict the USD salary for the values currently selected in the UI.

    Reads the module-level widget values, label-encodes the categorical ones
    with the encoders fitted on the dataset, and passes a single-row array to
    ``best_model.predict``.

    Returns:
        The first element of the model's prediction for that row.
    """
    selections = {
        'experience_level': experience_level,
        'employment_type': employment_type,
        'job_title': job_title,
        'salary_currency': salary_currency,
        'employee_residence': employee_residence,
        'company_location': company_location,
        'company_size': company_size,
    }
    # transform() takes a sequence and returns an array; [0] unwraps the
    # single encoded value.
    encoded = {
        name: label_encoders[name].transform([value])[0]
        for name, value in selections.items()
    }

    # Feature order is intended to match the training data format (the
    # columns of X) — NOTE(review): verify against the CSV column order.
    input_features = np.array([
        work_year,
        encoded['experience_level'],
        encoded['employment_type'],
        encoded['job_title'],
        salary,
        encoded['salary_currency'],
        encoded['employee_residence'],
        remote_ratio,
        encoded['company_location'],
        encoded['company_size'],
    ]).reshape(1, -1)

    return best_model.predict(input_features)[0]


# Button to trigger prediction
if st.button("Predict Salary"):
    predicted_salary = predict_salary()
    st.write(f"Predicted Salary (in USD): {predicted_salary}")

# Display model performance metrics
# NOTE(review): rendered on every rerun, independent of the button — confirm
# this placement is intended.
# Predict on the test split once and reuse the result for all three metrics.
test_predictions = best_model.predict(X_test)
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
st.write(f"RMSE: {rmse}")
st.write(f"MAE: {mean_absolute_error(y_test, test_predictions)}")
st.write(f"R²: {r2_score(y_test, test_predictions)}")