import streamlit as st
import pandas as pd
import pickle
from sklearn.impute import SimpleImputer
import numpy as np

# Load the trained model and preprocessing objects using pickle
with open('random_forest_model.pkl', 'rb') as f:
    random_forest_model = pickle.load(f)

with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('label_encoders.pkl', 'rb') as f:
    label_encoders = pickle.load(f)
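# For reference, a minimal sketch of how artifacts like these might be produced at
# training time (hypothetical names X_train, y_train, cat_cols, num_cols, X_train_encoded;
# not part of this app):
#
#     from sklearn.ensemble import RandomForestClassifier
#     from sklearn.preprocessing import LabelEncoder, StandardScaler
#
#     label_encoders = {c: LabelEncoder().fit(X_train[c]) for c in cat_cols}
#     scaler = StandardScaler().fit(X_train[num_cols])
#     random_forest_model = RandomForestClassifier().fit(X_train_encoded, y_train)
#     for fname, obj in [('random_forest_model.pkl', random_forest_model),
#                        ('scaler.pkl', scaler),
#                        ('label_encoders.pkl', label_encoders)]:
#         with open(fname, 'wb') as f:
#             pickle.dump(obj, f)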

# State corrections and valid states/UTs
state_corrections = {
    'uttaranchal': 'uttarakhand',
    'orissa (odisha)': 'odisha',
    'kashmir': 'jammu and kashmir',
    'multi state': 'other',
    'not classified': 'other'
}

valid_states_uts = [
    'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh', 'goa',
    'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka', 'kerala',
    'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram', 'nagaland',
    'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu', 'telangana', 'tripura',
    'uttar pradesh', 'uttarakhand', 'west bengal', 'andaman and nicobar islands',
    'chandigarh', 'dadra and nagar haveli and daman and diu', 'lakshadweep', 'delhi',
    'puducherry', 'jammu and kashmir', 'ladakh'
]

# Extract city, state, and country from the hyphen-separated "Location" string
def extract_city(x):
    if isinstance(x, str):
        parts = x.split("-")
        # Four-part locations carry a two-word city split across the first two parts
        if len(parts) == 4:
            return f"{parts[0].strip().lower()} {parts[1].strip().lower()}"
        return parts[0].strip().lower()
    return "other"

def extract_state(x):
    if isinstance(x, str):
        parts = x.split("-")
        if len(parts) < 2:
            return "other"
        state = parts[-2].strip().lower()
        # Normalise known variants, and fall back to 'other' for unrecognised states
        return state_corrections.get(state, state if state in valid_states_uts else 'other')
    return "other"

def extract_country(x):
    if isinstance(x, str):
        return x.split("-")[-1].strip().lower()
    return "other"


def preprocess_new_data(df):
    # Work on a copy so the uploaded dataframe is not mutated in place
    df = df.copy()
    df['Ownership'] = df['Ownership'].str.lower().str.strip()
    df[' Type of Tender '] = df[' Type of Tender '].str.lower().str.strip()

    def parse_closing_date(date_str):
        # Handles plain dates and "start to end" ranges; non-strings become NaT
        if not isinstance(date_str, str):
            return pd.NaT
        try:
            return pd.to_datetime(date_str)
        except Exception:
            if " to " in date_str:
                return pd.to_datetime(date_str.split(" to ")[-1], errors='coerce')
            return pd.NaT
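    # Illustrative behaviour of parse_closing_date (example values only):
    #   "2024-01-15"                 -> Timestamp('2024-01-15')
    #   "01-01-2024 to 15-01-2024"   -> parsed from the end of the range
    #   NaN or unparseable strings   -> NaT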

    df['Closing Date'] = df['Closing Date'].apply(parse_closing_date)
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['days_left'] = (df['Closing Date'] - df['Date']).dt.days

    df['city'] = df['Location'].apply(extract_city)
    df['state'] = df['Location'].apply(extract_state)
    df['country'] = df['Location'].apply(extract_country)

    df['city'] = df['city'].fillna("other")
    df['state'] = df['state'].fillna("other")
    df['country'] = df['country'].fillna("other")

    # Strip thousands separators and convert numerical columns to floats
    numerical_columns = ['Earnest Money', 'Estimated Cost', 'DocFees']
    for col in numerical_columns:
        df[col] = df[col].replace({',': ''}, regex=True).astype(float)

    # Keep only the reference number plus the features the model expects
    df = df[['Ref No', 'Earnest Money', 'Estimated Cost', 'DocFees', 'Ownership', ' Type of Tender ', 'days_left', 'city', 'state', 'country']].copy()

    # Impute missing days_left with the median of the uploaded data
    # (no fitted imputer is loaded alongside the model, so one is fit here)
    imputer = SimpleImputer(strategy='median')
    df['days_left'] = imputer.fit_transform(df[['days_left']])

    for column in ['Ownership', ' Type of Tender ', 'city', 'state', 'country']:
        le = label_encoders[column]

        # Add 'other' to the classes if it's not already there
        if 'other' not in le.classes_:
            le.classes_ = np.append(le.classes_, 'other')

        # Replace unseen labels with 'other'
        df[column] = df[column].apply(lambda x: x if x in le.classes_ else 'other')
        df[column] = le.transform(df[column])
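    # Note: any category not seen at training time collapses to the single shared
    # 'other' code before encoding.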

    numerical_features = ['Earnest Money', 'Estimated Cost', 'DocFees', 'days_left']
    df[numerical_features] = scaler.transform(df[numerical_features])

    return df



def predict_new_data(new_data):
    preprocessed_data = preprocess_new_data(new_data)
    X_new = preprocessed_data.drop(columns=['Ref No'])
    tender_ref_numbers_new = preprocessed_data['Ref No']
    predictions = random_forest_model.predict(X_new)
    results = pd.DataFrame({
        'Ref No': tender_ref_numbers_new,
        'predictions': predictions
    })
    
    return results
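# Example of calling the pipeline outside Streamlit (assumes a local CSV named
# 'tenders.csv' with the columns used in preprocess_new_data; illustrative only):
#
#     sample = pd.read_csv('tenders.csv')
#     print(predict_new_data(sample).head())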

st.title("Tender Selection Prediction")
uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

if uploaded_file is not None:
    new_data = pd.read_csv(uploaded_file)
    prediction_results = predict_new_data(new_data)
    
    selected_tenders = prediction_results[prediction_results['predictions'] == "yes"]['Ref No'].astype(str).to_list()
    new_data['Ref No'] = new_data['Ref No'].astype(str)

    st.write("Selected Tenders:")
    st.write(new_data[new_data['Ref No'].isin(selected_tenders)].drop(columns=['Unnamed: 0']).reset_index().drop(columns=['index']))