nethke0009 commited on
Commit
9e9465a
1 Parent(s): fdef2dc

Upload 4 files

Browse files
Files changed (4) hide show
  1. Bank_Telemarketing.csv +0 -0
  2. app.py +124 -0
  3. requirements.txt +1 -0
  4. train.py +80 -0
Bank_Telemarketing.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import uuid
import joblib
import json

import gradio as gr
import pandas as pd

from huggingface_hub import CommitScheduler
from pathlib import Path


# Run training at startup so model.joblib exists before it is loaded below.
os.system("python train.py")

# Fitted sklearn pipeline serialized by train.py.
term_deposit_predictor = joblib.load('model.joblib')

# Each app process logs to its own uniquely named JSON-lines file under logs/.
log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
log_folder = log_file.parent

# Periodically commit the local logs folder into the "data/" path of a
# Hugging Face dataset repo. NOTE(review): `every=2` is presumably minutes
# per the CommitScheduler API — confirm against huggingface_hub docs.
scheduler = CommitScheduler(
    repo_id="term-deposit-logs",
    repo_type="dataset",
    folder_path=log_folder,
    path_in_repo="data",
    every=2
)
# --- Gradio input widgets: one per model feature.
# Labels mirror the column names used in train.py's feature lists.
age_input = gr.Number(label="Age")
duration_input = gr.Number(label='Duration(Sec)')
cc_contact_freq_input = gr.Number(label='CC Contact Freq')
days_since_pc_input = gr.Number(label='Days Since PC')
pc_contact_freq_input = gr.Number(label='Pc Contact Freq')
job_input = gr.Dropdown(['admin.', 'blue-collar', 'technician', 'services', 'management',
                         'retired', 'entrepreneur', 'self-employed', 'housemaid', 'unemployed',
                         'student', 'unknown'],label="Job")
marital_input = gr.Dropdown(['married', 'single', 'divorced', 'unknown'],label='Marital Status')
education_input = gr.Dropdown(['experience', 'university degree', 'high school', 'professional.course',
                               'Others', 'illiterate'],label='Education')
defaulter_input = gr.Dropdown(['no', 'unknown', 'yes'],label='Defaulter')
home_loan_input = gr.Dropdown(['yes', 'no', 'unknown'],label='Home Loan')
personal_loan_input = gr.Dropdown(['yes', 'no', 'unknown'],label='Personal Loan')
communication_type_input = gr.Dropdown(['cellular', 'telephone'],label='Communication Type')
last_contacted_input = gr.Dropdown(['may', 'jul', 'aug', 'jun', 'nov', 'apr', 'oct', 'mar', 'sep', 'dec'],label='Last Contacted')
day_of_week_input = gr.Dropdown(['thu', 'mon', 'wed', 'tue', 'fri'],label='Day of Week')
pc_outcome_input = gr.Dropdown(['nonexistent', 'failure', 'success'], label='PC Outcome')


# Output widget: displays the predicted "Subscribed" label.
model_output = gr.Label(label="Subscribed")
def predict_term_deposit(age, duration, cc_contact_freq, days_since_pc, pc_contact_freq, job, marital_status, education,
                         defaulter, home_loan, personal_loan, communication_type, last_contacted,
                         day_of_week, pc_outcome):
    """Predict whether a client will subscribe to a term deposit.

    Builds a one-row DataFrame whose column names match the feature names
    used in train.py, runs the serialized pipeline on it, appends the
    inputs plus the prediction as one JSON line to ``log_file`` (under the
    CommitScheduler's lock so a background commit never sees a partial
    write), and returns the predicted label.
    """
    sample = {
        'Age': age,
        'Duration(Sec)': duration,
        'CC Contact Freq': cc_contact_freq,
        'Days Since PC': days_since_pc,
        'PC Contact Freq': pc_contact_freq,
        'Job': job,
        'Marital Status': marital_status,
        'Education': education,
        'Defaulter': defaulter,
        'Home Loan': home_loan,
        'Personal Loan': personal_loan,
        'Communication Type': communication_type,
        'Last Contacted': last_contacted,
        'Day of Week': day_of_week,
        'PC Outcome': pc_outcome,
    }
    data_point = pd.DataFrame([sample])
    prediction = term_deposit_predictor.predict(data_point).tolist()

    # Reuse `sample` for the log record instead of re-typing the dict: the
    # original duplicated all 15 keys and logged the month under
    # 'Last Month Contacted', inconsistent with the 'Last Contacted'
    # feature name used everywhere else (which would break downstream
    # retraining on the logged data).
    with scheduler.lock:
        with log_file.open("a") as f:
            f.write(json.dumps({**sample, 'prediction': prediction[0]}))
            f.write("\n")

    return prediction[0]
# Wire the widgets to the prediction function. Input order must match the
# positional parameters of predict_term_deposit exactly.
demo = gr.Interface(
    fn=predict_term_deposit,
    inputs=[age_input,
            duration_input,
            cc_contact_freq_input,
            days_since_pc_input,
            pc_contact_freq_input,
            job_input,
            marital_input,
            education_input,
            defaulter_input,
            home_loan_input,
            personal_loan_input,
            communication_type_input,
            last_contacted_input,
            day_of_week_input,
            pc_outcome_input],
    outputs=model_output,
    title="Term Deposit Prediction",
    # Fixed ungrammatical user-facing description ("predict the person who
    # are going to likely subscribe").
    description="This API predicts whether a person is likely to subscribe to the term deposit",
    allow_flagging="auto",
    concurrency_limit=10
)

# Enable the request queue, then serve locally (no public share link).
demo.queue()
demo.launch(share=False)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ scikit-learn==1.4.2
train.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import joblib
3
+ import pandas as pd
4
+
5
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
6
+ from sklearn.compose import make_column_transformer
7
+ from sklearn.impute import SimpleImputer
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.pipeline import make_pipeline
10
+
11
+ from sklearn.model_selection import train_test_split, RandomizedSearchCV
12
+
13
+ from sklearn.linear_model import LogisticRegression
14
+ from sklearn.metrics import accuracy_score, classification_report
15
+
# Load the raw telemarketing dataset from the repo root.
data_df = pd.read_csv("Bank_Telemarketing.csv")

# Target column and the feature columns fed to the model. These names must
# stay in sync with the dict keys built in app.py's predict function.
target = 'subscribed'
numerical_features = ['Age', 'Duration(Sec)', 'CC Contact Freq', 'Days Since PC','PC Contact Freq']
categorical_features = ['Job', 'Marital Status', 'Education', 'Defaulter', 'Home Loan',
                        'Personal Loan', 'Communication Type', 'Last Contacted', 'Day of Week',
                        'PC Outcome']

print("Creating data subsets")

X = data_df[numerical_features + categorical_features]
y = data_df[target]

# 80/20 split, seeded for reproducibility.
# NOTE(review): no stratify=y here — if 'subscribed' is imbalanced (typical
# for telemarketing data), a stratified split would be safer; confirm the
# class balance before changing.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)
# Numeric columns: fill missing values with the median, then standardize.
numeric_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the mode, then one-hot
# encode; categories unseen at fit time are ignored rather than erroring.
categorical_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each feature list through its matching transformer.
preprocessor = make_column_transformer(
    (numeric_transform, numerical_features),
    (categorical_transform, categorical_features),
)

print("Estimating Best Model Pipeline")

# Full pipeline: preprocessing followed by a logistic-regression classifier.
# make_pipeline names the final step "logisticregression", which the search
# space below relies on.
model_pipeline = make_pipeline(
    preprocessor,
    LogisticRegression(n_jobs=-1),
)

# Only the inverse regularization strength C is searched.
search_space = {
    "logisticregression__C": [0.001, 0.01, 0.1, 0.5, 1, 5, 10],
}

# Randomized search: 3 sampled candidates, 3-fold CV, seeded for
# reproducibility. Fitting happens later via rand_search_cv.fit(...).
rand_search_cv = RandomizedSearchCV(
    model_pipeline,
    search_space,
    n_iter=3,
    cv=3,
    random_state=42,
)
# Run the search: fits 3 sampled C values across 3 CV folds each.
rand_search_cv.fit(Xtrain, ytrain)

print("Logging Metrics")
# best_score_ is the mean cross-validated score of the best candidate.
print(f"Accuracy: {rand_search_cv.best_score_}")

print("Serializing Model")

# Persist the refit best pipeline; app.py loads it from this exact path.
model_path = "model.joblib"
joblib.dump(rand_search_cv.best_estimator_, model_path)