Andy Lau committed on
Commit
b6af401
·
1 Parent(s): 5f9dcc2

working model CC

Browse files
Files changed (4) hide show
  1. app.py +262 -15
  2. main_app.py +15 -0
  3. models/models.sav +0 -0
  4. requirements.txt +1 -0
app.py CHANGED
@@ -1,9 +1,55 @@
 
1
  import streamlit as st
2
  import pandas as pd
3
  import PIL
 
 
 
 
 
 
 
4
 
5
  from bokeh.models.widgets import Div
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  def url_button(button_name,url):
9
  if st.button(button_name):
@@ -12,20 +58,89 @@ def url_button(button_name,url):
12
  div = Div(text=html)
13
  st.bokeh_chart(div)
14
 
 
 
 
 
 
 
 
15
 
 
 
 
 
 
 
16
 
17
- st.markdown("# Character Counter: Email Industry")
 
 
 
18
 
19
- col1, col2, col3 = st.columns([1,1,1])
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- with col2:
22
- img = PIL.Image.open("figures/ModelCC.png")
23
- st.image(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
26
 
27
  with stats_col1:
28
- st.metric(label="Production", value="Ready")
29
  with stats_col2:
30
  st.metric(label="Accuracy", value="85%")
31
 
@@ -36,15 +151,26 @@ with stats_col4:
36
  st.metric(label="Industry", value="Email")
37
 
38
 
39
- with st.sidebar:
40
 
 
41
 
42
  with st.expander('Model Description', expanded=False):
 
 
43
  st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')
44
 
 
 
 
 
 
 
 
 
 
45
 
46
- url_button('Model Homepage','https://www.loxz.com/#/models/CC')
47
- url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
48
  url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
49
 
50
 
@@ -52,12 +178,12 @@ industry_lists = [
52
  'Retail',
53
  'Software and Technology',
54
  'Hospitality',
55
- 'Adacemic and Education',
56
  'Healthcare',
57
  'Energy',
58
  'Real Estate',
59
  'Entertainment',
60
- 'Fianance and Banking'
61
  ]
62
 
63
  campaign_types = [
@@ -74,12 +200,19 @@ campaign_types = [
74
  ]
75
 
76
  target_variables = [
77
- 'open_rate',
78
- 'click_through_rate',
79
- 'abandoned_cart',
80
- 'unsubscribe_rate'
81
  ]
82
 
 
 
 
 
 
 
 
 
 
83
  industry = st.selectbox(
84
  'Please select your industry',
85
  industry_lists
@@ -94,3 +227,117 @@ target = st.selectbox(
94
  'Please select your target variable',
95
  target_variables
96
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ast import arg
2
  import streamlit as st
3
  import pandas as pd
4
  import PIL
5
+ import re
6
+ from io import StringIO
7
+ import boto3
8
+
9
+ # from joblib import dump, load
10
+
11
+ import joblib
12
 
13
  from bokeh.models.widgets import Div
14
 
15
+ import email
16
+ from urlextract import URLExtract
17
+
18
+ import main_app
19
+
20
+
21
def table_data():
    """Build the two-column table shown under "Model Information".

    Returns:
        pandas.DataFrame: one row per model attribute, with the attribute
        label in the ``Field`` column and its value in the ``Data`` column.
    """
    # Each label is kept next to its value so the pairs cannot drift apart.
    rows = [
        ('Data Scientist', 'Chen Song'),
        ('Dataset', 'Internal + Campaign monitor'),
        ('Algorithm', 'Random Forest'),
        ('Framework', 'Sci-kit learn'),
        ('Ensemble', 'Bootstrapping'),
        ('Domain', 'Bootstrapping Aggregation'),
        ('Model Size', '4 KB'),
    ]

    columns = {
        'Field': [label for label, _ in rows],
        'Data': [value for _, value in rows],
    }
    return pd.DataFrame.from_dict(columns)
51
+
52
+
53
 
54
  def url_button(button_name,url):
55
  if st.button(button_name):
 
58
  div = Div(text=html)
59
  st.bokeh_chart(div)
60
 
61
def get_industry_code_dict(training_dataset):
    """Map each industry name to its integer category code.

    Codes are the pandas categorical codes of the ``industry`` column, i.e.
    the position of each name in the sorted set of distinct values.

    The input frame is left untouched. An earlier version added an
    ``industry_code`` column to it and converted every categorical column
    to integer codes as a side effect.

    Args:
        training_dataset (pandas.DataFrame): frame with an ``industry`` column.

    Returns:
        dict: ``{industry name: category code}``.

    Raises:
        KeyError: if ``training_dataset`` has no ``industry`` column.
    """
    industries = training_dataset['industry']
    # astype() returns a new Series, so the caller's data is never modified.
    codes = industries.astype('category').cat.codes
    return dict(zip(industries, codes))
67
+
68
 
69
+ ## extract email body from parse email
70
def email_body_extractor(email_data):
    """Pull the text body out of an uploaded email and measure it.

    Args:
        email_data (bytes): UTF-8 encoded email content.

    Returns:
        tuple: ``(body, character_cnt, url_cnt)`` where ``body`` is the
        cleaned text up to the first '©', ``character_cnt`` is the number of
        non-whitespace characters in that text, and ``url_cnt`` is the number
        of URLs found in the body before they were stripped out.
    """
    message = email.message_from_string(email_data.decode("utf-8"))

    body = ""
    if not message.is_multipart():
        # Single-part message: the payload is the body itself.
        body = message.get_payload()
    else:
        for part in message.walk():
            disposition = str(part.get('Content-Disposition'))
            is_plain_text = part.get_content_type() == 'text/plain'
            # Take the first text/plain part that is not an attachment.
            if is_plain_text and 'attachment' not in disposition:
                body = part.get_payload()
                break

    # Strip line breaks, tabs and bold tags.
    for token in ('\n', '\t', '\r', '</b>', '<b>'):
        body = body.replace(token, '')

    # Count URLs first, then remove them so they do not inflate the length.
    url_cnt = len(URLExtract().find_urls(body))
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)

    # Everything from the first copyright sign onwards is dropped.
    body = body.partition('©')[0]
    character_cnt = sum(1 for ch in body if not ch.isspace())

    return body, character_cnt, url_cnt
107
+
108
+ # def select_char_preference_variables():
109
+ # opt_list = ["Increase", "Decrease"]
110
+ # button_option = widgets.RadioButtons(options = opt_list)
111
+ # print("Do you want to increase or decrease your character count in the email?")
112
+ # display(button_option)
113
+ # return button_option
114
+
115
+
116
def add_bg_from_url():
    """Inject CSS that gives the Streamlit app a fixed gradient background."""
    # Plain string: there is nothing to interpolate, so no f-string is needed.
    background_css = """
    <style>
    .stApp {
        background-image: linear-gradient(#0A3144,#126072,#1C8D99);
        background-attachment: fixed;
        background-size: cover
    }
    </style>
    """
    st.markdown(background_css, unsafe_allow_html=True)
129
+
130
+ # add_bg_from_url()
131
+
132
+ st.markdown("# Character Count: Email Industry")
133
+
134
+ # col1, col2, col3 = st.columns([1,1,1])
135
+
136
+ # with col2:
137
+ # img = PIL.Image.open("figures/ModelCC_solid.png")
138
+ # st.image(img)
139
 
140
  stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
141
 
142
  with stats_col1:
143
+ st.metric(label="Production", value="Production")
144
  with stats_col2:
145
  st.metric(label="Accuracy", value="85%")
146
 
 
151
  st.metric(label="Industry", value="Email")
152
 
153
 
 
154
 
155
+ with st.sidebar:
156
 
157
  with st.expander('Model Description', expanded=False):
158
+ img = PIL.Image.open("figures/ModelCC.png")
159
+ st.image(img)
160
  st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')
161
 
162
+ with st.expander('Model Information', expanded=False):
163
+ hide_table_row_index = """
164
+ <style>
165
+ thead tr th:first-child {display:none}
166
+ tbody th {display:none}
167
+ </style>
168
+ """
169
+ st.markdown(hide_table_row_index, unsafe_allow_html=True)
170
+ st.table(table_data())
171
 
172
+ url_button('Model Homepage','https://www.loxz.com/#/models/CTA')
173
+ # url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
174
  url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
175
 
176
 
 
178
  'Retail',
179
  'Software and Technology',
180
  'Hospitality',
181
+ 'Academic and Education',
182
  'Healthcare',
183
  'Energy',
184
  'Real Estate',
185
  'Entertainment',
186
+ 'Finance and Banking'
187
  ]
188
 
189
  campaign_types = [
 
200
  ]
201
 
202
  target_variables = [
203
+ 'conversion_rate',
204
+ 'click_to_open_rate'
 
 
205
  ]
206
 
207
+ uploaded_file = st.file_uploader("Please upload your email (In HTML Format)", type=["html"])
208
+
209
+ if uploaded_file is None:
210
+ # upload_img = PIL.Image.open(uploaded_file)
211
+ upload_img = None
212
+ # else:
213
+ # upload_img = None
214
+
215
+
216
  industry = st.selectbox(
217
  'Please select your industry',
218
  industry_lists
 
227
  'Please select your target variable',
228
  target_variables
229
  )
230
+
231
+ st.markdown("""---""")
232
+
233
+ char_reco_preference = st.selectbox(
234
+ 'Do you want to increase or decrease your character count in the email?',
235
+ ["Increase", "Decrease"])
236
+
237
+
238
def get_files_from_aws(bucket,prefix):
    """
    Download a CSV object from an AWS S3 bucket and parse it.

    bucket (STRING): bucket name
    prefix (STRING): key (file location) of the object inside the bucket

    Returns the object's UTF-8 text parsed by pandas as a DataFrame.
    Credentials are read from the Streamlit secrets "aws_id" and "aws_key".
    """
    credentials = {
        'aws_access_key_id': st.secrets["aws_id"],
        'aws_secret_access_key': st.secrets["aws_key"],
    }
    s3_client = boto3.client('s3', **credentials)

    response = s3_client.get_object(Bucket=bucket, Key=prefix)
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))
256
+
257
+
258
+
259
+
260
# Run the prediction and recommendation flow when the user clicks the button.
if st.button('Generate Predictions'):
    if uploaded_file is None:
        st.error('Please upload a email (HTML format)')
    else:
        placeholder = st.empty()
        placeholder.text('Loading Data')

        # NOTE(review): joblib.load unpickles the file; only load model files
        # that come from a trusted source.
        model = joblib.load('models/models.sav')

        # Raw campaign data, used here only to derive the industry -> code map.
        email_data = get_files_from_aws('emailcampaigntrainingdata','trainingdata/email_dataset_training.csv')
        # Dataset the recommendations are drawn from.
        training_dataset = get_files_from_aws('emailcampaigntrainingdata','modelCC/training.csv')

        industry_code_dict = get_industry_code_dict(email_data)
        industry_code = industry_code_dict.get(industry)
        if industry_code is None:
            # Without a code the model input would contain a missing value.
            placeholder.empty()
            st.error('No training data is available for the industry "{}".'.format(industry))
            st.stop()

        email_body, character_cnt, url_cnt = email_body_extractor(uploaded_file.getvalue())

        # Single-row feature frame in the column order used for prediction.
        df_uploaded_test = pd.DataFrame(
            [[industry_code, character_cnt, url_cnt]],
            columns=["industry_code", "character_cnt", "url_cnt"],
        )
        predicted_rate = model.predict(df_uploaded_test)[0]
        output_rate = round(predicted_rate*100,2)

        if output_rate < 0:
            # Shown in the app; a print() would only reach the server log.
            st.error("Sorry, Current model couldn't provide predictions on the target variable you selected.")
        else:
            st.info('Current Character Count in Your Email is: {}'.format(character_cnt))
            st.info('The model predicts that it achieves a {} of {}%'.format(target, str(output_rate)))

            # target -> (column in the training data, name shown to the user)
            rate_columns = {
                "click_to_open_rate": ("Open_Rate", "Click-to-Open_Rate"),
                "conversion_rate": ("Click_Through_Rate", "Conversion_Rate"),
            }
            selected_variable, display_name = rate_columns[target]

            df_reco = training_dataset[["industry_code", "character_cnt", "url_cnt", selected_variable]]
            # Filter on the numeric code, not the industry name.
            # NOTE(review): assumes training.csv stores numeric codes in
            # industry_code, matching the model input above - confirm.
            df_reco = df_reco[df_reco["industry_code"] == industry_code].copy()
            df_reco[selected_variable] = df_reco[selected_variable].round(3)
            df_reco = df_reco.drop_duplicates(subset=selected_variable)

            # The rate column is multiplied by 100 for display below, so it is
            # compared with the raw prediction rather than the percent value.
            beats_prediction = df_reco[selected_variable] > predicted_rate
            if char_reco_preference == "Increase":
                # Longer emails, capped at 1.5x the current length.
                length_ok = (df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))
            else:
                length_ok = df_reco["character_cnt"] < character_cnt
            df_reco_opt_rank = df_reco[beats_prediction & length_ok].nlargest(3,[selected_variable])

            st.info('To get higher, {},the model recommends the following options:'.format(display_name))
            if df_reco_opt_rank.empty:
                st.info('You ve already achieved the highest, {}, with the current character count!'.format(display_name))
            else:
                for _, row in df_reco_opt_rank.iterrows():
                    # Label-based access; the column name is not overwritten.
                    st.info('Number of Characters: {}, Target Rate: {}'.format(int(row["character_cnt"]), round(row[selected_variable]*100, 1)))

        placeholder.empty()
main_app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+
3
# Switch the page to Streamlit's "wide" layout.
st.set_page_config(layout="wide")

# Inject page-level CSS: white text on a blue background. The gradient has the
# same colour at both ends, so it renders as a solid fill.
st.markdown(
    """
    <style>
    body {
        background-image: linear-gradient(#2e7bcf,#2e7bcf);
        color: white;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
models/models.sav ADDED
Binary file (3.22 kB). View file
 
requirements.txt CHANGED
@@ -1 +1,2 @@
1
  bokeh==2.4.1
 
 
1
  bokeh==2.4.1
2
+ joblib