Duy-Anh Dang committed on
Commit
8fe1e33
·
1 Parent(s): 41e5416

updated app.py april 2023

Browse files
Files changed (1) hide show
  1. app.py +151 -23
app.py CHANGED
@@ -14,6 +14,13 @@ import joblib
14
  from bokeh.models.widgets import Div
15
 
16
  import email
 
 
 
 
 
 
 
17
 
18
 
19
  def table_data():
@@ -66,6 +73,65 @@ def get_industry_code_dict(training_dataset):
66
  zip(training_dataset.industry, training_dataset.industry_code))
67
  return industry_code_dict
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  # extract email body from parse email
71
  def email_body_extractor(email_data):
@@ -187,7 +253,7 @@ campaign_types = [
187
  'Survey',
188
  'Newsletter',
189
  'Engagement',
190
- 'Curated_Content',
191
  'Review_Request',
192
  'Product_Announcement',
193
  'Abandoned_Cart'
@@ -228,10 +294,10 @@ target = st.selectbox(
228
 
229
  st.markdown("""---""")
230
 
231
- char_reco_preference = st.selectbox(
232
- 'Do you want to increase or decrease your character count in the email?',
233
- ["Increase", "Decrease"],
234
- index=1)
235
 
236
 
237
  def get_files_from_aws(bucket, prefix):
@@ -289,9 +355,10 @@ if st.button('Generate Predictions'):
289
  # print("Getting Data Time: %s seconds" % (time.time() - start_time))
290
 
291
  industry_code_dict = get_industry_code_dict(email_data)
292
- bytes_data = uploaded_file.getvalue()
 
293
 
294
- email_body, character_cnt, url_cnt = email_body_extractor(bytes_data)
295
 
296
  # Start the prediction
297
  # Need to solve X test issue
@@ -314,11 +381,12 @@ if st.button('Generate Predictions'):
314
  character_cnt), unsafe_allow_html=True)
315
  # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
316
  if target == 'conversion_rate':
317
- target_vis = 'Conversion Rate'
318
  else:
319
- target_vis = 'Click-to-Open Rate'
 
320
  st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
321
- target_vis, str(round(output_rate*100, 2))), unsafe_allow_html=True)
322
  selected_industry_code = industry_code_dict.get(industry)
323
 
324
  if target == "click_to_open_rate":
@@ -335,16 +403,34 @@ if st.button('Generate Predictions'):
335
  df_reco_sort = df_reco.sort_values(by=[selected_variable])
336
  df_reco = df_reco.drop_duplicates(subset=selected_variable)
337
 
338
- preference = char_reco_preference
339
- if preference == "Increase":
340
- df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
341
- df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
342
- df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
343
  # decrease character reco
344
- if preference == "Decrease":
345
- df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
346
- df_reco["character_cnt"] < character_cnt)]
347
- df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
348
 
349
  if selected_variable == "Open_Rate":
350
  selected_variable = "Click-to-Open_Rate"
@@ -357,12 +443,54 @@ if st.button('Generate Predictions'):
357
  st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
358
  selected_variable), unsafe_allow_html=True)
359
  else:
 
 
 
 
 
 
 
 
 
 
360
  for _, row in df_reco_opt_rank.iterrows():
361
  Character_Count = row[1]
362
- selected_variable = row[3]
363
- # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
364
- st.markdown('Number of Characters: {}, Target Rate: {}'.format(
365
- int(Character_Count), round(selected_variable*100, 3)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  placeholder.empty()
368
  # print(time.time() - start_time)
 
14
  from bokeh.models.widgets import Div
15
 
16
  import email
17
+ #from ipyfilechooser import FileChooser
18
+
19
+ #from IPython.display import display
20
+ from io import BytesIO
21
+ from bs4 import BeautifulSoup
22
+ import matplotlib.pyplot as plt
23
+ import numpy as np
24
 
25
 
26
  def table_data():
 
73
  zip(training_dataset.industry, training_dataset.industry_code))
74
  return industry_code_dict
75
 
76
def parse_email(uploaded_file):
    """Return the text of every ``<body>`` element found in an uploaded email.

    The raw message is parsed with ``email.message_from_string``. The payload
    of each leaf MIME part is handed to BeautifulSoup and the text of every
    ``<body>`` tag it contains is collected.

    Args:
        uploaded_file: The uploaded email. When the object exposes a callable
            ``getvalue()`` the content is read straight from it; otherwise the
            file named by ``uploaded_file.name`` is opened from disk.

    Returns:
        list: One string per ``<body>`` element, in the order the parts are
        walked. Empty when no part contains a ``<body>`` element.
    """
    getvalue = getattr(uploaded_file, "getvalue", None)
    if callable(getvalue):
        # Read the in-memory upload instead of assuming a file with the same
        # name exists in the working directory.
        raw = getvalue()
        if isinstance(raw, (bytes, bytearray)):
            email_str = bytes(raw).decode("utf-8", errors="replace")
        else:
            email_str = str(raw)
    else:
        # Fallback: path-based read; the context manager guarantees the handle
        # is closed.
        with open(uploaded_file.name, 'r') as email_file:
            email_str = email_file.read()

    message = email.message_from_string(email_str)

    parsed_email = []
    for part in message.walk():
        # A multipart container's payload is a list of sub-messages, not text;
        # its children are visited by walk() on their own.
        if part.is_multipart():
            continue
        # NOTE(review): the payload is used as-is (no transfer decoding), so
        # quoted-printable/base64 parts are parsed in their encoded form.
        body = str(part.get_payload())
        # The parser is intentionally left unspecified: find_all('body') can
        # yield different results depending on which parser bs4 selects.
        soup = BeautifulSoup(body)
        parsed_email.extend(tag.text for tag in soup.find_all('body'))
    return parsed_email
92
+
93
def email_upload():
    """Prompt for an HTML email and return a file-chooser widget.

    Prints the upload prompt, creates a ``FileChooser``, renders it with
    ``display`` and returns it.

    NOTE(review): the ``ipyfilechooser`` and ``IPython.display`` imports appear
    commented out in this file, so ``FileChooser`` / ``display`` must be
    provided elsewhere or this call raises ``NameError`` - confirm whether this
    helper is still used.
    """
    print("Please upload your email (In HTML Format)")
    # An earlier FileUpload(accept='.html', multiple=True) variant was
    # superseded by the file chooser below.
    chooser = FileChooser()
    display(chooser)
    return chooser
101
+
102
+
103
+ # New - In-Use
104
def email_extractor(email_uploaded):
    """Extract the email body, its character count and its URL count.

    The text returned by ``parse_email`` is joined, trimmed to the span that
    starts at ``'Bright Apps LLC'`` and ends just before ``'To read more'``,
    and stripped of line breaks, tabs, ``<b>`` tags and non-breaking spaces.

    Args:
        email_uploaded: The uploaded email object, passed through to
            ``parse_email``.

    Returns:
        tuple: ``(email_body, character_cnt, url_cnt)`` where ``email_body`` is
        the cleaned body text (URLs still included), ``character_cnt`` is the
        number of non-whitespace characters left after removing URLs and
        everything from the first ``'©'`` onward, and ``url_cnt`` is the number
        of URLs found in ``email_body``.
    """
    email_text = ''.join(parse_email(email_uploaded)).strip()

    # str.find returns -1 when a marker is absent. Used directly as a slice
    # bound that would keep only the last character (missing start) or drop
    # the last character (missing end), so fall back to the text boundaries.
    body_start = email_text.find('Bright Apps LLC')
    if body_start == -1:
        body_start = 0
    # Search for the end marker after the start so an earlier occurrence
    # cannot produce an empty slice.
    body_end = email_text.find('To read more', body_start)
    if body_end == -1:
        body_end = len(email_text)
    email_body = email_text[body_start:body_end].strip()

    # Get rid of non-text elements.
    for token in ('\n', '\t', '\r', '</b>', '<b>', '\xa0'):
        email_body = email_body.replace(token, '')

    # Count URLs, if any.
    urls = URLExtract().find_urls(email_body)
    url_cnt = len(urls)

    # Remove URLs, cut at the copyright sign, and count visible characters.
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_body)
    body = body.split('©', 1)[0]
    character_cnt = sum(not ch.isspace() for ch in body)

    return email_body, character_cnt, url_cnt
134
+
135
 
136
  # extract email body from parse email
137
  def email_body_extractor(email_data):
 
253
  'Survey',
254
  'Newsletter',
255
  'Engagement',
256
+ 'Usage_and_Consumption',
257
  'Review_Request',
258
  'Product_Announcement',
259
  'Abandoned_Cart'
 
294
 
295
  st.markdown("""---""")
296
 
297
+ #char_reco_preference = st.selectbox(
298
+ # 'Do you want to increase or decrease your character count in the email?',
299
+ # ["Increase", "Decrease"],
300
+ # index=1)
301
 
302
 
303
  def get_files_from_aws(bucket, prefix):
 
355
  # print("Getting Data Time: %s seconds" % (time.time() - start_time))
356
 
357
  industry_code_dict = get_industry_code_dict(email_data)
358
+ #uploaded_file = FileChooser(uploaded_file)
359
+ #bytes_data = uploaded_file.getvalue()
360
 
361
+ email_body, character_cnt, url_cnt = email_extractor(uploaded_file)
362
 
363
  # Start the prediction
364
  # Need to solve X test issue
 
381
  character_cnt), unsafe_allow_html=True)
382
  # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
383
  if target == 'conversion_rate':
384
+ target_vis = 'Click_Through_Rate'
385
  else:
386
+ target_vis = 'Open_Rate'
387
+
388
  st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
389
+ target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)
390
  selected_industry_code = industry_code_dict.get(industry)
391
 
392
  if target == "click_to_open_rate":
 
403
  df_reco_sort = df_reco.sort_values(by=[selected_variable])
404
  df_reco = df_reco.drop_duplicates(subset=selected_variable)
405
 
406
+ #preference = char_reco_preference
407
+ #if preference == "Increase":
408
+ # df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
409
+ # df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
410
+ # df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
411
  # decrease character reco
412
+ #if preference == "Decrease":
413
+ # df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
414
+ # df_reco["character_cnt"] < character_cnt)]
415
+ # df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
416
+
417
+
418
+ # split into two dataframes of higher and lower character_cnt (added apr 2023)
419
+ char_cnt_uploaded = character_cnt
420
+
421
+ df_reco_opt1 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] > char_cnt_uploaded) & (df_reco["character_cnt"] <= (1.5*char_cnt_uploaded))]
422
+ df_reco_opt2 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] < char_cnt_uploaded) & (df_reco["character_cnt"] >= (char_cnt_uploaded/2))]
423
+
424
+ # drop duplicates of character_cnt keeping the row with the highest output_rate
425
+ df_reco_opt1 = df_reco_opt1.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
426
+ df_reco_opt2 = df_reco_opt2.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
427
+
428
+ # get top 2 largest in higher and lower dataframe
429
+ df_reco_opt_rank1 = df_reco_opt1.nlargest(2, [selected_variable])
430
+ df_reco_opt_rank2 = df_reco_opt2.nlargest(2, [selected_variable])
431
+
432
+ df_reco_opt_rank = pd.concat([df_reco_opt_rank1, df_reco_opt_rank2])
433
+ df_reco_opt_rank = df_reco_opt_rank.nlargest(3,[selected_variable])
434
 
435
  if selected_variable == "Open_Rate":
436
  selected_variable = "Click-to-Open_Rate"
 
443
  st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
444
  selected_variable), unsafe_allow_html=True)
445
  else:
446
+ #for _, row in df_reco_opt_rank.iterrows():
447
+ # Character_Count = row[1]
448
+ # selected_variable = row[3]
449
+ # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
450
+ # st.markdown('Number of Characters: {}, Target Rate: {}'.format(
451
+ # int(Character_Count), round(selected_variable*100, 3)))
452
+
453
+ chars = []
454
+ sel_var_values = []
455
+
456
  for _, row in df_reco_opt_rank.iterrows():
457
  Character_Count = row[1]
458
+ selected_variable_number = row[3]
459
+ chars.append(int(Character_Count))
460
+ sel_var_values.append(round(selected_variable_number, 3)*100)
461
+ st.write(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(round(selected_variable_number, 3)*100, 3)}", "%")
462
+ st.write("\n")
463
+
464
+ if len(chars) > 1:
465
+ #fig = plt.figure()
466
+ #ax = fig.add_axes([0,0,1,1])
467
+ fig, ax = plt.subplots(figsize=(10,4))
468
+ bars = ax.barh(np.arange(len(chars)), sel_var_values, height=0.175, color='#0F4D60')
469
+
470
+ #ax.bar_label(bars)
471
+
472
+ ax.set_yticks(np.arange(len(chars)))
473
+ ax.set_yticklabels(np.array(chars), fontsize=14)
474
+ ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18)
475
+ ax.set_ylabel('Character Counts', fontsize=16)
476
+ ax.set_xlabel('Target Rates %', fontsize=16)
477
+
478
+ for i, bar in enumerate(bars):
479
+ rounded_value = round(sel_var_values[i], 2)
480
+ ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2, str(rounded_value) + '%', ha='left', va='center', fontsize=12, fontweight='bold')
481
+
482
+ ax.margins(0.1,0.05)
483
+
484
+ biggest_bar_index = np.argmax(sel_var_values)
485
+ bars[biggest_bar_index].set_color('#00BF93')
486
+
487
+ st.plotly_chart(fig, use_container_width=True)
488
+
489
+ st.write("\n")
490
+ #st.write(np.array(chars))
491
+ chars_out = dict(zip(chars, sel_var_values))
492
+ sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)
493
+
494
 
495
  placeholder.empty()
496
  # print(time.time() - start_time)