Runtime error
Runtime error
Duy-Anh Dang
committed on
updated april 2023
Browse files
@@ -14,6 +14,13 @@ import joblib
14 |
from bokeh.models.widgets import Div
15 |
16 |
import email
17 |
18 |
19 |
def table_data():
@@ -66,6 +73,65 @@ def get_industry_code_dict(training_dataset):
66 |
zip(training_dataset.industry, training_dataset.industry_code))
67 |
return industry_code_dict
68 |
69 |
70 |
# extract email body from parse email
71 |
def email_body_extractor(email_data):
@@ -187,7 +253,7 @@ campaign_types = [
187 |
188 |
189 |
190 |
191 |
192 |
193 |
@@ -228,10 +294,10 @@ target = st.selectbox(
228 |
229 |
230 |
231 |
char_reco_preference = st.selectbox(
232 |
'Do you want to increase or decrease your character count in the email?',
233 |
["Increase", "Decrease"],
234 |
235 |
236 |
237 |
def get_files_from_aws(bucket, prefix):
@@ -289,9 +355,10 @@ if st.button('Generate Predictions'):
289 |
# print("Getting Data Time: %s seconds" % (time.time() - start_time))
290 |
291 |
industry_code_dict = get_industry_code_dict(email_data)
292 |
293 |
294 |
email_body, character_cnt, url_cnt =
295 |
296 |
# Start the prediction
297 |
# Need to solve X test issue
@@ -314,11 +381,12 @@ if st.button('Generate Predictions'):
314 |
character_cnt), unsafe_allow_html=True)
315 |
#'The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
316 |
if target == 'conversion_rate':
317 |
target_vis = '
318 |
319 |
target_vis = '
320 |
st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
321 |
target_vis, str(round(output_rate*100,
322 |
selected_industry_code = industry_code_dict.get(industry)
323 |
324 |
if target == "click_to_open_rate":
@@ -335,16 +403,34 @@ if st.button('Generate Predictions'):
335 |
df_reco_sort = df_reco.sort_values(by=[selected_variable])
336 |
df_reco = df_reco.drop_duplicates(subset=selected_variable)
337 |
338 |
preference = char_reco_preference
339 |
if preference == "Increase":
340 |
341 |
342 |
343 |
# decrease character reco
344 |
if preference == "Decrease":
345 |
346 |
347 |
348 |
349 |
if selected_variable == "Open_Rate":
350 |
selected_variable = "Click-to-Open_Rate"
@@ -357,12 +443,54 @@ if st.button('Generate Predictions'):
357 |
st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
358 |
selected_variable), unsafe_allow_html=True)
359 |
360 |
for _, row in df_reco_opt_rank.iterrows():
361 |
Character_Count = row[1]
362 |
363 |
364 |
365 |
366 |
367 |
368 |
# print(time.time() - start_time)
14 |
from bokeh.models.widgets import Div
15 |
16 |
import email
17 |
#from ipyfilechooser import FileChooser
18 |
19 |
#from IPython.display import display
20 |
from io import BytesIO
21 |
from bs4 import BeautifulSoup
22 |
import matplotlib.pyplot as plt
23 |
import numpy as np
24 |
25 |
26 |
def table_data():
73 |
zip(training_dataset.industry, training_dataset.industry_code))
74 |
return industry_code_dict
75 |
76 |
def parse_email(uploaded_file):
    """Parse an email file and collect the content of its HTML ``<body>`` elements.

    Args:
        uploaded_file: Path of the email file to read, or an already-open
            file-like object exposing ``read()`` (bytes are decoded as UTF-8).

    Returns:
        list: One string per ``<body>`` element found while walking the
        parts of the parsed message; empty if none is found.
    """
    # Read the whole message at once; the context manager guarantees the
    # handle is closed even if reading fails.
    if hasattr(uploaded_file, 'read'):
        raw = uploaded_file.read()
        if isinstance(raw, bytes):
            emailstr = raw.decode('utf-8', errors='replace')
        else:
            emailstr = raw
    else:
        with open(uploaded_file, 'r') as efile:
            emailstr = efile.read()

    parsed_email = []
    message = email.message_from_string(emailstr)
    for part in message.walk():
        # NOTE(review): get_content_type() falls back to a default type, so
        # this guard is expected to be true for every part — confirm intent.
        if part.get_content_type():
            body = str(part.get_payload())
            # Parser left unspecified on purpose: different parsers treat
            # fragments without an explicit <body> differently.
            soup = BeautifulSoup(body)
            for paragraph in soup.find_all('body'):
                # NOTE(review): the loop body was missing from the reviewed
                # text; collecting the element's text is assumed because the
                # caller joins the returned items as strings — confirm.
                parsed_email.append(paragraph.get_text())
    return parsed_email
92 |
93 |
def email_upload():
    """Print an upload prompt and return a freshly created ``FileChooser``.

    NOTE(review): the only ``FileChooser`` import visible in this file is
    commented out — confirm the name is actually in scope before calling.
    """
    print("Please upload your email (In HTML Format)")
    return FileChooser()
101 |
102 |
103 |
# New - In-Use
104 |
def email_extractor(email_uploaded):
    """Extract the email body, its character count and its URL count.

    The uploaded email is parsed with ``parse_email``, the pieces are joined,
    and the span between the ``'Bright Apps LLC'`` and ``'To read more'``
    markers is taken as the body. If a marker is absent, the corresponding
    end of the text is used instead.

    Args:
        email_uploaded: Passed through unchanged to ``parse_email``.

    Returns:
        tuple: ``(email_body, character_cnt, url_cnt)`` where ``email_body``
        is the cleaned body text, ``character_cnt`` counts the non-whitespace
        characters of the body with URLs removed and everything from the
        first ``'©'`` onwards dropped, and ``url_cnt`` is the number of URLs
        found in the body.
    """
    email_text = ''.join(parse_email(email_uploaded)).strip()

    # str.find returns -1 for a missing marker. Used directly as a slice
    # bound, -1 would start at the last character (start marker) or silently
    # drop the last character (end marker), so fall back to the text bounds.
    start_index = email_text.find('Bright Apps LLC')
    if start_index == -1:
        start_index = 0
    # Only accept an end marker located after the start of the body.
    end_index = email_text.find('To read more', start_index)
    if end_index == -1:
        end_index = len(email_text)
    email_body = email_text[start_index:end_index].strip()

    # get rid of non-text elements
    for token in ('\n', '\t', '\r', '</b>', '<b>', '\xa0'):
        email_body = email_body.replace(token, '')

    # find number of URLs if any
    url_cnt = len(URLExtract().find_urls(email_body))

    # remove URLs, cut the text at the copyright sign, then count the
    # remaining non-whitespace characters
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_body)
    body = body.split('©', 1)[0]
    character_cnt = sum(not char.isspace() for char in body)

    return email_body, character_cnt, url_cnt
134 |
135 |
136 |
# extract email body from parse email
137 |
def email_body_extractor(email_data):
253 |
254 |
255 |
256 |
257 |
258 |
259 |
294 |
295 |
296 |
297 |
#char_reco_preference = st.selectbox(
298 |
# 'Do you want to increase or decrease your character count in the email?',
299 |
# ["Increase", "Decrease"],
300 |
# index=1)
301 |
302 |
303 |
def get_files_from_aws(bucket, prefix):
355 |
# print("Getting Data Time: %s seconds" % (time.time() - start_time))
356 |
357 |
industry_code_dict = get_industry_code_dict(email_data)
358 |
#uploaded_file = FileChooser(uploaded_file)
359 |
#bytes_data = uploaded_file.getvalue()
360 |
361 |
email_body, character_cnt, url_cnt = email_extractor(uploaded_file)
362 |
363 |
# Start the prediction
364 |
# Need to solve X test issue
381 |
character_cnt), unsafe_allow_html=True)
382 |
#'The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
383 |
if target == 'conversion_rate':
384 |
target_vis = 'Click_Through_Rate'
385 |
386 |
target_vis = 'Open_Rate'
387 |
388 |
st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
389 |
target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)
390 |
selected_industry_code = industry_code_dict.get(industry)
391 |
392 |
if target == "click_to_open_rate":
403 |
df_reco_sort = df_reco.sort_values(by=[selected_variable])
404 |
df_reco = df_reco.drop_duplicates(subset=selected_variable)
405 |
406 |
#preference = char_reco_preference
407 |
#if preference == "Increase":
408 |
# df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
409 |
# df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
410 |
# df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
411 |
# decrease character reco
412 |
#if preference == "Decrease":
413 |
# df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
414 |
# df_reco["character_cnt"] < character_cnt)]
415 |
# df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
416 |
417 |
418 |
# split into two dataframes of higher and lower character_cnt (added apr 2023)
419 |
char_cnt_uploaded = character_cnt
420 |
421 |
df_reco_opt1 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] > char_cnt_uploaded) & (df_reco["character_cnt"] <= (1.5*char_cnt_uploaded))]
422 |
df_reco_opt2 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] < char_cnt_uploaded) & (df_reco["character_cnt"] >= (char_cnt_uploaded/2))]
423 |
424 |
# drop duplicates of character_cnt keeping the row with the highest output_rate
425 |
df_reco_opt1 = df_reco_opt1.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
426 |
df_reco_opt2 = df_reco_opt2.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
427 |
428 |
# get top 2 largest in higher and lower dataframe
429 |
df_reco_opt_rank1 = df_reco_opt1.nlargest(2, [selected_variable])
430 |
df_reco_opt_rank2 = df_reco_opt2.nlargest(2, [selected_variable])
431 |
432 |
df_reco_opt_rank = pd.concat([df_reco_opt_rank1, df_reco_opt_rank2])
433 |
df_reco_opt_rank = df_reco_opt_rank.nlargest(3,[selected_variable])
434 |
435 |
if selected_variable == "Open_Rate":
436 |
selected_variable = "Click-to-Open_Rate"
443 |
st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
444 |
selected_variable), unsafe_allow_html=True)
445 |
446 |
#for _, row in df_reco_opt_rank.iterrows():
447 |
# Character_Count = row[1]
448 |
# selected_variable = row[3]
449 |
# print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
450 |
# st.markdown('Number of Characters: {}, Target Rate: {}'.format(
451 |
# int(Character_Count), round(selected_variable*100, 3)))
452 |
453 |
chars = []
454 |
sel_var_values = []
455 |
456 |
for _, row in df_reco_opt_rank.iterrows():
457 |
Character_Count = row[1]
458 |
selected_variable_number = row[3]
459 |
460 |
sel_var_values.append(round(selected_variable_number, 3)*100)
461 |
st.write(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(round(selected_variable_number, 3)*100, 3)}", "%")
462 |
463 |
464 |
if len(chars) > 1:
465 |
#fig = plt.figure()
466 |
#ax = fig.add_axes([0,0,1,1])
467 |
fig, ax = plt.subplots(figsize=(10,4))
468 |
bars = ax.barh(np.arange(len(chars)), sel_var_values, height=0.175, color='#0F4D60')
469 |
470 |
471 |
472 |
473 |
ax.set_yticklabels(np.array(chars), fontsize=14)
474 |
ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18)
475 |
ax.set_ylabel('Character Counts', fontsize=16)
476 |
ax.set_xlabel('Target Rates %', fontsize=16)
477 |
478 |
for i, bar in enumerate(bars):
479 |
rounded_value = round(sel_var_values[i], 2)
480 |
ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2, str(rounded_value) + '%', ha='left', va='center', fontsize=12, fontweight='bold')
481 |
482 |
483 |
484 |
biggest_bar_index = np.argmax(sel_var_values)
485 |
486 |
487 |
st.plotly_chart(fig, use_container_width=True)
488 |
489 |
490 |
491 |
chars_out = dict(zip(chars, sel_var_values))
492 |
sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)
493 |
494 |
495 |
496 |
# print(time.time() - start_time)