File size: 20,589 Bytes
fcf4786
03260fa
fcf4786
 
 
 
 
 
 
4f60771
 
fcf4786
03260fa
fcf4786
03260fa
fcf4786
03260fa
fcf4786
a035bf2
fcf4786
03260fa
fcf4786
 
 
 
 
032d12c
 
5b1422d
 
03260fa
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7fc413
fcf4786
 
 
 
 
 
 
 
 
 
 
 
73739df
fcf4786
 
 
 
 
8e10cad
fcf4786
8e10cad
fcf4786
8e10cad
fcf4786
8e10cad
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9637db2
 
 
a035bf2
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9637db2
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359c387
fcf4786
 
 
 
 
 
 
359c387
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359c387
fcf4786
 
359c387
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e40942d
fcf4786
59eade1
b336735
 
62a0375
 
fcf4786
 
 
 
 
 
 
 
 
 
4f60771
fcf4786
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f60771
 
359c387
 
 
 
 
 
 
 
 
 
 
 
 
fb12786
29264ff
0e67d14
29264ff
0e67d14
a297cac
4f60771
 
a297cac
 
fcf4786
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
from ast import arg
import streamlit as st
import pandas as pd
import PIL
import re
from io import StringIO
import boto3
from urlextract import URLExtract
import time
from utils import * 

# from joblib import dump, load

import joblib

from bokeh.models.widgets import Div

import email
import os
#from ipyfilechooser import FileChooser

#from IPython.display import display
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import timeit
import shutil

CURRENT_THEME = "blue"
IS_DARK_THEME = True

def table_data():
    """Build the model-information table displayed in the sidebar.

    Returns:
        pandas.DataFrame: two string columns, ``Field`` and ``Data``, with one
        row per model attribute, in display order.
    """
    # Each pair is (attribute name, attribute value).
    rows = [
        ('Data Scientist', 'Chen Song'),
        ('Dataset', 'Internal + Campaign monitor'),
        ('Algorithm', 'Random Forest'),
        ('Framework', 'Sci-kit learn'),
        ('Ensemble', 'Bootstrapping'),
        ('Domain', 'Bootstrapping Aggregation'),
        ('Model Size', '4 KB'),
    ]

    field_names = [name for name, _ in rows]
    field_values = [value for _, value in rows]

    return pd.DataFrame.from_dict({'Field': field_names, 'Data': field_values})


def url_button(button_name, url):
    """Render a Streamlit button that opens ``url`` in a new tab/window.

    When the button is pressed, an ``<img>`` tag with an ``onerror`` handler
    is emitted through a Bokeh ``Div``; the handler calls ``window.open``.

    Args:
        button_name: label shown on the button.
        url: address passed to ``window.open``; inserted into the script
            verbatim (no escaping).
    """
    if not st.button(button_name):
        return

    open_script = """window.open('{url}')""".format(url=url)  # New tab or window
    markup = '<img src onerror="{}">'.format(open_script)
    st.bokeh_chart(Div(text=markup))


def get_industry_code_dict(training_dataset):
    """Map each industry name to its integer category code.

    Note that ``training_dataset`` is modified in place: an ``industry_code``
    column is added, and every category-typed column in the frame is replaced
    by its integer codes.

    Args:
        training_dataset: DataFrame containing an ``industry`` column.

    Returns:
        dict: ``{industry name: category code}``.
    """
    training_dataset['industry_code'] = (
        training_dataset['industry'].astype('category'))

    # Convert every categorical column (including the one just added) to codes.
    categorical = training_dataset.select_dtypes(['category']).columns
    training_dataset[categorical] = training_dataset[categorical].apply(
        lambda column: column.cat.codes)

    names = training_dataset.industry
    codes = training_dataset.industry_code
    return dict(zip(names, codes))

def parse_email(uploaded_file):
    """Read a saved upload from disk and return the text of its ``<body>`` tags.

    The file named ``uploaded_file.name`` (relative to the current working
    directory) is read as text, parsed with ``email.message_from_string`` and
    every message part's payload is run through BeautifulSoup.

    Args:
        uploaded_file: object exposing a ``name`` attribute that is the path
            of a file already present on disk.

    Returns:
        list[str]: text content of each ``<body>`` element found, in
        traversal order. Empty when no part yields a ``<body>`` element.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(uploaded_file.name, 'r') as efile:
        emailstr = efile.read()

    message = email.message_from_string(emailstr)

    parsed_email = []
    # get_content_type() always returns a non-empty string, so every part
    # is processed; no filtering on content type is applied here.
    for part in message.walk():
        payload_text = str(part.get_payload())
        # Parser left unspecified on purpose: behaviour (auto-inserted <body>)
        # depends on which parser BeautifulSoup picks in this environment.
        soup = BeautifulSoup(payload_text)
        parsed_email.extend(tag.text for tag in soup.find_all('body'))
    return parsed_email

#def email_upload():
#    print("Please upload your email (In HTML Format)")
#     upload = FileUpload(accept='.html', multiple=True)
#     display(upload)
#     return upload
#    fc = FileChooser()
#    display(fc)
#    return fc


# Newer extractor: this is the one called by the prediction flow below.
def email_extractor(email_uploaded):
    """Extract the body text, character count and URL count of an upload.

    The text returned by ``parse_email`` is joined, cut down to the span
    between the ``'Bright Apps LLC'`` and ``'To read more'`` markers, and
    stripped of newlines, tabs, ``<b>`` tags and non-breaking spaces.

    If a marker is absent the corresponding edge of the text is used instead
    (``str.find`` returns -1 in that case, which previously produced an empty
    or truncated slice and therefore a character count of 0).

    Args:
        email_uploaded: object accepted by ``parse_email`` (must expose
            ``name``).

    Returns:
        tuple: ``(email_body, character_cnt, url_cnt)`` where ``email_body``
        is the cleaned text (URLs still present), ``character_cnt`` counts
        non-whitespace characters after URL removal and truncation at the
        first ``'©'``, and ``url_cnt`` is the number of URLs found.
    """
    email_text = ''.join(parse_email(email_uploaded)).strip()

    # Locate the body between the two markers, falling back to the text edges.
    start_index = email_text.find('Bright Apps LLC')
    if start_index == -1:
        start_index = 0
    end_index = email_text.find('To read more', start_index)
    if end_index == -1:
        end_index = len(email_text)
    email_body = email_text[start_index:end_index].strip()

    # get rid of non-text elements
    for token in ('\n', '\t', '\r', '</b>', '<b>', '\xa0'):
        email_body = email_body.replace(token, '')

    # count URLs before they are removed
    url_cnt = len(URLExtract().find_urls(email_body))

    # remove URLs, drop everything from the copyright sign on, then count
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_body)
    body = body.split('©', 1)[0]
    character_cnt = sum(not ch.isspace() for ch in body)

    return email_body, character_cnt, url_cnt


# Extract the plain-text body, character count and URL count from raw email bytes.
def email_body_extractor(email_data):
    """Extract body text, character count and URL count from raw email bytes.

    Args:
        email_data: UTF-8 encoded bytes of an email message.

    Returns:
        tuple: ``(body, character_cnt, url_cnt)``. ``body`` is the cleaned
        text with URLs removed and truncated at the first ``'©'``;
        ``character_cnt`` is its number of non-whitespace characters;
        ``url_cnt`` is the number of URLs found before removal.
    """
    message = email.message_from_string(email_data.decode("utf-8"))

    if not message.is_multipart():
        # plain message: the payload is the body
        body = message.get_payload()
    else:
        body = ""
        for section in message.walk():
            disposition = str(section.get('Content-Disposition'))
            # only the first text/plain part that is not an attachment is used
            if section.get_content_type() != 'text/plain' or 'attachment' in disposition:
                continue
            body = section.get_payload()
            break

    # Remove escape sequences and bold tags
    for token in ('\n', '\t', '\r', '</b>', '<b>'):
        body = body.replace(token, '')

    # Count URLs, then strip them from the text
    url_cnt = len(URLExtract().find_urls(body))
    body = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)

    # Keep only the text preceding the copyright sign
    body = body.split('©', 1)[0]
    character_cnt = sum(1 for ch in body if not ch.isspace())

    return body, character_cnt, url_cnt


def add_bg_from_url():
    """Inject CSS that gives the Streamlit app a fixed gradient background."""
    background_css = """
         <style>
         .stApp {
             background-image: linear-gradient(135deg,#061c2c,#084e69 35%,#3e7e89);
             background-attachment: fixed;
             background-size: cover
            
         }
         </style>
         """
    # unsafe_allow_html is required for the <style> tag to be emitted as HTML
    st.markdown(background_css, unsafe_allow_html=True)

# Apply the background styling before any other element is rendered.
add_bg_from_url()
# Alternative gradient kept for reference:
#linear-gradient(0deg,#010405 0,#061c2c 55%,#0a3144 75%,#0f4d60)

# Page title.
st.markdown("# Character Count: Email Industry")


# Four equal-width columns holding static model statistics captions.
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1, 1, 1, 1])

with stats_col1:
    st.caption("Production: Ready")
with stats_col2:
    # NOTE(review): the sidebar description text quotes 86% accuracy — confirm which figure is right.
    st.caption("Accuracy: 85%")
with stats_col3:
    st.caption("Speed: 16.89 ms")
with stats_col4:
    st.caption("Industry: Email")


# Sidebar: model description, model metadata table and external link buttons.
with st.sidebar:
    
    with st.expander('Model Description', expanded=False):
        # NOTE(review): only `import PIL` appears in this file; PIL.Image must
        # have been loaded as a submodule by some other import — confirm.
        img = PIL.Image.open("figures/ModelCC.png")
        st.image(img)
        st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')

    with st.expander('Model Information', expanded=False):
        # CSS injected to hide the index column of the table rendered below.
        hide_table_row_index = """
            <style>
            thead tr th:first-child {display:none}
            tbody th {display:none}
            </style>
            """
        st.markdown(hide_table_row_index, unsafe_allow_html=True)
        st.table(table_data())

    # NOTE(review): the homepage URL ends in /models/CTA although this page is
    # the character-count model — verify the link target.
    url_button('Model Homepage', 'https://www.loxz.com/#/models/CTA')
    # url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
    url_button('Amazon Market Place', 'https://aws.amazon.com/marketplace')


# Options offered in the "industry" select box.
industry_lists = [
    'Retail',
    'Software and Technology',
    'Hospitality',
    'Academic and Education',
    'Healthcare',
    'Energy',
    'Real Estate',
    'Entertainment',
    'Finance and Banking'
]

# Options offered in the "campaign type" select box.
campaign_types = [
    'Promotional',
    'Transactional',
    'Webinar',
    'Survey',
    'Newsletter',
    'Engagement',
    'Usage_and_Consumption',
    'Review_Request',
    'Product_Announcement',
    'Abandoned_Cart'
]

# Options offered in the "target variable" select box; the prediction flow
# maps these to the Click_Through_Rate / Open_Rate dataset columns.
target_variables = [
    'conversion_rate',
    'click_to_open_rate'
]

# File uploader restricted to .html files; yields None until a file is chosen.
uploaded_file = st.file_uploader(
    "Please upload your email (In HTML Format)", type=["html"])

def save_file(uploaded_file):
    """Write the uploaded file's bytes to the current working directory.

    Args:
        uploaded_file: object exposing ``name`` (target file name) and
            ``getbuffer()`` (the bytes to write). An existing file with the
            same name is overwritten.
    """
    destination = os.path.join("./", uploaded_file.name)
    contents = uploaded_file.getbuffer()
    with open(destination, "wb") as out_file:
        out_file.write(contents)

# Placeholder left over from an image-upload variant; upload_img is not read
# anywhere else in this file.
if uploaded_file is None:
    # upload_img = PIL.Image.open(uploaded_file)
    upload_img = None
# else:
    # upload_img = None


# Industry selector; index=6 preselects 'Real Estate'.
industry = st.selectbox(
    'Please select your industry',
    industry_lists,
    index=6
)

# Campaign-type selector; index=5 preselects 'Engagement'.
campaign = st.selectbox(
    'Please select your campaign type',
    campaign_types,
    index=5
)

# Target-variable selector; index=1 preselects 'click_to_open_rate'.
target = st.selectbox(
    'Please select your target variable',
    target_variables,
    index=1
)

# Horizontal rule separating the inputs from the results area.
st.markdown("""---""")

#char_reco_preference = st.selectbox(
#    'Do you want to increase or decrease your character count in the email?',
#    ["Increase", "Decrease"],
#    index=1)


def get_files_from_aws(bucket, prefix):
    """Download a CSV object from S3 and load it into a DataFrame.

    Credentials are read from ``st.secrets["aws_id"]`` and
    ``st.secrets["aws_key"]``.

    Args:
        bucket (str): S3 bucket name.
        prefix (str): key of the CSV object inside the bucket.

    Returns:
        pandas.DataFrame: the parsed CSV content (decoded as UTF-8).
    """
    client = boto3.client(
        's3',
        aws_access_key_id=st.secrets["aws_id"],
        aws_secret_access_key=st.secrets["aws_key"],
    )

    response = client.get_object(Bucket=bucket, Key=prefix)
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))


# st.info([industry,campaign,target,char_reco_preference])


# Main prediction flow. Everything below runs only on the script run in which
# the 'Generate Predictions' button reports a click.
if st.button('Generate Predictions'):
    start_time = time.time()
    if uploaded_file is None:
        st.error('Please upload a email (HTML format)')
    else:
        # Persist the upload to disk; email_extractor re-reads it by file name.
        save_file(uploaded_file)
        placeholder = st.empty()
        placeholder.text('Loading Data')

        # Starting predictions
        model = joblib.load('models/models.sav')
        # Generate Email Data
        email_data = get_files_from_aws(
            'emailcampaigntrainingdata', 'trainingdata/email_dataset_training.csv')
        # NOTE(review): acc_data, email_data_ and df_email_data are not read
        # anywhere else in this file; kept because the download/column
        # selection may be relied on as a sanity check — confirm and remove.
        acc_data = get_files_from_aws(
            'emailcampaigntrainingdata', 'trainingdata/email_dataset_training_raw.csv')

        email_data_ = email_data[["email_body", "industry", "campaign_type",
                                  "character_cnt", "url_cnt", "Open_Rate", "Click_Through_Rate"]]
        # (A preceding rename() without `columns=` was removed: it targeted
        # the index and therefore changed nothing.)
        df_email_data = email_data_.rename(
            columns={'Open_Rate': 'Click-to-open_Rate', 'Click_Through_Rate': 'Conversion_Rate'})

        # Dataset:
        training_dataset = get_files_from_aws(
            'emailcampaigntrainingdata', 'modelCC/training.csv')
        # X_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/Xtest.csv')
        # Y_test = get_files_from_aws('emailcampaigntrainingdata','modelCC/ytest.csv')

        # Industry name -> integer code (mutates email_data in place).
        industry_code_dict = get_industry_code_dict(email_data)

        email_body, character_cnt, url_cnt = email_extractor(uploaded_file)

        # Build the single-row feature frame expected by the model.
        df_uploaded = pd.DataFrame(
            columns=['character_cnt', "url_cnt", "industry"])
        df_uploaded.loc[0] = [character_cnt, url_cnt, industry]
        df_uploaded["industry_code"] = industry_code_dict.get(industry)
        df_uploaded_test = df_uploaded[[
            "industry_code", "character_cnt", "url_cnt"]]
        # NOTE(review): the same model output is reported for either target
        # variable — confirm that a single model is intended.
        predicted_rate = model.predict(df_uploaded_test)[0]
        output_rate = round(predicted_rate, 4)

        if output_rate < 0:
            # Shown in the app (previously printed to the server console only).
            st.error(
                "Sorry, Current model couldn't provide predictions on the target variable you selected.")
        else:
            st.markdown('##### Current Character Count in Your Email is: <span style="color:yellow">{}</span>'.format(
                character_cnt), unsafe_allow_html=True)

            # Dataset column that corresponds to the chosen target variable.
            if target == 'conversion_rate':
                target_vis = 'Click_Through_Rate'
            else:
                target_vis = 'Open_Rate'
            selected_variable = target_vis

            st.markdown('##### The model predicts that it achieves a <span style="color:yellow">{}</span> of <span style="color:yellow">{}</span>%'.format(
                target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)
            selected_industry_code = industry_code_dict.get(industry)

            df_reco = training_dataset[[
                "industry_code", "character_cnt", "url_cnt", selected_variable]]
            # .copy() so the column assignment below acts on an independent
            # frame rather than on a slice of training_dataset.
            df_reco = df_reco[df_reco["industry_code"]
                              == selected_industry_code].copy()
            df_reco[selected_variable] = df_reco[selected_variable].apply(
                lambda x: round(x, 3))
            df_reco = df_reco.drop_duplicates(subset=selected_variable)

            # split into two dataframes of higher and lower character_cnt (added apr 2023)
            char_cnt_uploaded = character_cnt

            # Candidates that beat the predicted rate with up to 1.5x the characters ...
            df_reco_opt1 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] > char_cnt_uploaded) & (df_reco["character_cnt"] <= (1.5*char_cnt_uploaded))]
            # ... and with down to half the characters.
            df_reco_opt2 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] < char_cnt_uploaded) & (df_reco["character_cnt"] >= (char_cnt_uploaded/2))]

            # drop duplicates of character_cnt keeping the row with the highest output_rate
            df_reco_opt1 = df_reco_opt1.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
            df_reco_opt2 = df_reco_opt2.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])

            # get top 2 largest in higher and lower dataframe
            df_reco_opt_rank1 = df_reco_opt1.nlargest(2, [selected_variable])
            df_reco_opt_rank2 = df_reco_opt2.nlargest(2, [selected_variable])

            # overall top 3 across both directions
            df_reco_opt_rank = pd.concat([df_reco_opt_rank1, df_reco_opt_rank2])
            df_reco_opt_rank = df_reco_opt_rank.nlargest(3,[selected_variable])

            # Display label for the target; selected_variable keeps the real
            # column name so rows can be read by label below.
            if selected_variable == "Open_Rate":
                target_label = "Click-to-Open_Rate"
            else:
                target_label = "Conversion_Rate"

            st.markdown('##### To get higher, <span style="color:yellow">{}</span>, the model recommends the following options:'.format(
                target_label), unsafe_allow_html=True)
            if len(df_reco_opt_rank) == 0:
                st.markdown('##### You ve already achieved the highest, <span style="color:yellow">{}</span>, with the current character count!'.format(
                    target_label), unsafe_allow_html=True)
            else:
                chars = []
                sel_var_values = []

                # Label-based access instead of positional row[1]/row[3],
                # which is deprecated for label-indexed Series.
                for _, row in df_reco_opt_rank.iterrows():
                    chars.append(int(row["character_cnt"]))
                    sel_var_values.append(round(row[selected_variable], 3)*100)
                st.write("\n")
                df_modelpred=pd.DataFrame(list(zip(chars, sel_var_values)), columns=["Number of Characters", "Target_Rate"])
                df_modelpred.sort_values(by='Target_Rate', ascending=False, inplace = True)
                st.dataframe(df_modelpred)

                if len(chars) > 1:
                    fig, ax = plt.subplots(figsize=(10,4))
                    bars = ax.barh(np.arange(len(chars)), sel_var_values, height=0.175, color='#0F4D60')

                    ax.set_yticks(np.arange(len(chars)))
                    ax.set_yticklabels(tuple(chars), fontsize=14)
                    ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18)
                    ax.set_ylabel('Character Counts', fontsize=16)
                    ax.set_xlabel('Target Rates %', fontsize=16)

                    # Value label to the right of each bar.
                    for i, bar in enumerate(bars):
                        rounded_value = round(sel_var_values[i], 2)
                        ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2, str(rounded_value) + '%', ha='left', va='center', fontsize=12, fontweight='bold')

                    ax.margins(0.1,0.05)

                    # Highlight the best option.
                    biggest_bar_index = np.argmax(sel_var_values)
                    bars[biggest_bar_index].set_color('#00BF93')

                    # NOTE(review): a matplotlib figure is handed to
                    # st.plotly_chart — confirm this is intended over st.pyplot.
                    st.plotly_chart(fig, use_container_width=True)
                    # Release the figure; pyplot otherwise keeps every figure
                    # alive across reruns of the script.
                    plt.close(fig)

                    st.write("\n")
                    chars_out = dict(zip(chars, sel_var_values))
                    sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)
                    # NOTE(review): option text is spelled "charcter counts" while the
                    # hard-coded preference below uses "character counts"; the text is
                    # passed to generate_example_email_with_context — confirm which
                    # spelling that helper expects before changing either.
                    prefrence_variables=["charcter counts: "+str(x)+", Target Rate: "+str(y) for x,y in zip(chars,sel_var_values)]
                    preference = st.selectbox(
                                'Please select your preferences',
                                prefrence_variables,
                                index=1
                            )
                    # NOTE(review): this button is nested inside the outer
                    # button's branch, so a click reruns the script with the
                    # outer button no longer pressed — verify this branch is
                    # reachable; session state is probably needed.
                    if st.button('Generate AI Recommended Email'):
                        if(preference is None):
                            st.error('Please select a preference')
                        else:
                            ai_generated_email=generate_example_email_with_context(email_body, campaign, industry, target, sorted_chars_out, preference)
                            st.markdown('##### Here is the recommended Generated Email for you:')
                            st.markdown('####### {}:'.format(ai_generated_email),unsafe_allow_html=True)
                    # TODO(review): the user's selection is overwritten with a
                    # hard-coded preference and an email is always generated —
                    # looks like leftover debugging; left unchanged pending
                    # confirmation of the intended flow.
                    preference= "character counts: "+str(573)+", Target Rate: "+str(37.2)
                    ai_generated_email=generate_example_email_with_context(email_body, campaign, industry, target, sorted_chars_out, preference)
                    print("ai_generated_email: ",ai_generated_email)
                    st.markdown('##### Here is the recommended Generated Email for you:')
                    st.markdown('####### {}'.format(ai_generated_email),unsafe_allow_html=True)

                chars_out = dict(zip(chars, sel_var_values))
                sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)

        # Clear the 'Loading Data' message once processing has finished.
        placeholder.empty()
        #st.write(time.time() - start_time)