Spaces:

loxzdigital
/

Model-CC-Space

Runtime error

App Files Files Community

Duy-Anh Dang committed on Apr 16, 2023

Commit

8fe1e33

1 Parent(s): 41e5416

updated app.py april 2023

Browse files

Files changed (1) hide show

app.py +151 -23

app.py CHANGED Viewed

@@ -14,6 +14,13 @@ import joblib
 from bokeh.models.widgets import Div
 import email
 def table_data():
@@ -66,6 +73,65 @@ def get_industry_code_dict(training_dataset):
         zip(training_dataset.industry, training_dataset.industry_code))
     return industry_code_dict
 # extract email body from parse email
 def email_body_extractor(email_data):
@@ -187,7 +253,7 @@ campaign_types = [
     'Survey',
     'Newsletter',
     'Engagement',
-    'Curated_Content',
     'Review_Request',
     'Product_Announcement',
     'Abandoned_Cart'
@@ -228,10 +294,10 @@ target = st.selectbox(
 st.markdown("""---""")
-char_reco_preference = st.selectbox(
-    'Do you want to increase or decrease your character count in the email?',
-    ["Increase", "Decrease"],
-    index=1)
 def get_files_from_aws(bucket, prefix):
@@ -289,9 +355,10 @@ if st.button('Generate Predictions'):
         # print("Getting Data Time: %s seconds" % (time.time() - start_time))
         industry_code_dict = get_industry_code_dict(email_data)
-        bytes_data = uploaded_file.getvalue()
-        email_body, character_cnt, url_cnt = email_body_extractor(bytes_data)
         # Start the prediction
         # Need to solve X test issue
@@ -314,11 +381,12 @@ if st.button('Generate Predictions'):
                 character_cnt), unsafe_allow_html=True)
             # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
             if target == 'conversion_rate':
-                target_vis = 'Conversion Rate'
             else:
-                target_vis = 'Click-to-Open Rate'
             st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
-                target_vis, str(round(output_rate*100, 2))), unsafe_allow_html=True)
             selected_industry_code = industry_code_dict.get(industry)
             if target == "click_to_open_rate":
@@ -335,16 +403,34 @@ if st.button('Generate Predictions'):
             df_reco_sort = df_reco.sort_values(by=[selected_variable])
             df_reco = df_reco.drop_duplicates(subset=selected_variable)
-            preference = char_reco_preference
-            if preference == "Increase":
-                df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
-                    df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
-                df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
             # decrease character reco
-            if preference == "Decrease":
-                df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
-                    df_reco["character_cnt"] < character_cnt)]
-                df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
             if selected_variable == "Open_Rate":
                 selected_variable = "Click-to-Open_Rate"
@@ -357,12 +443,54 @@ if st.button('Generate Predictions'):
                 st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
                     selected_variable), unsafe_allow_html=True)
             else:
                 for _, row in df_reco_opt_rank.iterrows():
                     Character_Count = row[1]
-                    selected_variable = row[3]
-                    # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
-                    st.markdown('Number of Characters: {}, Target Rate: {}'.format(
-                        int(Character_Count), round(selected_variable*100, 3)))
         placeholder.empty()
         # print(time.time() - start_time)

 from bokeh.models.widgets import Div
 import email
+#from ipyfilechooser import FileChooser
+#from IPython.display import display
+from io import BytesIO
+from bs4 import BeautifulSoup
+import matplotlib.pyplot as plt
+import numpy as np
 def table_data():
         zip(training_dataset.industry, training_dataset.industry_code))
     return industry_code_dict
+def parse_email(uploaded_file):
+    parsed_email = []
+    efile = open(uploaded_file.name,'r')
+    emailstr = ""
+    for i, line in enumerate(efile):
+        emailstr += line
+    b = email.message_from_string(emailstr)
+    for part in b.walk():
+        if part.get_content_type():
+            body = str(part.get_payload())
+            soup = BeautifulSoup(body)
+            paragraphs = soup.find_all('body')
+            for paragraph in paragraphs:
+                parsed_email.append(paragraph.text)
+    return parsed_email
+def email_upload():
+    print("Please upload your email (In HTML Format)")
+#     upload = FileUpload(accept='.html', multiple=True)
+#     display(upload)
+#     return upload
+    fc = FileChooser()
+    display(fc)
+    return fc
+# New - In-Use
+def email_extractor(email_uploaded):
+    parse = parse_email(email_uploaded)
+    email_text = ''.join(parse).strip()
+    # extract the email body using string manipulation functions
+    email_body_start_index = email_text.find('Bright Apps LLC')
+    email_body_end_index = email_text.find('To read more')
+    email_body = email_text[email_body_start_index:email_body_end_index].strip()
+    # get rid of non-text elements
+    email_body = email_body.replace('\n', '')
+    email_body = email_body.replace('\t', '')
+    email_body = email_body.replace('\r', '')
+    email_body = email_body.replace('</b>', '')
+    email_body = email_body.replace('<b>', '')
+    email_body = email_body.replace('\xa0', '')
+    # find length of URLs if any
+    extractor = URLExtract()
+    urls = extractor.find_urls(email_body)
+    url_cnt = len(urls)
+    # remove URLs and get character count
+    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', email_body)
+    sep = '©'
+    body = body.split(sep, 1)[0]
+    character_cnt = sum(not chr.isspace() for chr in body)
+    return email_body, character_cnt, url_cnt
 # extract email body from parse email
 def email_body_extractor(email_data):
     'Survey',
     'Newsletter',
     'Engagement',
+    'Usage_and_Consumption',
     'Review_Request',
     'Product_Announcement',
     'Abandoned_Cart'
 st.markdown("""---""")
+#char_reco_preference = st.selectbox(
+#    'Do you want to increase or decrease your character count in the email?',
+#    ["Increase", "Decrease"],
+#    index=1)
 def get_files_from_aws(bucket, prefix):
         # print("Getting Data Time: %s seconds" % (time.time() - start_time))
         industry_code_dict = get_industry_code_dict(email_data)
+        #uploaded_file = FileChooser(uploaded_file)
+        #bytes_data = uploaded_file.getvalue()
+        email_body, character_cnt, url_cnt = email_extractor(uploaded_file)
         # Start the prediction
         # Need to solve X test issue
                 character_cnt), unsafe_allow_html=True)
             # st.info('The model predicts that it achieves a {} of {}%'.format(target, str(round(output_rate*100,2))))
             if target == 'conversion_rate':
+                target_vis = 'Click_Through_Rate'
             else:
+                target_vis = 'Open_Rate'
             st.markdown('#### The model predicts that it achieves a <span style="color:blue">{}</span> of <span style="color:blue">{}</span>%'.format(
+                target_vis, str(round(output_rate*100, 3))), unsafe_allow_html=True)
             selected_industry_code = industry_code_dict.get(industry)
             if target == "click_to_open_rate":
             df_reco_sort = df_reco.sort_values(by=[selected_variable])
             df_reco = df_reco.drop_duplicates(subset=selected_variable)
+            #preference = char_reco_preference
+            #if preference == "Increase":
+            #    df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
+            #        df_reco["character_cnt"] > character_cnt) & (df_reco["character_cnt"] <= (1.5*character_cnt))]
+            #    df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
             # decrease character reco
+            #if preference == "Decrease":
+            #    df_reco_opt = df_reco[(df_reco[selected_variable] > output_rate) & (
+            #        df_reco["character_cnt"] < character_cnt)]
+            #    df_reco_opt_rank = df_reco_opt.nlargest(3, [selected_variable])
+            # split into two dataframes of higher and lower character_cnt (added apr 2023)
+            char_cnt_uploaded = character_cnt
+            df_reco_opt1 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] > char_cnt_uploaded) & (df_reco["character_cnt"] <= (1.5*char_cnt_uploaded))]
+            df_reco_opt2 = df_reco[(df_reco[selected_variable] > output_rate) & (df_reco["character_cnt"] < char_cnt_uploaded) & (df_reco["character_cnt"] >= (char_cnt_uploaded/2))]
+            # drop duplicates of character_cnt keeping the row with the highest output_rate
+            df_reco_opt1 = df_reco_opt1.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
+            df_reco_opt2 = df_reco_opt2.sort_values(by=[selected_variable], ascending=False).drop_duplicates(subset=["character_cnt"])
+            # get top 2 largest in higher and lower dataframe
+            df_reco_opt_rank1 = df_reco_opt1.nlargest(2, [selected_variable])
+            df_reco_opt_rank2 = df_reco_opt2.nlargest(2, [selected_variable])
+            df_reco_opt_rank = pd.concat([df_reco_opt_rank1, df_reco_opt_rank2])
+            df_reco_opt_rank = df_reco_opt_rank.nlargest(3,[selected_variable])
             if selected_variable == "Open_Rate":
                 selected_variable = "Click-to-Open_Rate"
                 st.markdown('#### You ve already achieved the highest, <span style="color:blue">{}</span>, with the current character count!'.format(
                     selected_variable), unsafe_allow_html=True)
             else:
+                #for _, row in df_reco_opt_rank.iterrows():
+                #    Character_Count = row[1]
+                #    selected_variable = row[3]
+                    # print(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(selected_variable, 3)*100}", "%")
+                #    st.markdown('Number of Characters: {}, Target Rate: {}'.format(
+                #        int(Character_Count), round(selected_variable*100, 3)))
+                chars = []
+                sel_var_values = []
                 for _, row in df_reco_opt_rank.iterrows():
                     Character_Count = row[1]
+                    selected_variable_number = row[3]
+                    chars.append(int(Character_Count))
+                    sel_var_values.append(round(selected_variable_number, 3)*100)
+                    st.write(f"·Number of Characters: {int(Character_Count)}, Target Rate: {round(round(selected_variable_number, 3)*100, 3)}", "%")
+                st.write("\n")
+                if len(chars) > 1:
+                    #fig = plt.figure()
+                    #ax = fig.add_axes([0,0,1,1])
+                    fig, ax = plt.subplots(figsize=(10,4))
+                    bars = ax.barh(np.arange(len(chars)), sel_var_values, height=0.175, color='#0F4D60')
+                    #ax.bar_label(bars)
+                    ax.set_yticks(np.arange(len(chars)))
+                    ax.set_yticklabels(np.array(chars), fontsize=14)
+                    ax.set_title('Character Counts vs. Target Variable Rates', fontsize=18)
+                    ax.set_ylabel('Character Counts', fontsize=16)
+                    ax.set_xlabel('Target Rates %', fontsize=16)
+                    for i, bar in enumerate(bars):
+                        rounded_value = round(sel_var_values[i], 2)
+                        ax.text(bar.get_width() + 0.3, bar.get_y() + bar.get_height()/2, str(rounded_value) + '%', ha='left', va='center', fontsize=12, fontweight='bold')
+                    ax.margins(0.1,0.05)
+                    biggest_bar_index = np.argmax(sel_var_values)
+                    bars[biggest_bar_index].set_color('#00BF93')
+                    st.plotly_chart(fig, use_container_width=True)
+                    st.write("\n")
+                    #st.write(np.array(chars))
+                chars_out = dict(zip(chars, sel_var_values))
+                sorted_chars_out = sorted(chars_out.items(), key=lambda x: x[1], reverse=True)
         placeholder.empty()
         # print(time.time() - start_time)