Spaces:
Runtime error
Runtime error
Andy Lau
committed on
Commit
·
b6af401
1
Parent(s):
5f9dcc2
working model CC
Browse files- app.py +262 -15
- main_app.py +15 -0
- models/models.sav +0 -0
- requirements.txt +1 -0
app.py
CHANGED
@@ -1,9 +1,55 @@
|
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import PIL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
from bokeh.models.widgets import Div
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
def url_button(button_name,url):
|
9 |
if st.button(button_name):
|
@@ -12,20 +58,89 @@ def url_button(button_name,url):
|
|
12 |
div = Div(text=html)
|
13 |
st.bokeh_chart(div)
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
|
|
|
|
|
|
|
18 |
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
|
26 |
|
27 |
with stats_col1:
|
28 |
-
st.metric(label="Production", value="
|
29 |
with stats_col2:
|
30 |
st.metric(label="Accuracy", value="85%")
|
31 |
|
@@ -36,15 +151,26 @@ with stats_col4:
|
|
36 |
st.metric(label="Industry", value="Email")
|
37 |
|
38 |
|
39 |
-
with st.sidebar:
|
40 |
|
|
|
41 |
|
42 |
with st.expander('Model Description', expanded=False):
|
|
|
|
|
43 |
st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')
|
44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
-
url_button('Model Homepage','https://www.loxz.com/#/models/
|
47 |
-
url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
|
48 |
url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
|
49 |
|
50 |
|
@@ -52,12 +178,12 @@ industry_lists = [
|
|
52 |
'Retail',
|
53 |
'Software and Technology',
|
54 |
'Hospitality',
|
55 |
-
'
|
56 |
'Healthcare',
|
57 |
'Energy',
|
58 |
'Real Estate',
|
59 |
'Entertainment',
|
60 |
-
'
|
61 |
]
|
62 |
|
63 |
campaign_types = [
|
@@ -74,12 +200,19 @@ campaign_types = [
|
|
74 |
]
|
75 |
|
76 |
target_variables = [
|
77 |
-
'
|
78 |
-
'
|
79 |
-
'abandoned_cart',
|
80 |
-
'unsubscribe_rate'
|
81 |
]
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
industry = st.selectbox(
|
84 |
'Please select your industry',
|
85 |
industry_lists
|
@@ -94,3 +227,117 @@ target = st.selectbox(
|
|
94 |
'Please select your target variable',
|
95 |
target_variables
|
96 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ast import arg
|
2 |
import streamlit as st
|
3 |
import pandas as pd
|
4 |
import PIL
|
5 |
+
import re
|
6 |
+
from io import StringIO
|
7 |
+
import boto3
|
8 |
+
|
9 |
+
# from joblib import dump, load
|
10 |
+
|
11 |
+
import joblib
|
12 |
|
13 |
from bokeh.models.widgets import Div
|
14 |
|
15 |
+
import email
|
16 |
+
from urlextract import URLExtract
|
17 |
+
|
18 |
+
import main_app
|
19 |
+
|
20 |
+
|
21 |
+
def table_data():
    """Build the two-column table of model facts.

    Returns:
        A pandas DataFrame with a ``Field`` column (attribute name) and a
        ``Data`` column (attribute value), one row per attribute.
    """
    # Each pair is (field label, value); the order here is the row order.
    rows = [
        ('Data Scientist', 'Chen Song'),
        ('Dataset', 'Internal + Campaign monitor'),
        ('Algorithm', 'Random Forest'),
        ('Framework', 'Sci-kit learn'),
        ('Ensemble', 'Bootstrapping'),
        ('Domain', 'Bootstrapping Aggregation'),
        ('Model Size', '4 KB'),
    ]

    labels = [label for label, _ in rows]
    values = [value for _, value in rows]

    return pd.DataFrame.from_dict({'Field': labels, 'Data': values})
|
51 |
+
|
52 |
+
|
53 |
|
54 |
def url_button(button_name,url):
|
55 |
if st.button(button_name):
|
|
|
58 |
div = Div(text=html)
|
59 |
st.bokeh_chart(div)
|
60 |
|
61 |
+
def get_industry_code_dict(training_dataset):
    """Map each industry name in the training data to its categorical code.

    The codes are the integer codes pandas assigns when the ``industry``
    column is converted to a categorical dtype.

    Unlike the previous implementation, the input frame is left untouched:
    no ``industry_code`` column is added and no existing categorical column
    is rewritten to integer codes.

    Args:
        training_dataset: DataFrame with an ``industry`` column.

    Returns:
        dict mapping industry value -> integer category code.
    """
    industries = training_dataset['industry'].astype('category')
    return dict(zip(industries, industries.cat.codes))
|
67 |
+
|
68 |
|
69 |
+
## extract email body from parse email
|
70 |
+
def _part_text(part):
    """Return the text payload of a non-multipart message part.

    Parts whose Content-Transfer-Encoding is base64 or quoted-printable are
    decoded to text; any other part is returned exactly as the parser stored it.
    """
    transfer_encoding = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if transfer_encoding not in ('base64', 'quoted-printable'):
        return part.get_payload()

    raw = part.get_payload(decode=True)
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='replace')
    except LookupError:
        # The message names a charset Python does not know; fall back to UTF-8.
        return raw.decode('utf-8', errors='replace')


def email_body_extractor(email_data):
    """Extract the body text of an uploaded email and count characters and URLs.

    Args:
        email_data: raw bytes of the uploaded email.

    Returns:
        Tuple ``(body, character_cnt, url_cnt)``:
        body is the cleaned text up to (not including) the first '©';
        character_cnt is the number of non-whitespace characters in body;
        url_cnt is the number of URLs found before URLs were stripped.
    """
    # Undecodable bytes become U+FFFD instead of aborting the whole request.
    emailstr = email_data.decode("utf-8", errors="replace")
    message = email.message_from_string(emailstr)
    body = ""

    if message.is_multipart():
        for part in message.walk():
            ctype = part.get_content_type()
            cdispo = str(part.get('Content-Disposition'))

            # Take the first text/plain part that is not an attachment.
            if ctype == 'text/plain' and 'attachment' not in cdispo:
                body = _part_text(part)
                break
    else:
        # Not multipart: the single payload is the body.
        body = _part_text(message)

    # Remove line breaks, tabs and bold tags.
    for token in ('\n', '\t', '\r', '</b>', '<b>'):
        body = body.replace(token, '')

    # Count URLs before they are removed from the text.
    extractor = URLExtract()
    url_cnt = len(extractor.find_urls(body))

    # Remove URLs, then drop everything from the first copyright sign on.
    body = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', body)
    body = body.split('©', 1)[0]
    character_cnt = sum(not ch.isspace() for ch in body)

    return body, character_cnt, url_cnt
|
107 |
+
|
108 |
+
# def select_char_preference_variables():
|
109 |
+
# opt_list = ["Increase", "Decrease"]
|
110 |
+
# button_option = widgets.RadioButtons(options = opt_list)
|
111 |
+
# print("Do you want to increase or decrease your character count in the email?")
|
112 |
+
# display(button_option)
|
113 |
+
# return button_option
|
114 |
+
|
115 |
+
|
116 |
+
def add_bg_from_url():
    """Inject a CSS rule giving the ``.stApp`` element a fixed gradient background.

    Despite the name, no URL is involved: the background is a CSS
    ``linear-gradient`` written into the page through ``st.markdown``.
    """
    gradient_css = """
<style>
.stApp {
background-image: linear-gradient(#0A3144,#126072,#1C8D99);
background-attachment: fixed;
background-size: cover
}
</style>
"""
    st.markdown(gradient_css, unsafe_allow_html=True)
|
129 |
+
|
130 |
+
# add_bg_from_url()
|
131 |
+
|
132 |
+
st.markdown("# Character Count: Email Industry")
|
133 |
+
|
134 |
+
# col1, col2, col3 = st.columns([1,1,1])
|
135 |
+
|
136 |
+
# with col2:
|
137 |
+
# img = PIL.Image.open("figures/ModelCC_solid.png")
|
138 |
+
# st.image(img)
|
139 |
|
140 |
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
|
141 |
|
142 |
with stats_col1:
|
143 |
+
st.metric(label="Production", value="Production")
|
144 |
with stats_col2:
|
145 |
st.metric(label="Accuracy", value="85%")
|
146 |
|
|
|
151 |
st.metric(label="Industry", value="Email")
|
152 |
|
153 |
|
|
|
154 |
|
155 |
+
with st.sidebar:
|
156 |
|
157 |
with st.expander('Model Description', expanded=False):
|
158 |
+
img = PIL.Image.open("figures/ModelCC.png")
|
159 |
+
st.image(img)
|
160 |
st.markdown('Finding the correct length for an email campaign to maximize user engagement can be an ambiguous task. The Loxz Character Count Model allows you to predict the correct length of your emails for a particular industry and a particular type of email. Using these inputs and trained on an extensive proprietary data set from the Loxz family digital archive, the models incorporate real-world and synthetic data to find the optimized character counts. We applied the random forest algorithm in this model. Bootstrapping was also ensembled in the algorithm which effectively prevents overfitting by reducing variance. The model achieves an 86% accuracy on the test set. This inference-based ML model will help the campaign engineers start with an acceptable length and zero in on the best character count, maximizing engagement in their campaign.')
|
161 |
|
162 |
+
with st.expander('Model Information', expanded=False):
|
163 |
+
hide_table_row_index = """
|
164 |
+
<style>
|
165 |
+
thead tr th:first-child {display:none}
|
166 |
+
tbody th {display:none}
|
167 |
+
</style>
|
168 |
+
"""
|
169 |
+
st.markdown(hide_table_row_index, unsafe_allow_html=True)
|
170 |
+
st.table(table_data())
|
171 |
|
172 |
+
url_button('Model Homepage','https://www.loxz.com/#/models/CTA')
|
173 |
+
# url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
|
174 |
url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
|
175 |
|
176 |
|
|
|
178 |
'Retail',
|
179 |
'Software and Technology',
|
180 |
'Hospitality',
|
181 |
+
'Academic and Education',
|
182 |
'Healthcare',
|
183 |
'Energy',
|
184 |
'Real Estate',
|
185 |
'Entertainment',
|
186 |
+
'Finance and Banking'
|
187 |
]
|
188 |
|
189 |
campaign_types = [
|
|
|
200 |
]
|
201 |
|
202 |
target_variables = [
|
203 |
+
'conversion_rate',
|
204 |
+
'click_to_open_rate'
|
|
|
|
|
205 |
]
|
206 |
|
207 |
+
uploaded_file = st.file_uploader("Please upload your email (In HTML Format)", type=["html"])
|
208 |
+
|
209 |
+
if uploaded_file is None:
|
210 |
+
# upload_img = PIL.Image.open(uploaded_file)
|
211 |
+
upload_img = None
|
212 |
+
# else:
|
213 |
+
# upload_img = None
|
214 |
+
|
215 |
+
|
216 |
industry = st.selectbox(
|
217 |
'Please select your industry',
|
218 |
industry_lists
|
|
|
227 |
'Please select your target variable',
|
228 |
target_variables
|
229 |
)
|
230 |
+
|
231 |
+
st.markdown("""---""")
|
232 |
+
|
233 |
+
char_reco_preference = st.selectbox(
|
234 |
+
'Do you want to increase or decrease your character count in the email?',
|
235 |
+
["Increase", "Decrease"])
|
236 |
+
|
237 |
+
|
238 |
+
def get_files_from_aws(bucket,prefix):
    """Download one CSV object from S3 and load it into a DataFrame.

    Credentials are read from the Streamlit secrets ``aws_id`` and ``aws_key``.

    Args:
        bucket (str): bucket name.
        prefix (str): key of the object inside the bucket.

    Returns:
        pandas DataFrame parsed from the object's UTF-8 decoded contents.
    """
    credentials = {
        'aws_access_key_id': st.secrets["aws_id"],
        'aws_secret_access_key': st.secrets["aws_key"],
    }
    s3 = boto3.client('s3', **credentials)

    response = s3.get_object(Bucket=bucket, Key=prefix)
    csv_text = response['Body'].read().decode('utf-8')

    return pd.read_csv(StringIO(csv_text))
|
256 |
+
|
257 |
+
|
258 |
+
|
259 |
+
|
260 |
+
# Maps each selectable target to (column in the training data, label shown to the user).
_TARGET_COLUMNS = {
    "click_to_open_rate": ("Open_Rate", "Click-to-Open_Rate"),
    "conversion_rate": ("Click_Through_Rate", "Conversion_Rate"),
}


def _recommend_character_counts(history, industry_code, rate_column,
                                predicted_rate, character_cnt, preference):
    """Pick up to three historical rows that beat the predicted rate.

    Args:
        history: training DataFrame with ``industry_code``, ``character_cnt``,
            ``url_cnt`` and ``rate_column`` columns.
        industry_code: code of the selected industry, compared against the
            ``industry_code`` column.
        rate_column: name of the rate column to optimise.
        predicted_rate: raw model output for the uploaded email (same scale
            as ``rate_column``, i.e. not multiplied by 100).
        character_cnt: character count of the uploaded email.
        preference: "Increase" keeps rows longer than the email (up to 1.5x
            its length); any other value keeps rows shorter than the email.

    Returns:
        DataFrame holding the (at most) three rows with the highest rate.
    """
    candidates = history[["industry_code", "character_cnt", "url_cnt", rate_column]]
    # Filter on the numeric code; the column holds codes, not industry names.
    candidates = candidates[candidates["industry_code"] == industry_code].copy()
    candidates[rate_column] = candidates[rate_column].round(3)
    candidates = candidates.drop_duplicates(subset=rate_column)

    beats_prediction = candidates[rate_column] > predicted_rate
    if preference == "Increase":
        length_ok = (candidates["character_cnt"] > character_cnt) & (
            candidates["character_cnt"] <= 1.5 * character_cnt
        )
    else:
        length_ok = candidates["character_cnt"] < character_cnt

    return candidates[beats_prediction & length_ok].nlargest(3, rate_column)


if st.button('Generate Predictions'):
    if uploaded_file is None:
        st.error('Please upload a email (HTML format)')
    else:
        placeholder = st.empty()
        placeholder.text('Loading Data')

        model = joblib.load('models/models.sav')

        # Only the two frames used below are downloaded: the email data set
        # (for the industry -> code mapping) and the model training set
        # (for the recommendations).
        email_data = get_files_from_aws('emailcampaigntrainingdata','trainingdata/email_dataset_training.csv')
        training_dataset = get_files_from_aws('emailcampaigntrainingdata','modelCC/training.csv')

        industry_code = get_industry_code_dict(email_data).get(industry)
        email_body, character_cnt, url_cnt = email_body_extractor(uploaded_file.getvalue())

        if industry_code is None:
            # Without a code the model input would contain None.
            st.error('Sorry, the training data has no records for the industry you selected.')
        else:
            features = pd.DataFrame(
                [[industry_code, character_cnt, url_cnt]],
                columns=["industry_code", "character_cnt", "url_cnt"],
            )
            predicted_rate = model.predict(features)[0]
            output_rate = round(predicted_rate*100,2)

            if output_rate < 0:
                # Shown in the app; a print() would only reach the server log.
                st.error("Sorry, Current model couldn't provide predictions on the target variable you selected.")
            else:
                st.info('Current Character Count in Your Email is: {}'.format(character_cnt))
                st.info('The model predicts that it achieves a {} of {}%'.format(target, str(output_rate)))

                rate_column, rate_label = _TARGET_COLUMNS[target]
                # Compare against the raw prediction: the training rates are
                # fractions, while output_rate is a percentage.
                top_options = _recommend_character_counts(
                    training_dataset, industry_code, rate_column,
                    predicted_rate, character_cnt, char_reco_preference,
                )

                st.info('To get higher, {},the model recommends the following options:'.format(rate_label))
                if len(top_options) == 0:
                    st.info('You ve already achieved the highest, {}, with the current character count!'.format(rate_label))
                else:
                    for _, row in top_options.iterrows():
                        st.info('Number of Characters: {}, Target Rate: {}'.format(
                            int(row["character_cnt"]), round(row[rate_column]*100, 1)))

        placeholder.empty()
|
main_app.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
# Use the full browser width for the page layout.
st.set_page_config(layout="wide")

# CSS written into the page: a solid blue background (expressed as a
# gradient between two identical colours) with white text on <body>.
_PAGE_STYLE = """
<style>
body {
background-image: linear-gradient(#2e7bcf,#2e7bcf);
color: white;
}
</style>
"""

st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
|
models/models.sav
ADDED
Binary file (3.22 kB). View file
|
|
requirements.txt
CHANGED
@@ -1 +1,2 @@
|
|
1 |
bokeh==2.4.1
|
|
|
|
1 |
bokeh==2.4.1
|
2 |
+
joblib
|