Spaces:
Runtime error
Runtime error
Andy Lau
committed on
Commit
•
af1cbd8
1
Parent(s):
282fc84
added standard file
Browse files- FunctionsModelSA_V1.py +377 -0
- app.py +174 -0
- figures/ModelSA.png +0 -0
- main_app.py +15 -0
- requirements.txt +4 -0
- utils.py +84 -0
FunctionsModelSA_V1.py
ADDED
@@ -0,0 +1,377 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import s3fs
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from numpy import arange
|
5 |
+
from colour import Color
|
6 |
+
import plotly.graph_objects as go
|
7 |
+
from nltk import tokenize
|
8 |
+
from IPython.display import Markdown
|
9 |
+
from PIL import ImageColor
|
10 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
11 |
+
import nltk
|
12 |
+
nltk.download('punkt')
|
13 |
+
import email
|
14 |
+
import codecs
|
15 |
+
import pickle
|
16 |
+
import string
|
17 |
+
from scipy import spatial
|
18 |
+
import re
|
19 |
+
import pytorch_lightning as pl
|
20 |
+
from bs4 import BeautifulSoup
|
21 |
+
import ipywidgets as widgets
|
22 |
+
from ipywidgets import FileUpload
|
23 |
+
from urlextract import URLExtract
|
24 |
+
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
|
25 |
+
import torch.nn as nn
|
26 |
+
import torch
|
27 |
+
from ipywidgets import interact, Dropdown
|
28 |
+
import boto3
|
29 |
+
# from sagemaker import get_execution_role
|
30 |
+
from scipy import spatial
|
31 |
+
from ipyfilechooser import FileChooser
|
32 |
+
import streamlit as st
|
33 |
+
import utils
|
34 |
+
|
35 |
+
|
36 |
+
PARAMS={
|
37 |
+
'BATCH_SIZE': 8,
|
38 |
+
'MAX_TOKEN_COUNT':100,
|
39 |
+
'BERT_MODEL_NAME':'google/bert_uncased_L-2_H-128_A-2' ,
|
40 |
+
'N_EPOCHS': 10,
|
41 |
+
'n_classes':8,
|
42 |
+
'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
|
43 |
+
'label_joyful', 'label_opstimistic', 'label_respectful',
|
44 |
+
'label_urgent'],
|
45 |
+
'TEXTCOL': 'text',
|
46 |
+
'rf_labels':['label_analytical', 'label_casual', 'label_confident',
|
47 |
+
'label_friendly', 'label_joyful', 'label_opstimistic',
|
48 |
+
'label_respectful', 'label_urgent',
|
49 |
+
'industry_Academic and Education', 'industry_Energy',
|
50 |
+
'industry_Entertainment', 'industry_Finance and Banking',
|
51 |
+
'industry_Healthcare', 'industry_Hospitality', 'industry_Real Estate',
|
52 |
+
'industry_Retail', 'industry_Software and Technology',
|
53 |
+
'campaign_type_Abandoned_Cart', 'campaign_type_Engagement',
|
54 |
+
'campaign_type_Newsletter', 'campaign_type_Product_Announcement',
|
55 |
+
'campaign_type_Promotional', 'campaign_type_Review_Request',
|
56 |
+
'campaign_type_Survey', 'campaign_type_Transactional',
|
57 |
+
'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
|
58 |
+
}
|
59 |
+
|
60 |
+
# # CI_rates=pd.read_csv('CI_RATES.csv')
|
61 |
+
# s3://emailcampaignmodeldata/ModelSADataSets/CI_RATES.csv
|
62 |
+
CI_rates = utils.get_files_from_aws('emailcampaignmodeldata','ModelSADataSets/CI_RATES.csv')
|
63 |
+
|
64 |
+
### create file uploading widget
|
65 |
+
def email_upload():
    """Show an ``.eml`` file-upload widget in a notebook and return it.

    Returns:
        The ``FileUpload`` widget (multiple selection enabled). Its contents
        become available once the user has picked files in the rendered UI.
    """
    # ``display`` is only available as an implicit global inside IPython
    # sessions; import it explicitly so this does not raise NameError when the
    # module is imported from a plain Python process.
    from IPython.display import display

    print("Please upload your email (In EML Format)")
    upload = FileUpload(accept='.eml', multiple=True)
    display(upload)
    return upload
|
70 |
+
|
71 |
+
def parse_email(uploaded_file):
    """Return the concatenated text of every ``<body>`` tag in an uploaded email.

    uploaded_file- upload widget whose ``value`` maps file names to dicts that
                   hold the raw bytes under ``'content'``; only the first
                   uploaded file is read.

    Returns a single string ('' when no ``<body>`` element is found).
    """
    uploads = uploaded_file.value
    first_name = [*uploads.keys()][0]
    raw_message = codecs.decode(uploads[first_name]['content'], encoding="utf-8")
    message = email.message_from_string(raw_message)

    body_texts = []
    for part in message.walk():
        if not part.get_content_type():
            continue
        # The payload is stringified as-is (no transfer-encoding decoding).
        markup = BeautifulSoup(str(part.get_payload()))
        body_texts.extend(tag.text for tag in markup.find_all('body'))
    return "".join(body_texts)
|
85 |
+
|
86 |
+
|
87 |
+
def text_clean(x, punct=True):
    """Normalise raw email text before tokenisation.

    x- the text to clean
    punct- when True, every ``string.punctuation`` character is replaced by a
           space; when False punctuation is kept

    The steps run in a fixed order: lowercase, drop non-ASCII characters, then
    a sequence of regex substitutions (links, apostrophe suffixes, tokens
    containing digits, whitespace runs, isolated symbols, ``@``/``#`` tags),
    ``=`` removal, optional punctuation removal, isolated single letters and
    digits, and finally tabs/newlines.

    Returns the cleaned string.
    """
    text = x.lower().encode('ascii', 'ignore').decode()

    # Order matters: later patterns see the output of earlier ones.
    space_out = (
        r'https*\S+',        # links
        r'http*\S+',
        r'\'\w+',            # apostrophe suffixes such as 's
        r'\w*\d+\w*',        # any token containing a digit
        r'\s{2,}',           # collapse whitespace runs
        r'\s[^\w\s]\s',      # a lone symbol between spaces
        r'@\S',              # '@' plus the single following character
        r'#\S+',             # hashtags
    )
    for pattern in space_out:
        text = re.sub(pattern, ' ', text)

    text = text.replace('=', ' ')

    if punct == True:
        text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # NOTE(review): the original indentation was lost in extraction; this step
    # is assumed to run regardless of `punct` -- confirm against the real file.
    text = re.sub(r'\s[a-z]\s|\s[0-9]\s', ' ', text)

    for leftover in (' Â\x8a', '\t', '\n', 'Ã\x83', 'Â\x92', 'Â\x93', 'Â\x8a', 'Â\x95'):
        text = text.replace(leftover, '')

    return text
|
113 |
+
|
114 |
+
#### BERT MODEL LOAD REQUIREMENTS ####
|
115 |
+
|
116 |
+
|
117 |
+
class ToneTagger(pl.LightningModule):
    """Pre-trained BERT encoder followed by one linear layer and a sigmoid.

    The encoder checkpoint is the one named by ``PARAMS['BERT_MODEL_NAME']``;
    the linear head maps the pooled encoder output to ``n_classes`` scores.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        encoder = BertModel.from_pretrained(PARAMS['BERT_MODEL_NAME'], return_dict=True)
        self.bert = encoder
        self.classifier = nn.Linear(encoder.config.hidden_size, n_classes)
        # Stored but not used anywhere in this class.
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask):
        """Return sigmoid-activated class scores for the encoded inputs."""
        encoded = self.bert(input_ids, attention_mask)
        scores = self.classifier(encoded.pooler_output)
        return torch.sigmoid(scores)
|
132 |
+
|
133 |
+
|
134 |
+
# Load the pre-trained tone model: build the 8-tone architecture, then
# populate it with the saved weights.
model = ToneTagger(8)
# map_location keeps loading working on CPU-only hosts even if the checkpoint
# was saved from a GPU; the model itself is never moved off the CPU here.
model.load_state_dict(torch.load("models/SAMODEL", map_location=torch.device("cpu")))
model.eval()  # switch to evaluation mode for inference
|
138 |
+
|
139 |
+
|
140 |
+
|
141 |
+
def bert_tones(text_sentences, model):
    """Clean each sentence and predict its tone scores with the BERT model.

    text_sentences- iterable of raw sentence strings
    model- callable taking ``(input_ids, attention_mask)`` tensors and
           returning a tensor of scores (one row per input)

    Returns ``(text, predictions)``: ``text`` holds each sentence cleaned with
    punctuation kept (for display), ``predictions`` holds one numpy array of
    scores per sentence, in the same order.
    """
    # Use the shared constants instead of repeating the checkpoint name and
    # token limit, so the tokenizer cannot drift from the encoder settings.
    tokenizer = BertTokenizer.from_pretrained(PARAMS['BERT_MODEL_NAME'])
    max_length = PARAMS['MAX_TOKEN_COUNT']

    text = []
    predictions = []
    for sent in text_sentences:
        # Display copy keeps punctuation; the model input has it stripped.
        text.append(text_clean(sent, False))
        encoding = tokenizer.encode_plus(
            text_clean(sent),
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        with torch.no_grad():
            pred = model(encoding['input_ids'], encoding['attention_mask'])
        # One sentence per call, so row 0 is the only row.
        predictions.append(np.array(pred.cpu().numpy()[0]))

    return text, predictions
|
168 |
+
|
169 |
+
|
170 |
+
def convert_text_to_tone(text, model=model, params=PARAMS):
    """Split text into sentences, score each one, and average the tone scores.

    text- raw text, punctuation included (needed for sentence splitting)
    model- tone model handed to ``bert_tones``
    params- settings dict; ``params['LABEL_COLUMNS']`` names the tone columns

    Returns ``(final, tones)``: ``final`` is a one-row DataFrame with columns
    ``text`` (cleaned sentences), ``sentiment`` (VADER polarity scores of the
    cleaned full text) and ``sentencetone`` (per-sentence score arrays);
    ``tones`` is a one-row DataFrame of the scores averaged over sentences.
    """
    # VADER sentiment of the whole cleaned text (carried along in `final`).
    sentiment_dict = SentimentIntensityAnalyzer().polarity_scores(text_clean(text))

    # Sentence boundaries come from NLTK.
    plain_text, predictions = bert_tones(tokenize.sent_tokenize(text), model)

    final = pd.DataFrame(
        [[plain_text, sentiment_dict, predictions]],
        columns=['text', 'sentiment', 'sentencetone'],
    )
    mean_tone = np.mean(predictions, axis=0)
    tones = pd.DataFrame([mean_tone], columns=params['LABEL_COLUMNS'])
    return final, tones
|
189 |
+
|
190 |
+
|
191 |
+
### This will be abstracted away to a more dynamic model
def _load_pickled_model(path):
    """Unpickle one rate model from ``path``, closing the file afterwards."""
    # NOTE: pickle executes arbitrary code on load; these files must only ever
    # be trusted artifacts shipped with the project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


brf = 'Rate_Models/bounce_rate_model.sav'
BRM = _load_pickled_model(brf)
orf = 'Rate_Models/open_rate_model.sav'
ORM = _load_pickled_model(orf)
urf = 'Rate_Models/unsubscribe_rate_model.sav'
URM = _load_pickled_model(urf)
crf = 'Rate_Models/click_trough_rate_model.sav'
CRM = _load_pickled_model(crf)
CV = 'Rate_Models/Conversion_rate.sav'
ConM = _load_pickled_model(CV)
CTOR = 'Rate_Models/Click-To-Open_Rates.sav'
CTORM = _load_pickled_model(CTOR)
RV = 'Rate_Models/Revenue_per_email.sav'
RVM = _load_pickled_model(RV)

# Target-variable name -> fitted model used to predict it.
model_dict = {'Open_Rate': ORM,
              'Click_Through_Rate': CRM,
              'Unsubscribe_Rate': URM,
              'Bounce_Rate': BRM,
              'Click_To_Open_Rate': CTORM,
              'Conversion_Rate': ConM,
              'Revenue_Per_Email': RVM}
|
214 |
+
|
215 |
+
|
216 |
+
|
217 |
+
def plot_CI(pred, lower, upper, scale_factor=0.5, streamlit=False):
    """Draw a prediction and its confidence interval on a one-dimensional axis.

    pred- predicted value, shown as a marker and quoted in the title
    lower- lower bound of the interval
    upper- upper bound of the interval
    scale_factor- extra x-axis room past ``upper``, as a fraction of ``upper``
    streamlit- render with ``st.plotly_chart`` when True, else ``fig.show()``
    """
    fig = go.Figure()
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(
        showgrid=False,
        zeroline=True, zerolinecolor='black', zerolinewidth=3,
        showticklabels=False,
    )
    fig.update_layout(height=200, plot_bgcolor='white')

    marker = go.Scatter(
        x=[pred], y=[0,0], mode='markers', marker_size=10,line=dict(color="red")
    )
    fig.add_trace(marker)

    fig.update_layout(xaxis_range=[0,upper+upper*scale_factor])
    fig.update_layout(showlegend=False)

    # Label both interval bounds, then shade the span between them.
    for bound in (lower, upper):
        fig.add_vline(x=bound,annotation_text=f"{bound}",annotation_position="top")
    fig.add_vrect(lower,upper,fillcolor='red',opacity=0.25,annotation_text='95% CI',annotation_position="outside top")

    fig.update_layout(title_text=f'The Predicted Value is {pred}', title_x=0.5)

    render = st.plotly_chart if streamlit else (lambda figure: figure.show())
    render(fig)
|
246 |
+
|
247 |
+
def find_max_cat(df, target, industry, campaign):
    """Find the best-performing row for an industry/campaign combination.

    df- DataFrame with indicator columns named by ``industry`` and ``campaign``
    target- name of the column to maximise
    industry- indicator column that must equal 1
    campaign- indicator column that must equal 1

    Returns ``(best, rec)``: the maximum of ``target`` among matching rows
    rounded to 3 decimals, and the values at column positions 3..10 of the row
    holding that maximum. Returns ``(0, 0)`` when no row matches.
    """
    matches = df[(df[campaign] == 1) & (df[industry] == 1)]
    if len(matches) == 0:
        return 0, 0

    target_values = matches[target]
    best_row = df.loc[target_values.idxmax()]
    # Positions 3..10 are taken by position, so this relies on column order.
    rec = best_row.iloc[3:11]
    return round(target_values.max(), 3), rec
|
254 |
+
|
255 |
+
def recommend(tones, recommend_changes, change, target, streamlit=False):
    """Plot current tone scores alongside the recommended changes.

    tones- DataFrame whose first row holds the current tone scores and whose
           columns name the tones
    recommend_changes- Series of per-tone changes; also determines the bar
                       order (sorted by absolute size)
    change- value quoted (rounded to 3 decimals) in the chart title
    target- metric name quoted in the chart title
    streamlit- render with ``st.plotly_chart`` when True, else ``fig.show()``
    """
    outline = dict(color='rgba(58, 71, 80, 1.0)', width=3)

    current_bar = go.Bar(
        y=tones.columns,
        x=tones.values[0],
        name='Current Tones',
        orientation='h',
        width=.5,
        marker=dict(color='#00e6b1', line=dict(outline)),
    )
    change_bar = go.Bar(
        y=tones.columns,
        x=recommend_changes,
        name='Recommend changes',
        orientation='h',
        text=np.round(recommend_changes,3),
        width=0.3,
        marker=dict(color='#e60f00', line=dict(outline)),
    )

    fig = go.Figure()
    fig.add_trace(current_bar)
    fig.add_trace(change_bar)
    fig.update_traces(textfont_size=18, textposition="outside", cliponaxis=False)
    fig.update_layout(height=700, plot_bgcolor='white')

    # Order the categories by the absolute size of the recommended change.
    category_order = recommend_changes.sort_values(key=abs,ascending=True).index
    fig.update_layout(barmode='stack', yaxis={'categoryorder':'array','categoryarray': category_order})
    fig.update_layout(title_text=f'The following Changes will yield a {round(change,3)} increase in {target}')

    if not streamlit:
        fig.show()
    else:
        st.plotly_chart(fig)
|
292 |
+
|
293 |
+
|
294 |
+
|
295 |
+
def prediction(tones, campaign_val, industry_val, target):
    """Predict ``target`` for the given tones, campaign type and industry.

    tones- tone scores; reshaped onto the ``PARAMS['rf_labels']`` columns with
           missing entries filled with 0
    campaign_val- campaign indicator column to set to 1
    industry_val- industry indicator column to set to 1
    target- key into ``model_dict`` and value matched in ``CI_rates['model']``

    Returns ``(pred, lower, higher, model)`` where the three numbers are
    rounded to 3 decimals and the bounds are ``pred`` plus the ``'2_5'`` and
    ``'97_5'`` offsets stored in ``CI_rates`` for this target.
    """
    features = pd.DataFrame(tones, columns=PARAMS['rf_labels']).fillna(0)
    for indicator in (campaign_val, industry_val):
        features.loc[0, indicator] = 1

    model = model_dict[target]
    pred = model.predict(features)[0]

    interval = CI_rates[CI_rates['model'] == target]
    lower = pred + interval['2_5'].values[0]
    higher = pred + interval['97_5'].values[0]

    rounded = tuple(round(value, 3) for value in (pred, lower, higher))
    return rounded[0], rounded[1], rounded[2], model
|
305 |
+
|
306 |
+
def load_data(buckets, key):
    """Fetch the tone/target CSV from S3 and prepare it for lookups.

    buckets- S3 bucket name
    key- object key of the CSV inside the bucket

    Returns a DataFrame with duplicate rows dropped, ``industry`` and
    ``campaign_type`` one-hot encoded, the ``'Unnamed: 0'`` and ``'body'``
    columns removed, and three rate columns renamed to underscore form.
    """
    renamed_rates = {
        'Click-To-Open Rates':'Click_To_Open_Rate',
        'Conversion Rate':'Conversion_Rate',
        'Revenue Per email':'Revenue_Per_Email',
    }
    raw = utils.get_files_from_aws(buckets,key)
    encoded = pd.get_dummies(raw.drop_duplicates(), columns=['industry','campaign_type'])
    return encoded.drop(columns=['Unnamed: 0','body']).rename(columns=renamed_rates)
|
315 |
+
|
316 |
+
|
317 |
+
def plot_table(sorted_setence_tuple, streamlit=True):
    """Render the sentence table, one colour-coded row per sentence.

    sorted_setence_tuple- list of ``(sentence, score, rgb)`` tuples, already in
                          display order; ``rgb`` is turned into ``'rgb' + str(rgb)``
    streamlit- render with ``st.plotly_chart`` when True, else ``fig.show()``
    """
    columns = list(zip(*sorted_setence_tuple))
    sentences, scores, colors = columns[0], columns[1], columns[2]
    rbg_list = ['rgb'+str(rgb) for rgb in colors]

    header = dict(
        values=['<b>Sentences</b>', '<b>Difference from Recommended Tone</b>'],
        line_color = 'darkslategray',
        fill_color = '#010405',
        align = 'center',
        font=dict(family="Metropolis",color='white', size=16),
    )
    cells = dict(
        values=[sentences, scores],   # 1st and 2nd column
        line_color='darkslategray',
        fill_color=[rbg_list],
        align=['left','center'],
        font=dict(family="Arial",size=12),
    )
    fig = go.Figure(data=[go.Table(header=header, cells=cells)])

    if not streamlit:
        fig.show()
    else:
        st.plotly_chart(fig)
|
344 |
+
|
345 |
+
def corrections(best, df, streamlit=False):
    """Rank sentences by how far their tone is from the best tone and plot them.

    best- tone values of the best email for the current categories
    df- DataFrame whose row 0 holds the sentence list under ``'text'`` and the
        matching per-sentence tone arrays under ``'sentencetone'``
    streamlit- forwarded to ``plot_table`` to choose the renderer

    Each sentence gets the cosine distance between ``best`` and its tone,
    scaled to a 0-100 score, plus a colour from a green-to-red palette. The
    table is sorted with the most distant sentences first.
    """
    # Loxz green primary to Loxz light red.
    colors = ['#48f0c9', '#6ef5d6', '#94f7e1', '#bbfaec', '#e6fff9',
              '#ffe7e6', '#ffc3bf', '#ffa099', '#ff7c73', '#ff584d']
    last_color = len(colors) - 1

    sentence_order = []
    for text, cur in zip(df['text'][0], df['sentencetone'][0]):
        distance = spatial.distance.cosine(best, cur)
        new_value = round(distance * 100)  # distance expressed as a 0-100 score
        # round(distance * 10) reaches 10 for distances >= 0.95 (and beyond for
        # distances above 1), which would overrun the 10-colour palette; clamp
        # the index to the valid range instead of raising IndexError.
        color_value = min(max(round(distance * 10), 0), last_color)
        rbg = ImageColor.getcolor(colors[color_value], "RGB")
        sentence_order.append((text, new_value, rbg))

    sorted_sentences = sorted(sentence_order, key=lambda item: item[1], reverse=True)
    plot_table(sorted_sentences, streamlit)
|
363 |
+
|
364 |
+
def read_file(fc):
    """Read the email file chosen in a file chooser and return its body text.

    fc- object whose ``selected`` attribute is the path of the file to read

    Returns the concatenated text of every ``<body>`` element found in the
    message parts ('' when there is none).
    """
    with open(fc.selected) as handle:
        raw_message = handle.read()

    message = email.message_from_string(raw_message)
    body_texts = []
    for part in message.walk():
        if not part.get_content_type():
            continue
        # The payload is stringified as-is (no transfer-encoding decoding).
        markup = BeautifulSoup(str(part.get_payload()))
        for tag in markup.find_all('body'):
            body_texts.append(tag.text)
    return "".join(body_texts)
|
app.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ast import arg
|
2 |
+
import FunctionsModelSA_V1
|
3 |
+
import streamlit as st
|
4 |
+
import pandas as pd
|
5 |
+
import PIL
|
6 |
+
# import re
|
7 |
+
# from io import StringIO
|
8 |
+
# import boto3
|
9 |
+
import time
|
10 |
+
|
11 |
+
import main_app
|
12 |
+
import utils
|
13 |
+
|
14 |
+
def table_data():
    """Build the two-column "Model Information" table shown in the sidebar.

    Returns a DataFrame with columns ``Field`` and ``Data``, one row per fact.
    """
    facts = [
        ('Data Scientist', 'Jeffrey Ott'),
        ('Dataset', 'Internal + Campaign monitor'),
        ('Algorithm', 'BERT_Uncased_L_2_H_128_A-2, Single Linear Layer Neural Network, Random Forest'),
        ('Framework', 'Pytorch'),
        ('Ensemble', 'Bootstrapping'),
        ('Domain', 'NLP Text Classification'),
        ('Model Size', '16.8 MB'),
    ]
    return pd.DataFrame({
        'Field': [name for name, _ in facts],
        'Data': [value for _, value in facts],
    })
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
def add_bg_from_url():
    """Inject CSS that gives the Streamlit app a fixed gradient background."""
    # NOTE(review): whitespace inside this literal follows the extracted
    # source, where leading indentation was lost -- confirm against the file.
    css = """
<style>
.stApp {
background-image: linear-gradient(#0A3144,#126072,#1C8D99);
background-attachment: fixed;
background-size: cover
}
</style>
"""
    st.markdown(css, unsafe_allow_html=True)
|
60 |
+
|
61 |
+
# add_bg_from_url()
|
62 |
+
|
63 |
+
st.markdown("# Sentiment Analysis: Email Industry")
|
64 |
+
|
65 |
+
# col1, col2, col3 = st.columns([1,1,1])
|
66 |
+
|
67 |
+
# with col2:
|
68 |
+
# img = PIL.Image.open("figures/ModelCC_solid.png")
|
69 |
+
# st.image(img)
|
70 |
+
|
71 |
+
stats_col1, stats_col2, stats_col3, stats_col4 = st.columns([1,1,1,1])
|
72 |
+
|
73 |
+
with stats_col1:
|
74 |
+
st.metric(label="Production", value="Production")
|
75 |
+
with stats_col2:
|
76 |
+
st.metric(label="Accuracy", value="85%")
|
77 |
+
|
78 |
+
with stats_col3:
|
79 |
+
st.metric(label="Speed", value="3.86 ms")
|
80 |
+
|
81 |
+
with stats_col4:
|
82 |
+
st.metric(label="Industry", value="Email")
|
83 |
+
|
84 |
+
|
85 |
+
|
86 |
+
with st.sidebar:
|
87 |
+
|
88 |
+
with st.expander('Model Description', expanded=False):
|
89 |
+
img = PIL.Image.open("figures/ModelSA.png")
|
90 |
+
st.image(img)
|
91 |
+
st.markdown('The model seeks to solve the problem of how to set the tone for an email campaign appropriately. This 5th generation model from the Loxz family uses state-of-the-art NLP to determine and predict the optimized sentiment of an email using tokenization techniques. The model will analyze any email text “shape” and help the user understand the tone and how that tone correlates with the metric of interest. We applied a pre-trained tiny BERT model to vectorize the email campaign text body, then a softmax dense layer was added to get the multi-label classifications. Email metrics are provided prior to campaign launch, and the model determines the optimal engagement rate based on several factors, including inputs by the campaign engineer.')
|
92 |
+
|
93 |
+
with st.expander('Model Information', expanded=False):
|
94 |
+
hide_table_row_index = """
|
95 |
+
<style>
|
96 |
+
thead tr th:first-child {display:none}
|
97 |
+
tbody th {display:none}
|
98 |
+
</style>
|
99 |
+
"""
|
100 |
+
st.markdown(hide_table_row_index, unsafe_allow_html=True)
|
101 |
+
st.table(table_data())
|
102 |
+
|
103 |
+
utils.url_button('Model Homepage','https://loxz.com/#/models/SA')
|
104 |
+
# url_button('Full Report','https://resources.loxz.com/reports/realtime-ml-character-count-model')
|
105 |
+
utils.url_button('Amazon Market Place','https://aws.amazon.com/marketplace')
|
106 |
+
|
107 |
+
|
108 |
+
industry_lists = ['Software and Technology', 'Academic and Education',
|
109 |
+
'Entertainment', 'Finance and Banking', 'Hospitality',
|
110 |
+
'Real Estate', 'Retail', 'Energy', 'Healthcare']
|
111 |
+
|
112 |
+
campaign_types = ['Webinar', 'Engagement', 'Product_Announcement', 'Promotional',
|
113 |
+
'Newsletter', 'Abandoned_Cart', 'Review_Request', 'Survey',
|
114 |
+
'Transactional', 'Usage_and_Consumption']
|
115 |
+
|
116 |
+
target_variables = ['Conversion_Rate','Click_To_Open_Rate','Revenue_Per_Email']
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
input_text = st.text_area("Please enter your email text here", height=300)
|
121 |
+
|
122 |
+
|
123 |
+
industry = st.selectbox(
|
124 |
+
'Please select your industry',
|
125 |
+
industry_lists,
|
126 |
+
index=6
|
127 |
+
)
|
128 |
+
|
129 |
+
# Campaign-type selector; the label previously repeated the industry prompt.
campaign = st.selectbox(
    'Please select your campaign type',
    campaign_types,
    index=5
)
|
134 |
+
|
135 |
+
target = st.selectbox(
|
136 |
+
'Please select your target variable',
|
137 |
+
target_variables,
|
138 |
+
index=1
|
139 |
+
)
|
140 |
+
|
141 |
+
|
142 |
+
# Run the full pipeline when the user asks for predictions.
if st.button('Generate Predictions'):
    start_time = time.time()
    # Compare by value, not identity (`is ""` only works by interning
    # accident), and treat whitespace-only input as empty because it yields no
    # sentences for the tone model.
    if not input_text.strip():
        st.error('Please enter a sentence!')
    else:
        placeholder = st.empty()
        placeholder.text('Loading Data')

        # Starting predictions
        bucket='emailcampaignmodeldata'
        # file_key = 'fullEmailBody/fullemailtextbody_labeled_3rates_8tones_20220524.csv'

        # email_data = utils.get_files_from_aws(bucket,file_key)
        tone_key = 'ModelSADataSets/Tone_and_target.csv'
        tone_data = FunctionsModelSA_V1.load_data(bucket,tone_key)
        test_predictions,tones = FunctionsModelSA_V1.convert_text_to_tone(input_text)

        # st.dataframe(test_predictions)
        # st.dataframe(tones)
        campaign_val='campaign_type_'+campaign
        industry_val='industry_'+ industry
        pred,lower,upper,model = FunctionsModelSA_V1.prediction(tones,campaign_val,industry_val,target)
        best_target,best_target_tones = FunctionsModelSA_V1.find_max_cat(tone_data,target,industry_val,campaign_val)

        FunctionsModelSA_V1.plot_CI(pred,lower,upper,streamlit=True)
        # NOTE(review): indentation was lost in extraction. All three calls are
        # assumed to sit inside this guard, since `best_target_tones` is the
        # scalar 0 when no matching row exists -- confirm against the real file.
        if((best_target!=0) and (pred<best_target)):
            recommended_changes=(best_target_tones-tones.loc[0])
            change=best_target-pred
            FunctionsModelSA_V1.recommend(tones,recommended_changes,change,target,streamlit=True)
            FunctionsModelSA_V1.corrections(best_target_tones,test_predictions,streamlit=True)

        placeholder.empty()
|
174 |
+
# print(time.time() - start_time)
|
figures/ModelSA.png
ADDED
main_app.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(layout="wide")
|
4 |
+
|
5 |
+
st.markdown(
|
6 |
+
"""
|
7 |
+
<style>
|
8 |
+
body {
|
9 |
+
background-image: linear-gradient(#2e7bcf,#2e7bcf);
|
10 |
+
color: white;
|
11 |
+
}
|
12 |
+
</style>
|
13 |
+
""",
|
14 |
+
unsafe_allow_html=True,
|
15 |
+
)
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
numpy
|
3 |
+
colour
|
4 |
+
nltk
|
utils.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import s3fs
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from numpy import arange
|
6 |
+
from colour import Color
|
7 |
+
import plotly.graph_objects as go
|
8 |
+
from nltk import tokenize
|
9 |
+
from IPython.display import Markdown
|
10 |
+
from PIL import ImageColor
|
11 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
12 |
+
import nltk
|
13 |
+
nltk.download('punkt')
|
14 |
+
from io import StringIO
|
15 |
+
from scipy import spatial
|
16 |
+
import re
|
17 |
+
import pytorch_lightning as pl
|
18 |
+
from bs4 import BeautifulSoup
|
19 |
+
import ipywidgets as widgets
|
20 |
+
from ipywidgets import FileUpload
|
21 |
+
from urlextract import URLExtract
|
22 |
+
from transformers import BertTokenizerFast as BertTokenizer, BertModel, BertConfig
|
23 |
+
import torch.nn as nn
|
24 |
+
import torch
|
25 |
+
from ipywidgets import interact, Dropdown
|
26 |
+
import boto3
|
27 |
+
# from sagemaker import get_execution_role
|
28 |
+
from scipy import spatial
|
29 |
+
from bokeh.models.widgets import Div
|
30 |
+
import streamlit as st
|
31 |
+
|
32 |
+
|
33 |
+
def get_files_from_aws(bucket, prefix):
    """
    Download a CSV object from an S3 bucket and parse it into a DataFrame.

    bucket (STRING): bucket name
    prefix (STRING): object key (file location) inside the bucket

    Credentials are read from the Streamlit secrets ``aws_id`` and ``aws_key``.
    The object body is decoded as UTF-8 before parsing.
    """
    credentials = {
        'aws_access_key_id': st.secrets["aws_id"],
        'aws_secret_access_key': st.secrets["aws_key"],
    }
    response = boto3.client('s3', **credentials).get_object(Bucket=bucket,Key=prefix)
    csv_text = response['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(csv_text))
|
51 |
+
|
52 |
+
def url_button(button_name, url):
    """Show a button that opens ``url`` in a new tab or window when clicked.

    button_name- label of the Streamlit button
    url- address passed to ``window.open``; inserted into the script unescaped
    """
    if not st.button(button_name):
        return
    # An <img> with an empty src fires onerror at once, which runs the script.
    js = f"window.open('{url}')"  # New tab or window
    html = f'<img src onerror="{js}">'
    st.bokeh_chart(Div(text=html))
|
58 |
+
|
59 |
+
|
60 |
+
|
61 |
+
PARAMS={
|
62 |
+
'BATCH_SIZE': 8,
|
63 |
+
'MAX_TOKEN_COUNT':100,
|
64 |
+
'BERT_MODEL_NAME':'google/bert_uncased_L-2_H-128_A-2' ,
|
65 |
+
'N_EPOCHS': 10,
|
66 |
+
'n_classes':8,
|
67 |
+
'LABEL_COLUMNS': ['label_analytical', 'label_casual', 'label_confident', 'label_friendly',
|
68 |
+
'label_joyful', 'label_opstimistic', 'label_respectful',
|
69 |
+
'label_urgent'],
|
70 |
+
'TEXTCOL': 'text',
|
71 |
+
'rf_labels':['label_analytical', 'label_casual', 'label_confident',
|
72 |
+
'label_friendly', 'label_joyful', 'label_opstimistic',
|
73 |
+
'label_respectful', 'label_urgent',
|
74 |
+
'industry_Academic and Education', 'industry_Energy',
|
75 |
+
'industry_Entertainment', 'industry_Finance and Banking',
|
76 |
+
'industry_Healthcare', 'industry_Hospitality', 'industry_Real Estate',
|
77 |
+
'industry_Retail', 'industry_Software and Technology',
|
78 |
+
'campaign_type_Abandoned_Cart', 'campaign_type_Engagement',
|
79 |
+
'campaign_type_Newsletter', 'campaign_type_Product_Announcement',
|
80 |
+
'campaign_type_Promotional', 'campaign_type_Review_Request',
|
81 |
+
'campaign_type_Survey', 'campaign_type_Transactional',
|
82 |
+
'campaign_type_Usage_and_Consumption', 'campaign_type_Webinar']
|
83 |
+
}
|
84 |
+
|