edichief committed on
Commit
69abbc0
·
1 Parent(s): 1550afa
.gitattributes CHANGED
@@ -1,27 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bin.* filter=lfs diff=lfs merge=lfs -text
5
- *.bz2 filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.model filter=lfs diff=lfs merge=lfs -text
12
- *.msgpack filter=lfs diff=lfs merge=lfs -text
13
- *.onnx filter=lfs diff=lfs merge=lfs -text
14
- *.ot filter=lfs diff=lfs merge=lfs -text
15
- *.parquet filter=lfs diff=lfs merge=lfs -text
16
- *.pb filter=lfs diff=lfs merge=lfs -text
17
- *.pt filter=lfs diff=lfs merge=lfs -text
18
- *.pth filter=lfs diff=lfs merge=lfs -text
19
- *.rar filter=lfs diff=lfs merge=lfs -text
20
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
- *.tar.* filter=lfs diff=lfs merge=lfs -text
22
- *.tflite filter=lfs diff=lfs merge=lfs -text
23
- *.tgz filter=lfs diff=lfs merge=lfs -text
24
- *.xz filter=lfs diff=lfs merge=lfs -text
25
- *.zip filter=lfs diff=lfs merge=lfs -text
26
- *.zstandard filter=lfs diff=lfs merge=lfs -text
27
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ data/*.json filter=lfs diff=lfs merge=lfs -text
2
+ data/img/*.gif filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
- title: Healthsea Demo
3
- emoji: 👀
4
- colorFrom: green
5
- colorTo: gray
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
 
1
  ---
2
+ title: Healthsea
3
+ emoji: 🪐
4
+ colorFrom: yellow
5
+ colorTo: pink
6
  sdk: streamlit
7
  app_file: app.py
8
  pinned: false
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from visualize_dataset import visualize_dataset
3
+ from visualize_pipeline import visualize_pipeline
4
+
5
# Header
# Inject the app's custom stylesheet as a raw <style> tag.
with open("style.css", encoding="utf-8") as f:
    st.markdown("<style>" + f.read() + "</style>", unsafe_allow_html=True)

st.title("Welcome to Healthsea 🪐")

# Two-column intro: descriptive text on the left, animation on the right.
intro, jellyfish = st.columns(2)
jellyfish.markdown("\n")

intro.subheader("Create easier access to health✨")

jellyfish.image("data/img/Jellymation.gif")
intro.markdown(
    "Healthsea is a spaCy v3 pipeline that analyzes user reviews to supplement products by extracting their effects on health."
)
intro.markdown(
    """With this app, you're able to explore the results of healthsea on up to 1 million reviews.
You can search for any health aspect, whether it is a disease (e.g. joint pain) or a desired health effect (e.g. energy), and
the app returns a list of the best products and substances. You can also explore the capabilities of the pipeline itself by writing custom reviews and
seeing every processing step of the pipeline.
"""
)
# TODO: the blog post link target below is still empty.
intro.markdown(
    """If you want to learn more about healthsea, you can read more in our [blog post]().
"""
)

st.markdown("""---""")

# Page switch: each option renders one of the two sub-apps below the header.
app_type = st.selectbox("Choose app", ["Visualize dataset", "Visualize pipeline"])

if app_type == "Visualize dataset":
    visualize_dataset()
else:
    visualize_pipeline()
data/benefit_vectors.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c77f19346af726d403cb571589e9d5802385c665dfb358a86591ebdd5c43e084
3
+ size 53173260
data/condition_vectors.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d8700f555d2fb6c643bead407f97ee14ebaa8e1d491a16af92026c719a3d91b
3
+ size 192093565
data/health_aspects.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09840d8b5e503a8f62bd4bcc6455348453f111321cc108be1f115a550a34757a
3
+ size 23936080
data/img/Jellymation.gif ADDED

Git LFS Details

  • SHA256: c796dd42c6b93dbf75ca3045f44ad9471db737f1452fbcdd488c7b531aae79b1
  • Pointer size: 133 Bytes
  • Size of remote file: 25.4 MB
data/products.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19606c9ad43abb4e9b7b679e9229b2c2101b5a748de4b5ba2c3baec4fde2f73f
3
+ size 56608006
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.2.0
2
+ plotly>=5.4.0
3
+ scikit-learn>=1.0.1
4
+ spacy-streamlit>=1.0.2
5
+ spacy>=3.1.4
6
+ benepar>=0.2.0
7
+
8
+ https://huggingface.co/edichief/en_healthsea/resolve/main/en_healthsea-any-py3-none-any.whl
style.css ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* KPI tile: centered text inside a thin rounded border with a flat drop shadow.
   Used by the kpi() HTML helpers (class='kpi'). */
.kpi{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #3b3b4d;
    box-shadow: 0px 5px #3b3b4d;
}

/* Slightly enlarge a KPI tile while hovered. */
.kpi:hover {
    transform: scale(1.1);
}

/* Centered section heading, used by the central_text() HTML helpers.
   NOTE(review): `top` only affects positioned elements and no `position` is
   set for this class in this stylesheet; confirm whether it is needed. */
.central_text{
    text-align: center;
    top: 50%;
}

/* Clause / effect card. The green colors below are defaults: the
   format_clause() and format_effect() helpers set background-color,
   box-shadow and border-color again through an inline style attribute. */
.clause{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #1B7735;
    box-shadow: 0px 5px #1B7735;
    color: white;
    margin-left: 10%;
    margin-right: 10%;
    padding-top: 2%;
    padding-bottom: 2%;
    background-color: #3C9E58;
    /* Higher z-index than .clause_meta (3), so the card stacks above it. */
    z-index: 5;
    display: block;
    position: relative;
}

/* Slightly enlarge a clause card while hovered. */
.clause:hover {
    transform: scale(1.1);
}

/* Text inside a clause card. */
.clause_text{
    font-weight: bold;
}

/* Meta box rendered directly below a clause card by format_clause();
   same horizontal margins and padding as .clause, but without a fill color. */
.clause_meta{
    text-align: center;
    border-style: solid;
    border-width: 1px;
    border-radius: 5px;
    border-color: #0c0c0e;
    margin-left: 10%;
    margin-right: 10%;
    padding-top: 2%;
    padding-bottom: 2%;
    z-index: 3;
    display: block;
    position: relative;
}
support_functions.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import difflib
3
+ from spacy.tokens import Doc
4
+
5
+ import plotly
6
+ import plotly.graph_objs as go
7
+ from sklearn.manifold import TSNE
8
+ import numpy as np
9
+
10
+
11
class HealthseaSearch:
    """Query helper over the health-aspect, product, condition and benefit tables.

    The four mappings are stored as given and only read by this class:

    * ``health_aspects``: aspect key -> entry. The methods below read the
      ``"products"``, ``"substance"`` and ``"alias"`` fields of an entry.
      Items of ``"products"`` / ``"substance"`` are indexed as
      ``item[0]`` (score used for ranking) and ``item[1]`` (product id or
      substance name); ``"alias"`` holds further keys of ``health_aspects``.
    * ``products``: product id -> product meta; ``"name"``, ``"rating"``
      and ``"review_count"`` are read by :meth:`get_products_df`.
    * ``conditions`` / ``benefits``: aspect key -> meta entry, returned by
      :meth:`get_aspect_meta`.

    Queries are normalized by replacing spaces with underscores. When the
    normalized query is not an exact key, the closest key according to
    ``difflib.get_close_matches`` is used instead.
    """

    # Substances that get_substances() never reports.
    _EXCLUDED_SUBSTANCES = ("sodium", "sugar", "sugar_alcohol")

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    @staticmethod
    def _closest_key(key, candidates):
        """Return the candidate most similar to ``key``.

        Raises:
            KeyError: if no candidate is similar enough (difflib's default
                cutoff), instead of failing with an opaque ``IndexError``.
        """
        matches = difflib.get_close_matches(key, list(candidates), n=1)
        if not matches:
            raise KeyError(f"No health aspect similar to {key!r} was found")
        return matches[0]

    def _lookup_aspect(self, _aspect):
        """Resolve a free-text query to ``(key, entry)`` in ``health_aspects``."""
        key = _aspect.replace(" ", "_")
        if key not in self.health_aspects:
            # Fuzzy fallback on the actual query (not on a fixed string).
            key = self._closest_key(key, self.health_aspects)
        return key, self.health_aspects[key]

    def _ranked(self, _aspect, n, field, resolve, exclude=()):
        """Collect the best ``field`` items of an aspect and of its aliases.

        For the resolved aspect and then each of its aliases, the first ``n``
        items of ``entry[field]`` are considered (all items when ``n == 0``).
        Items whose id is in ``exclude`` or was already collected are skipped.
        The merged list is sorted by score (descending) and only then cut to
        ``n`` entries, so a high-scoring alias item is never dropped in favour
        of a lower-scoring item of the main aspect.

        Returns:
            list of ``(score, resolve(item_id), source_aspect_key)`` tuples.
        """
        key, aspect = self._lookup_aspect(_aspect)
        seen = set()
        ranked = []

        for source in [key, *aspect["alias"]]:
            scoring = self.health_aspects[source][field]
            if n != 0:
                scoring = scoring[:n]
            for item in scoring:
                item_id = item[1]
                if item_id in exclude or item_id in seen:
                    continue
                seen.add(item_id)
                ranked.append((item[0], resolve(item_id), source))

        # sort() is stable, so equally scored items keep their collection order.
        ranked.sort(key=lambda tup: tup[0], reverse=True)
        if n != 0 and len(ranked) > n:
            ranked = ranked[:n]
        return ranked

    # Load product meta
    def get_products(self, _aspect, n):
        """Return the top ``n`` products for an aspect and its aliases.

        Args:
            _aspect: free-text health aspect (spaces or underscores).
            n: maximum number of results; ``0`` means no limit.

        Returns:
            list of ``(score, product_meta, aspect_key)`` tuples sorted by
            score, highest first. ``product_meta`` is the entry of
            ``self.products`` for the product id.

        Raises:
            KeyError: if the aspect cannot be resolved, or a product id is
                missing from ``self.products``.
        """
        return self._ranked(
            _aspect, n, "products", lambda product_id: self.products[product_id]
        )

    # Load product meta and return as DataFrame
    def get_products_df(self, _aspect, n):
        """Return :meth:`get_products` as a DataFrame.

        Columns: ``product`` (str), ``score`` (int), ``health_aspect`` (str),
        ``rating`` (str) and ``reviews`` (int).
        """
        product_list = self.get_products(_aspect, n)
        product_data = {
            "product": [meta["name"] for _score, meta, _key in product_list],
            "score": [score for score, _meta, _key in product_list],
            "health_aspect": [key for _score, _meta, key in product_list],
            "rating": [meta["rating"] for _score, meta, _key in product_list],
            "reviews": [meta["review_count"] for _score, meta, _key in product_list],
        }
        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=product_data).astype(datatypes)

    # Get health aspect
    def get_aspect(self, _aspect):
        """Return the ``health_aspects`` entry for a free-text aspect.

        Raises:
            KeyError: if neither an exact nor a close key exists.
        """
        return self._lookup_aspect(_aspect)[1]

    # Get health aspect meta
    def get_aspect_meta(self, _aspect):
        """Return the condition or benefit meta entry for a free-text aspect.

        Exact keys are looked up in ``conditions`` first, then ``benefits``.
        Otherwise the closest key across both tables is used, with
        ``conditions`` taking precedence when the key exists in both.

        Raises:
            KeyError: if neither an exact nor a close key exists.
        """
        key = _aspect.replace(" ", "_")
        for table in (self.conditions, self.benefits):
            if key in table:
                return table[key]

        closest = self._closest_key(key, [*self.conditions, *self.benefits])
        if closest in self.conditions:
            return self.conditions[closest]
        return self.benefits[closest]

    # Plotting vectors (2D/3D)
    def tsne_plot(self, dataset):
        """Create a t-SNE model of the given vectors and plot it.

        Args:
            dataset: sized iterable of ``(label, vector)`` pairs.

        Returns:
            A plotly ``Figure``: a 3D scatter when ``dataset`` has more than
            two entries, a 2D scatter otherwise.
        """
        labels = [entry[0] for entry in dataset]
        tokens = [np.array(entry[1]) for entry in dataset]

        n_components = 3 if len(dataset) > 2 else 2
        # scikit-learn documents that perplexity must be smaller than the
        # number of samples, so the usual value of 40 is clamped for the
        # small alias groups this is called with.
        perplexity = min(40, max(1, len(tokens) - 1))

        # NOTE(review): newer scikit-learn releases appear to rename `n_iter`
        # to `max_iter`; confirm against the installed version.
        tsne_model = TSNE(
            perplexity=perplexity,
            n_components=n_components,
            init="pca",
            n_iter=2500,
            random_state=23,
        )
        # One list of coordinates per output dimension.
        axes = tsne_model.fit_transform(tokens).T.tolist()

        trace_options = {
            "text": labels,
            "textposition": "top right",
            "mode": "lines+markers+text",
            "marker": {
                "size": 10,
                "opacity": 0.8,
            },
        }
        if n_components == 3:
            trace = go.Scatter3d(x=axes[0], y=axes[1], z=axes[2], **trace_options)
        else:
            trace = go.Scatter(x=axes[0], y=axes[1], **trace_options)

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )

        return go.Figure(data=[trace], layout=layout)

    # Load substance meta
    def get_substances(self, _aspect, n):
        """Return the top ``n`` substances for an aspect and its aliases.

        Substances listed in ``_EXCLUDED_SUBSTANCES`` are never returned.

        Args:
            _aspect: free-text health aspect (spaces or underscores).
            n: maximum number of results; ``0`` means no limit.

        Returns:
            list of ``(score, substance, aspect_key)`` tuples sorted by score,
            highest first.

        Raises:
            KeyError: if the aspect cannot be resolved.
        """
        return self._ranked(
            _aspect,
            n,
            "substance",
            lambda substance: substance,
            exclude=self._EXCLUDED_SUBSTANCES,
        )

    # Load substance meta and return as DataFrame
    def get_substances_df(self, _aspect, n):
        """Return :meth:`get_substances` as a DataFrame.

        Columns: ``substance`` (str), ``score`` (int), ``health_aspect`` (str).
        """
        substance_list = self.get_substances(_aspect, n)
        substance_data = {
            "substance": [name for _score, name, _key in substance_list],
            "score": [score for score, _name, _key in substance_list],
            "health_aspect": [key for _score, _name, key in substance_list],
        }
        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=substance_data).astype(datatypes)
264
+
265
+
266
class HealthseaPipe:
    """Helpers for presenting the output of the Healthsea pipeline."""

    # Get Clauses and their predictions
    def get_clauses(self, doc):
        """Rebuild every clause of ``doc`` as a standalone ``Doc``.

        Each entry of ``doc._.clauses`` is cut out of ``doc`` via its
        ``"split_indices"``. For a clause with ``"has_ent"`` set, the token at
        ``ent_indices[0]`` is replaced by the clause's ``"blinder"`` text
        (with ``<`` and ``>`` removed, followed by a space) and the remaining
        tokens inside ``[ent_indices[0], ent_indices[1])`` are dropped. All
        other tokens are copied with their original trailing whitespace.

        Returns:
            list of ``Doc`` objects, one per clause, in clause order.
        """
        angle_brackets = str.maketrans("", "", "<>")
        rebuilt = []

        for clause in doc._.clauses:
            bounds = clause["split_indices"]
            span = doc[bounds[0] : bounds[1]]

            if not clause["has_ent"]:
                # No entity to blind: copy the clause verbatim.
                words = [token.text for token in span]
                spaces = [token.whitespace_ for token in span]
            else:
                words, spaces = [], []
                for token in span:
                    ent = clause["ent_indices"]
                    if token.i == ent[0]:
                        # First entity token: emit the blinder placeholder.
                        words.append(clause["blinder"].translate(angle_brackets))
                        spaces.append(True)
                    elif not (ent[0] <= token.i < ent[1]):
                        # Token outside the entity span: keep as is.
                        words.append(token.text)
                        spaces.append(token.whitespace_)

            rebuilt.append(Doc(doc.vocab, words=words, spaces=spaces))

        return rebuilt
visualize_dataset.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ import json
4
+ from support_functions import HealthseaSearch
5
+
6
def visualize_dataset():
    """Render the dataset explorer page of the Healthsea Streamlit app.

    Loads the four JSON files under ``data/``, shows dataset-wide KPIs and
    then, for the search term entered by the user, the top-``n`` products,
    per-aspect KPIs, a t-SNE plot of the aspect together with its aliases
    (only when the aspect has aliases) and the top-``n`` substances.
    """
    # Configuration
    health_aspect_path = Path("data/health_aspects.json")
    product_path = Path("data/products.json")
    condition_path = Path("data/condition_vectors.json")
    benefit_path = Path("data/benefit_vectors.json")

    # Number shown on the "Reviews" KPI tile (the review data itself is not
    # among the files loaded here).
    review_count = 933_240

    # Load data
    @st.cache(allow_output_mutation=True)
    def load_data(
        _health_aspect_path: Path,
        _product_path: Path,
        _condition_path: Path,
        _benefit_path: Path,
    ):
        """Read the four JSON files and return them as a 4-tuple."""

        def read_json(path):
            # Explicit encoding so the result does not depend on the locale.
            with open(path, encoding="utf-8") as reader:
                return json.load(reader)

        return (
            read_json(_health_aspect_path),
            read_json(_product_path),
            read_json(_condition_path),
            read_json(_benefit_path),
        )

    # Functions
    def kpi(n, text):
        """Return the HTML of a KPI tile showing ``n`` above ``text``."""
        html = f"""
        <div class='kpi'>
            <h1 class='kpi_header'>{n}</h1>
            <span>{text}</span>
        </div>
        """
        return html

    def central_text(text):
        """Return the HTML of a centered section heading."""
        html = f"""<h2 class='central_text'>{text}</h2>"""
        return html

    # Loading data
    health_aspects, products, conditions, benefits = load_data(
        health_aspect_path, product_path, condition_path, benefit_path
    )
    search_engine = HealthseaSearch(health_aspects, products, conditions, benefits)

    # KPI

    st.markdown("""---""")

    st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)

    kpi_products, kpi_reviews, kpi_condition, kpi_benefit = st.columns(4)

    kpi_products.markdown(kpi(len(products), "Products"), unsafe_allow_html=True)
    # Format as an integer with thousands separators; the former float
    # literal 933.240 was rendered as "933.24".
    kpi_reviews.markdown(kpi(f"{review_count:,}", "Reviews"), unsafe_allow_html=True)
    kpi_condition.markdown(kpi(len(conditions), "Conditions"), unsafe_allow_html=True)
    kpi_benefit.markdown(kpi(len(benefits), "Benefits"), unsafe_allow_html=True)

    st.markdown("""---""")

    # Search
    search = st.text_input(label="Search for a health aspect", value="joint pain")
    n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)

    st.markdown("""---""")
    st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)

    # DataFrame
    st.write(search_engine.get_products_df(search, n))

    # KPI & Alias
    # Resolve the query once and reuse the entries for every tile below.
    aspect = search_engine.get_aspect(search)
    aspect_meta = search_engine.get_aspect_meta(search)
    aspect_alias = aspect["alias"]
    has_alias = len(aspect_alias) > 0

    # A third tile is only shown when the aspect has aliases.
    kpi_columns = st.columns(3 if has_alias else 2)
    kpi_columns[0].markdown(
        kpi(aspect_meta["frequency"], "Mentions"),
        unsafe_allow_html=True,
    )
    kpi_columns[1].markdown(
        kpi(len(aspect["products"]), "Products"),
        unsafe_allow_html=True,
    )

    if has_alias:
        kpi_columns[2].markdown(
            kpi(len(aspect_alias), "Similar health aspects"),
            unsafe_allow_html=True,
        )

        # Plot the searched aspect together with all of its aliases.
        vectors = [(aspect_meta["name"], aspect_meta["vector"])]
        for alias in aspect_alias:
            alias_meta = search_engine.get_aspect_meta(alias)
            vectors.append((alias_meta["name"], alias_meta["vector"]))
        st.markdown("\n")
        st.write(search_engine.tsne_plot(vectors))

    st.markdown("""---""")

    # Substances
    st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)

    # DataFrame
    st.write(search_engine.get_substances_df(search, n))
    # The first column is left empty; the tile is placed in the second one.
    _, kpi_substances = st.columns(2)
    kpi_substances.markdown(
        kpi(len(aspect["substance"]), "Substances"),
        unsafe_allow_html=True,
    )
visualize_pipeline.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import spacy
3
+ from spacy_streamlit import visualize_ner
4
+ from support_functions import HealthseaPipe
5
+ import operator
6
+
7
def visualize_pipeline():
    """Render the pipeline explorer page of the Healthsea Streamlit app.

    Runs the ``en_healthsea`` spaCy pipeline on a review (free text or one of
    the predefined examples) and shows the named entities, the blinded and
    classified clauses, the aggregated health effects and the raw
    ``doc._.clauses`` / ``doc._.health_effects`` attributes.
    """
    from html import escape

    healthsea_pipe = HealthseaPipe()

    # Classification label -> (fill color, border/shadow color).
    color_code = {
        "POSITIVE": ("#3C9E58", "#1B7735"),
        "NEGATIVE": ("#FF166A", "#C0094B"),
        "NEUTRAL": ("#7E7E7E", "#4E4747"),
        "ANAMNESIS": ("#E49A55", "#AD6B2D"),
    }

    example_reviews = [
        "This is great for joint pain.",
        "This help joint pain but causes rashes",
        "I'm diagnosed with gastritis. This product helped!",
        "Made my insomnia worse",
        "Didn't help my energy levels",
    ]

    # Functions
    def central_text(text):
        """Return the HTML of a centered section heading."""
        markup = f"""<h2 class='central_text'>{text}</h2>"""
        return markup

    def format_clause(text, meta, pred):
        """Return the HTML of a clause card colored by ``pred`` with a meta box.

        ``text`` and ``meta`` derive from the user's review, so both are
        HTML-escaped before being placed into the markup.
        """
        markup = f"""
        <div>
            <div class="clause" style="background-color:{color_code[pred][0]} ; box-shadow: 0px 5px {color_code[pred][1]}; border-color:{color_code[pred][1]};">
                <div class="clause_text">{escape(text)}</div>
            </div>
            <div class="clause_meta">
                <div>{escape(meta)}</div>
            </div>
        </div>"""
        return markup

    def format_effect(text, pred):
        """Return the HTML of an effect card colored by ``pred``.

        ``text`` contains entity text from the user's review and is
        HTML-escaped before being placed into the markup.
        """
        markup = f"""
        <div>
            <div class="clause" style="background-color:{color_code[pred][0]} ; box-shadow: 0px 5px {color_code[pred][1]}; border-color:{color_code[pred][1]};">
                <div class="clause_text">{escape(text)}</div>
            </div>
        </div>"""
        return markup

    # Load model
    # Streamlit reruns this function on every widget interaction; cache the
    # loaded pipeline (same mechanism as load_data in visualize_dataset) so it
    # is not reloaded each time the review text changes.
    @st.cache(allow_output_mutation=True)
    def load_pipeline(name):
        """Load and return the spaCy pipeline called ``name``."""
        return spacy.load(name)

    nlp = load_pipeline("en_healthsea")

    # Pipeline
    st.markdown("""---""")

    st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)

    check = st.checkbox("Use predefined examples")

    if not check:
        text = st.text_input(label="Write a review", value="This is great for joint pain!")
    else:
        text = st.selectbox("Predefined example reviews", example_reviews)
    doc = nlp(text)

    # NER
    visualize_ner(
        doc,
        labels=nlp.get_pipe("ner").labels,
        show_table=False,
        title="✨ Named Entity Recognition",
        colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
    )

    st.markdown("""---""")

    # Segmentation, Blinding, Classification
    st.markdown("## 🔮 Segmentation, Blinding, Classification")

    clauses = healthsea_pipe.get_clauses(doc)
    for doc_clause, clause in zip(clauses, doc._.clauses):
        # The category with the highest score is the clause's classification.
        classification = max(clause["cats"].items(), key=operator.itemgetter(1))[0]
        percentage = round(float(clause["cats"][classification]) * 100, 2)
        meta = f"{clause['ent_name']} ({classification} {percentage}%)"

        st.markdown(
            format_clause(doc_clause.text, meta, classification), unsafe_allow_html=True
        )
        st.markdown("\n")

    st.markdown("""---""")

    # Aggregation
    st.markdown("## 🔗 Aggregation")

    for effect, details in doc._.health_effects.items():
        st.markdown(
            format_effect(
                f"{details['effect']} effect on {effect}",
                details["effect"],
            ),
            unsafe_allow_html=True,
        )
        st.markdown("\n")

    st.markdown("""---""")
    # Indepth
    st.markdown("## 🔧 Pipeline attributes")
    clauses_col, effect_col = st.columns(2)

    clauses_col.markdown("### doc._.clauses")
    for clause in doc._.clauses:
        clauses_col.json(clause)
    effect_col.markdown("### doc._.health_effects")
    effect_col.json(doc._.health_effects)