edichief committed on
Commit
03bdce4
•
1 Parent(s): 99e6b78

Add more information and improve performance

Browse files
support_functions.py CHANGED
@@ -262,6 +262,64 @@ class HealthseaSearch:
262
 
263
  return df
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
  class HealthseaPipe:
267
 
 
262
 
263
  return df
264
 
265
+ # Get all health aspect indices
266
+ def get_all_conditions(self):
267
+ condition_list = []
268
+ for condition_key in self.conditions:
269
+ condition_list.append((self.conditions[condition_key]["frequency"],condition_key))
270
+
271
+ condition_list = sorted(condition_list, key=lambda tup: tup[0], reverse=True)
272
+ return condition_list
273
+
274
+ def get_all_conditions_df(self):
275
+ condition_list = self.get_all_conditions()[:1000]
276
+ condition_data = {
277
+ "Condition": [],
278
+ "Frequency": []
279
+ }
280
+ for condition in condition_list:
281
+ condition_data["Frequency"].append(condition[0])
282
+ condition_data["Condition"].append(condition[1])
283
+
284
+ datatypes = {
285
+ "Frequency": int,
286
+ "Condition": str
287
+ }
288
+
289
+ df = pd.DataFrame(data=condition_data)
290
+ df = df.astype(datatypes)
291
+
292
+ return df
293
+
294
+
295
+ def get_all_benefits(self):
296
+ benefit_list = []
297
+ for benefit_key in self.benefits:
298
+ benefit_list.append((self.benefits[benefit_key]["frequency"],benefit_key))
299
+
300
+ benefit_list = sorted(benefit_list, key=lambda tup: tup[0], reverse=True)
301
+ return benefit_list
302
+
303
+ def get_all_benefits_df(self):
304
+ benefit_list = self.get_all_benefits()[:1000]
305
+ benefit_data = {
306
+ "Benefit": [],
307
+ "Frequency": []
308
+ }
309
+ for benefit in benefit_list:
310
+ benefit_data["Frequency"].append(benefit[0])
311
+ benefit_data["Benefit"].append(benefit[1])
312
+
313
+ datatypes = {
314
+ "Frequency": int,
315
+ "Benefit": str
316
+ }
317
+
318
+ df = pd.DataFrame(data=benefit_data)
319
+ df = df.astype(datatypes)
320
+
321
+ return df
322
+
323
 
324
  class HealthseaPipe:
325
 
visualize_dataset.py CHANGED
@@ -10,7 +10,6 @@ def visualize_dataset():
10
  condition_path = Path("data/condition_vectors.json")
11
  benefit_path = Path("data/benefit_vectors.json")
12
 
13
-
14
  # Load data
15
  @st.cache(allow_output_mutation=True)
16
  def load_data(
@@ -53,6 +52,8 @@ def visualize_dataset():
53
 
54
  # KPI
55
 
 
 
56
  st.markdown("""---""")
57
 
58
  st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)
@@ -66,6 +67,17 @@ def visualize_dataset():
66
 
67
  st.markdown("""---""")
68
 
 
 
 
 
 
 
 
 
 
 
 
69
  # Search
70
  search = st.text_input(label="Search for an health aspect", value="joint pain")
71
  n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
@@ -73,6 +85,8 @@ def visualize_dataset():
73
  st.markdown("""---""")
74
  st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)
75
 
 
 
76
  # DataFrame
77
  st.write(search_engine.get_products_df(search, n))
78
 
@@ -101,6 +115,7 @@ def visualize_dataset():
101
  current_aspect = search_engine.get_aspect_meta(aspect)
102
  vectors.append((current_aspect["name"], current_aspect["vector"]))
103
  st.markdown("\n")
 
104
  st.write(search_engine.tsne_plot(vectors))
105
 
106
  else:
@@ -118,6 +133,7 @@ def visualize_dataset():
118
 
119
  # Substances
120
  st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)
 
121
 
122
  # DataFrame
123
  st.write(search_engine.get_substances_df(search, n))
 
10
  condition_path = Path("data/condition_vectors.json")
11
  benefit_path = Path("data/benefit_vectors.json")
12
 
 
13
  # Load data
14
  @st.cache(allow_output_mutation=True)
15
  def load_data(
 
52
 
53
  # KPI
54
 
55
+ st.markdown("""This app presents the analyzed dataset of up to one million reviews. You can search for the best products and substances to any health aspect based on what reviewers wrote in their reviews.""")
56
+
57
  st.markdown("""---""")
58
 
59
  st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)
 
67
 
68
  st.markdown("""---""")
69
 
70
+ # Expander
71
+ show_conditions, show_benefits = st.columns(2)
72
+
73
+ with show_conditions.expander("Top 1000 mentioned Conditions"):
74
+ st.write(search_engine.get_all_conditions_df())
75
+
76
+ with show_benefits.expander("Top 1000 mentioned Benefits"):
77
+ st.write(search_engine.get_all_benefits_df())
78
+
79
+ st.markdown("""---""")
80
+
81
  # Search
82
  search = st.text_input(label="Search for an health aspect", value="joint pain")
83
  n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
 
85
  st.markdown("""---""")
86
  st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)
87
 
88
+ st.markdown("""The products are scored based on what reviewers say. Additional variables in the scoring function are product rating, helpful count and whether the review is considered 'fake'. """)
89
+
90
  # DataFrame
91
  st.write(search_engine.get_products_df(search, n))
92
 
 
115
  current_aspect = search_engine.get_aspect_meta(aspect)
116
  vectors.append((current_aspect["name"], current_aspect["vector"]))
117
  st.markdown("\n")
118
+ st.markdown("""To improve the search, the table also shows results of other health aspects with a high similarity""")
119
  st.write(search_engine.tsne_plot(vectors))
120
 
121
  else:
 
133
 
134
  # Substances
135
  st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)
136
+ st.markdown("""The scores of the substances are based on the products""")
137
 
138
  # DataFrame
139
  st.write(search_engine.get_substances_df(search, n))
visualize_pipeline.py CHANGED
@@ -64,7 +64,9 @@ def visualize_pipeline():
64
  # Load model
65
  try:
66
  load_state.markdown ("#### Loading model...")
67
- nlp = spacy.load("en_healthsea")
 
 
68
 
69
  # Download model
70
  except LookupError:
@@ -75,6 +77,8 @@ def visualize_pipeline():
75
  load_state.markdown ("#### Loading done!")
76
 
77
  # Pipeline
 
 
78
  st.markdown("""---""")
79
 
80
  st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)
@@ -85,6 +89,8 @@ def visualize_pipeline():
85
  text = st.text_input(label="Write a review", value="This is great for joint pain!")
86
  else:
87
  text = st.selectbox("Predefined example reviews", example_reviews)
 
 
88
  doc = nlp(text)
89
 
90
  # NER
@@ -96,6 +102,8 @@ def visualize_pipeline():
96
  colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
97
  )
98
 
 
 
99
  st.markdown("""---""")
100
 
101
  # Segmentation, Blinding, Classification
@@ -112,6 +120,9 @@ def visualize_pipeline():
112
  )
113
  st.markdown("\n")
114
 
 
 
 
115
  st.markdown("""---""")
116
 
117
  # Aggregation
@@ -127,6 +138,8 @@ def visualize_pipeline():
127
  )
128
  st.markdown("\n")
129
 
 
 
130
  st.markdown("""---""")
131
  # Indepth
132
  st.markdown("## 🔧 Pipeline attributes")
 
64
  # Load model
65
  try:
66
  load_state.markdown ("#### Loading model...")
67
+ if "model" not in st.session_state:
68
+ nlp = spacy.load("en_healthsea")
69
+ st.session_state["model"] = nlp
70
 
71
  # Download model
72
  except LookupError:
 
77
  load_state.markdown ("#### Loading done!")
78
 
79
  # Pipeline
80
+ st.markdown("""This app visualizes the processing steps of the Healthsea pipeline. You can test it by writing an example review.""")
81
+
82
  st.markdown("""---""")
83
 
84
  st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)
 
89
  text = st.text_input(label="Write a review", value="This is great for joint pain!")
90
  else:
91
  text = st.selectbox("Predefined example reviews", example_reviews)
92
+
93
+ nlp = st.session_state["model"]
94
  doc = nlp(text)
95
 
96
  # NER
 
102
  colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
103
  )
104
 
105
+ st.markdown("""The first processing step is to identify Conditions or Benefits with Named Entity Recognition. Conditions are diseases, symptoms and general health problems (e.g. joint pain), while Benefits are positive desired health aspects (e.g. energy)""")
106
+
107
  st.markdown("""---""")
108
 
109
  # Segmentation, Blinding, Classification
 
120
  )
121
  st.markdown("\n")
122
 
123
+ st.markdown("""The review is segmented into sub-clauses and then classified by a Text Classification model. We additionally blind the found entities to improve generalization and also to inform the model about our current target entity of which we want to get the prediction of.
124
+ The Text Classification predicts four exclusive classes: 'Positive', 'Negative', 'Neutral', 'Anamnesis', they represent the health effect.""")
125
+
126
  st.markdown("""---""")
127
 
128
  # Aggregation
 
138
  )
139
  st.markdown("\n")
140
 
141
+ st.markdown("""Multiple classification are aggregated into one final classification.""")
142
+
143
  st.markdown("""---""")
144
  # Indepth
145
  st.markdown("## 🔧 Pipeline attributes")