Spaces:

spacy
/

healthsea-demo

Runtime error

App Files Community

edichief committed on Nov 29, 2021

Commit

03bdce4

•

1 Parent(s): 99e6b78

Add more information and improve performance

Browse files

Files changed (3) hide show

support_functions.py +58 -0
visualize_dataset.py +17 -1
visualize_pipeline.py +14 -1

support_functions.py CHANGED Viewed

@@ -262,6 +262,64 @@ class HealthseaSearch:
         return df
 class HealthseaPipe:

         return df
+    # Get all health aspect indices
+    def get_all_conditions(self):  # Rank every key of self.conditions by its "frequency" entry.
+        condition_list = []
+        for condition_key in self.conditions:
+            condition_list.append((self.conditions[condition_key]["frequency"],condition_key))  # (frequency, key): the sort value sits at index 0
+        condition_list = sorted(condition_list, key=lambda tup: tup[0], reverse=True)  # highest frequency first
+        return condition_list  # list of (frequency, key) tuples
+    def get_all_conditions_df(self):  # Tabulate the top-ranked conditions as a two-column DataFrame.
+        condition_list = self.get_all_conditions()[:1000]  # keep at most the first 1000 ranked entries
+        condition_data = {
+            "Condition": [],
+            "Frequency": []
+        }
+        for condition in condition_list:  # each entry is a (frequency, key) tuple
+            condition_data["Frequency"].append(condition[0])
+            condition_data["Condition"].append(condition[1])
+        datatypes = {  # column dtypes applied via astype below
+            "Frequency": int,
+            "Condition": str
+        }
+        df = pd.DataFrame(data=condition_data)  # NOTE(review): pd is presumably pandas, imported outside this hunk
+        df = df.astype(datatypes)
+        return df  # columns: "Condition", "Frequency"
+    def get_all_benefits(self):  # Rank every key of self.benefits by its "frequency" entry.
+        benefit_list = []
+        for benefit_key in self.benefits:
+            benefit_list.append((self.benefits[benefit_key]["frequency"],benefit_key))  # (frequency, key): the sort value sits at index 0
+        benefit_list = sorted(benefit_list, key=lambda tup: tup[0], reverse=True)  # highest frequency first
+        return benefit_list  # list of (frequency, key) tuples
+    def get_all_benefits_df(self):  # Tabulate the top-ranked benefits as a two-column DataFrame.
+        benefit_list = self.get_all_benefits()[:1000]  # keep at most the first 1000 ranked entries
+        benefit_data = {
+            "Benefit": [],
+            "Frequency": []
+        }
+        for benefit in benefit_list:  # each entry is a (frequency, key) tuple
+            benefit_data["Frequency"].append(benefit[0])
+            benefit_data["Benefit"].append(benefit[1])
+        datatypes = {  # column dtypes applied via astype below
+            "Frequency": int,
+            "Benefit": str
+        }
+        df = pd.DataFrame(data=benefit_data)  # NOTE(review): pd is presumably pandas, imported outside this hunk
+        df = df.astype(datatypes)
+        return df  # columns: "Benefit", "Frequency"
 class HealthseaPipe:

visualize_dataset.py CHANGED Viewed

@@ -10,7 +10,6 @@ def visualize_dataset():
     condition_path = Path("data/condition_vectors.json")
     benefit_path = Path("data/benefit_vectors.json")
     # Load data
     @st.cache(allow_output_mutation=True)
     def load_data(
@@ -53,6 +52,8 @@ def visualize_dataset():
     # KPI
     st.markdown("""---""")
     st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)
@@ -66,6 +67,17 @@ def visualize_dataset():
     st.markdown("""---""")
     # Search
     search = st.text_input(label="Search for an health aspect", value="joint pain")
     n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
@@ -73,6 +85,8 @@ def visualize_dataset():
     st.markdown("""---""")
     st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)
     # DataFrame
     st.write(search_engine.get_products_df(search, n))
@@ -101,6 +115,7 @@ def visualize_dataset():
             current_aspect = search_engine.get_aspect_meta(aspect)
             vectors.append((current_aspect["name"], current_aspect["vector"]))
         st.markdown("\n")
         st.write(search_engine.tsne_plot(vectors))
     else:
@@ -118,6 +133,7 @@ def visualize_dataset():
     # Substances
     st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)
     # DataFrame
     st.write(search_engine.get_substances_df(search, n))

     condition_path = Path("data/condition_vectors.json")
     benefit_path = Path("data/benefit_vectors.json")
     # Load data
     @st.cache(allow_output_mutation=True)
     def load_data(
     # KPI
+    st.markdown("""This app presents the analyzed dataset of up to one million reviews. You can search for the best products and substances to any health aspect based on what reviewers wrote in their reviews.""")
     st.markdown("""---""")
     st.markdown(central_text("🎀 Dataset"), unsafe_allow_html=True)
     st.markdown("""---""")
+    # Expander
+    show_conditions, show_benefits = st.columns(2)
+    with show_conditions.expander("Top 1000 mentioned Conditions"):
+        st.write(search_engine.get_all_conditions_df())
+    with show_benefits.expander("Top 1000 mentioned Benefits"):
+        st.write(search_engine.get_all_benefits_df())
+    st.markdown("""---""")
     # Search
     search = st.text_input(label="Search for an health aspect", value="joint pain")
     n = st.slider("Show top n results", min_value=10, max_value=1000, value=25)
     st.markdown("""---""")
     st.markdown(central_text("🧃 Products"), unsafe_allow_html=True)
+    st.markdown("""The products are scored based on what reviewers say. Additional variables in the scoring function are product rating, helpful count and whether the review is considered 'fake'. """)
     # DataFrame
     st.write(search_engine.get_products_df(search, n))
             current_aspect = search_engine.get_aspect_meta(aspect)
             vectors.append((current_aspect["name"], current_aspect["vector"]))
         st.markdown("\n")
+        st.markdown("""To improve the search, the table also shows results of other health aspects with a high similarity""")
         st.write(search_engine.tsne_plot(vectors))
     else:
     # Substances
     st.markdown(central_text("🍯 Substances"), unsafe_allow_html=True)
+    st.markdown("""The scores of the substances are based on the products""")
     # DataFrame
     st.write(search_engine.get_substances_df(search, n))

visualize_pipeline.py CHANGED Viewed

@@ -64,7 +64,9 @@ def visualize_pipeline():
     # Load model
     try:
         load_state.markdown ("#### Loading model...")
-        nlp = spacy.load("en_healthsea")
     # Download model
     except LookupError:
@@ -75,6 +77,8 @@ def visualize_pipeline():
     load_state.markdown ("#### Loading done!")
     # Pipeline
     st.markdown("""---""")
     st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)
@@ -85,6 +89,8 @@ def visualize_pipeline():
         text = st.text_input(label="Write a review", value="This is great for joint pain!")
     else:
         text = st.selectbox("Predefined example reviews", example_reviews)
     doc = nlp(text)
     # NER
@@ -96,6 +102,8 @@ def visualize_pipeline():
         colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
     )
     st.markdown("""---""")
     # Segmentation, Blinding, Classification
@@ -112,6 +120,9 @@ def visualize_pipeline():
         )
         st.markdown("\n")
     st.markdown("""---""")
     # Aggregation
@@ -127,6 +138,8 @@ def visualize_pipeline():
         )
         st.markdown("\n")
     st.markdown("""---""")
     # Indepth
     st.markdown("## 🔧 Pipeline attributes")

     # Load model
     try:
         load_state.markdown ("#### Loading model...")
+        if "model" not in st.session_state:
+            nlp = spacy.load("en_healthsea")
+            st.session_state["model"] = nlp
     # Download model
     except LookupError:
     load_state.markdown ("#### Loading done!")
     # Pipeline
+    st.markdown("""This app visualizes the processing steps of the Healthsea pipeline. You can test it by writing an example review.""")
     st.markdown("""---""")
     st.markdown(central_text("⚙️ Pipeline"), unsafe_allow_html=True)
         text = st.text_input(label="Write a review", value="This is great for joint pain!")
     else:
         text = st.selectbox("Predefined example reviews", example_reviews)
+    nlp = st.session_state["model"]
     doc = nlp(text)
     # NER
         colors={"CONDITION": "#FF4B76", "BENEFIT": "#629B68"},
     )
+    st.markdown("""The first processing step is to identify Conditions or Benefits with Named Entity Recognition. Conditions are diseases, symptoms and general health problems (e.g. joint pain), while Benefits are positive desired health aspects (e.g. energy)""")
     st.markdown("""---""")
     # Segmentation, Blinding, Classification
         )
         st.markdown("\n")
+    st.markdown("""The review is segmented into sub-clauses and then classified by a Text Classification model. We additionally blind the found entities to improve generalization and also to inform the model about our current target entity of which we want to get the prediction of.
+    The Text Classification predicts four exclusive classes: 'Positive', 'Negative', 'Neutral', 'Anamnesis', they represent the health effect.""")
     st.markdown("""---""")
     # Aggregation
         )
         st.markdown("\n")
+    st.markdown("""Multiple classification are aggregated into one final classification.""")
     st.markdown("""---""")
     # Indepth
     st.markdown("## 🔧 Pipeline attributes")