healthsea-demo / support_functions.py
edichief's picture
Init
69abbc0
raw
history blame
9.74 kB
import pandas as pd
import difflib
from spacy.tokens import Doc
import plotly
import plotly.graph_objs as go
from sklearn.manifold import TSNE
import numpy as np
class HealthseaSearch:
    """Look up products, substances and metadata for a health aspect.

    The constructor stores four mappings unchanged. From the way this
    class indexes them:

    * ``health_aspects[key]`` is a mapping with ``"products"`` and
      ``"substance"`` entries (sliceable sequences whose items carry a
      score at index 0 and an identifier at index 1) and an ``"alias"``
      entry (an iterable of other ``health_aspects`` keys).
    * ``products[product_id]`` is a mapping with ``"name"``, ``"rating"``
      and ``"review_count"`` entries.
    * ``conditions`` and ``benefits`` map an aspect key to its metadata.

    Queries are normalised by replacing spaces with underscores. When the
    normalised query is not an exact key, the closest key according to
    ``difflib.get_close_matches`` is used instead.
    """

    # Substance identifiers that are never reported by get_substances().
    _EXCLUDED_SUBSTANCES = frozenset({"sodium", "sugar", "sugar_alcohol"})

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        """Store the lookup tables; no copies are made."""
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    @staticmethod
    def _closest_key(key, candidates, what):
        """Return the candidate most similar to ``key``.

        Raises:
            IndexError: if no candidate is similar enough. The exception
                type matches what the former ``matches[0]`` lookup raised,
                so existing ``except IndexError`` handlers keep working.
        """
        matches = difflib.get_close_matches(key, candidates, n=1)
        if not matches:
            raise IndexError(f"no {what} matches {key!r}")
        return matches[0]

    def _resolve_aspect_key(self, _aspect):
        """Map a free-text aspect name onto a key of ``self.health_aspects``."""
        key = _aspect.replace(" ", "_")
        if key in self.health_aspects:
            return key
        # Fuzzy-match the *query*; the previous code matched the literal
        # string "_aspect" and therefore ignored what the caller asked for.
        return self._closest_key(key, list(self.health_aspects.keys()), "health aspect")

    def _collect_scored(self, _aspect, n, field, exclude=frozenset()):
        """Gather ``(score, identifier, source_aspect)`` tuples for ``field``.

        Entries come from the resolved aspect first and then from each of
        its aliases, in order. An identifier is reported once, attributed
        to the first source that lists it. Identifiers in ``exclude`` are
        skipped.

        ``n == 0`` means "no limit". Otherwise only the first ``n`` raw
        entries of every source are considered and the merged list is cut
        to ``n`` entries before it is sorted by score, highest first.
        """
        primary = self._resolve_aspect_key(_aspect)
        sources = [primary, *self.health_aspects[primary]["alias"]]

        seen = set()
        collected = []
        for source in sources:
            scoring = self.health_aspects[source][field]
            if n != 0:
                # Slicing already copes with n > len(scoring).
                scoring = scoring[:n]
            for entry in scoring:
                identifier = entry[1]
                if identifier in exclude or identifier in seen:
                    continue
                seen.add(identifier)
                collected.append((entry[0], identifier, source))

        # NOTE(review): truncation happens before sorting, so entries of the
        # primary aspect take precedence over higher-scoring alias entries.
        # Kept as in the original; confirm this is the intended ranking.
        if n != 0 and len(collected) > n:
            collected = collected[:n]
        collected.sort(key=lambda entry: entry[0], reverse=True)
        return collected

    # Load product meta
    def get_products(self, _aspect, n):
        """Return products for an aspect and its aliases.

        Args:
            _aspect: Aspect name; spaces are treated as underscores and an
                unknown name falls back to the closest known aspect.
            n: Maximum number of results, or 0 for no limit.

        Returns:
            A list of ``(score, product_meta, aspect_key)`` tuples sorted by
            score in descending order, where ``product_meta`` is
            ``self.products[product_id]``.

        Raises:
            IndexError: if the name matches no known aspect.
        """
        return [
            (score, self.products[product_id], source)
            for score, product_id, source in self._collect_scored(_aspect, n, "products")
        ]

    # Load product meta and return as DataFrame
    def get_products_df(self, _aspect, n):
        """Return the result of :meth:`get_products` as a DataFrame.

        Columns, in order: ``product`` (str), ``score`` (int),
        ``health_aspect`` (str), ``rating`` (str), ``reviews`` (int).
        """
        product_list = self.get_products(_aspect, n)
        product_data = {
            "product": [meta["name"] for _, meta, _ in product_list],
            "score": [score for score, _, _ in product_list],
            "health_aspect": [source for _, _, source in product_list],
            "rating": [meta["rating"] for _, meta, _ in product_list],
            "reviews": [meta["review_count"] for _, meta, _ in product_list],
        }
        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=product_data).astype(datatypes)

    # Get health aspect
    def get_aspect(self, _aspect):
        """Return the ``health_aspects`` entry for a (possibly fuzzy) name.

        Raises:
            IndexError: if the name matches no known aspect.
        """
        return self.health_aspects[self._resolve_aspect_key(_aspect)]

    # Get health aspect meta
    def get_aspect_meta(self, _aspect):
        """Return the condition or benefit metadata for an aspect name.

        Exact matches are looked up in ``conditions`` first, then in
        ``benefits``. Without an exact match the closest key across both
        tables is used (conditions win if a key exists in both).

        Raises:
            IndexError: if the name matches no condition or benefit.
        """
        key = _aspect.replace(" ", "_")
        if key in self.conditions:
            return self.conditions[key]
        if key in self.benefits:
            return self.benefits[key]

        candidates = [*self.conditions.keys(), *self.benefits.keys()]
        match = self._closest_key(key, candidates, "condition or benefit")
        if match in self.conditions:
            return self.conditions[match]
        return self.benefits[match]

    # Plotting vectors (2D/3D)
    def tsne_plot(self, dataset):
        """Create a t-SNE model for ``dataset`` and plot the embedding.

        Args:
            dataset: Sized iterable of items where ``item[0]`` is the label
                and ``item[1]`` is the vector to embed.

        Returns:
            A ``plotly`` figure: a 3D scatter when ``dataset`` has more than
            two items, otherwise a 2D scatter.
        """
        labels = [item[0] for item in dataset]
        tokens = [np.array(item[1]) for item in dataset]
        n_components = 3 if len(dataset) > 2 else 2

        # NOTE(review): recent scikit-learn releases reject a perplexity that
        # is not smaller than the number of samples and rename ``n_iter`` to
        # ``max_iter``; confirm against the pinned scikit-learn version.
        tsne_model = TSNE(
            perplexity=40,
            n_components=n_components,
            init="pca",
            n_iter=2500,
            random_state=23,
        )
        new_values = tsne_model.fit_transform(tokens)

        # Settings shared by the 2D and the 3D trace.
        shared = {
            "text": labels,
            "textposition": "top right",
            "mode": "lines+markers+text",
            "marker": {
                "size": 10,
                "opacity": 0.8,
            },
        }
        x = [value[0] for value in new_values]
        y = [value[1] for value in new_values]
        if n_components == 3:
            z = [value[2] for value in new_values]
            trace = go.Scatter3d(x=x, y=y, z=z, **shared)
        else:
            trace = go.Scatter(x=x, y=y, **shared)

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )
        return go.Figure(data=[trace], layout=layout)

    # Load substance meta
    def get_substances(self, _aspect, n):
        """Return substances for an aspect and its aliases.

        Sodium, sugar and sugar alcohol entries are never reported. They are
        dropped after each source has been limited to its first ``n``
        entries, so fewer than ``n`` results may come back.

        Args:
            _aspect: Aspect name; spaces are treated as underscores and an
                unknown name falls back to the closest known aspect.
            n: Maximum number of results, or 0 for no limit.

        Returns:
            A list of ``(score, substance, aspect_key)`` tuples sorted by
            score in descending order.

        Raises:
            IndexError: if the name matches no known aspect.
        """
        return self._collect_scored(
            _aspect, n, "substance", exclude=self._EXCLUDED_SUBSTANCES
        )

    # Load substance meta and return as DataFrame
    def get_substances_df(self, _aspect, n):
        """Return the result of :meth:`get_substances` as a DataFrame.

        Columns, in order: ``substance`` (str), ``score`` (int),
        ``health_aspect`` (str).
        """
        substance_list = self.get_substances(_aspect, n)
        substance_data = {
            "substance": [name for _, name, _ in substance_list],
            "score": [score for score, _, _ in substance_list],
            "health_aspect": [source for _, _, source in substance_list],
        }
        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=substance_data).astype(datatypes)
class HealthseaPipe:
    """Helpers that read the custom ``doc._.clauses`` extension."""

    # Get Clauses and their predictions
    def get_clauses(self, doc):
        """Build one standalone ``Doc`` per entry of ``doc._.clauses``.

        Each clause entry is read as a mapping with ``"split_indices"``
        (start/end token indices of the clause within ``doc``) and
        ``"has_ent"``. When ``"has_ent"`` is truthy, the tokens whose index
        lies in ``"ent_indices"`` (start inclusive, end exclusive) are
        replaced by a single word: the ``"blinder"`` text with ``<`` and
        ``>`` stripped, always followed by a space.

        Returns:
            A list of new ``Doc`` objects sharing ``doc.vocab``, in the same
            order as ``doc._.clauses``.
        """
        rebuilt = []
        for clause in doc._.clauses:
            bounds = clause["split_indices"]
            span = doc[bounds[0] : bounds[1]]
            masked = clause["has_ent"]

            texts = []
            trailing = []
            for token in span:
                if masked:
                    if token.i == clause["ent_indices"][0]:
                        # First entity token: emit the placeholder instead.
                        placeholder = clause["blinder"].replace(">", "").replace("<", "")
                        texts.append(placeholder)
                        trailing.append(True)
                        continue
                    ent_start = clause["ent_indices"][0]
                    ent_end = clause["ent_indices"][1]
                    if ent_start <= token.i < ent_end:
                        # Remaining entity tokens are covered by the placeholder.
                        continue
                texts.append(token.text)
                trailing.append(token.whitespace_)

            rebuilt.append(Doc(doc.vocab, words=texts, spaces=trailing))
        return rebuilt