Spaces:
Runtime error
Runtime error
import pandas as pd | |
import difflib | |
from spacy.tokens import Doc | |
import plotly | |
import plotly.graph_objs as go | |
from sklearn.manifold import TSNE | |
import numpy as np | |
class HealthseaSearch:
    """Query helper over pre-loaded health-aspect, product, condition and benefit tables.

    The tables are plain mappings, used as follows by the methods below:

    * ``health_aspects``: aspect key -> dict with ``"products"`` (sequence of
      ``(score, product_id)`` entries), ``"substance"`` (sequence of
      ``(score, substance_name)`` entries) and ``"alias"`` (iterable of other
      keys of ``health_aspects``).
    * ``products``: product id -> dict with ``"name"``, ``"rating"`` and
      ``"review_count"``.
    * ``conditions`` / ``benefits``: aspect key -> metadata object.

    Queries are normalised by replacing spaces with underscores; when the
    normalised key is unknown, the closest key according to
    ``difflib.get_close_matches`` is used instead.
    """

    # Substance names that are never reported by ``get_substances``.
    _EXCLUDED_SUBSTANCES = ("sodium", "sugar", "sugar_alcohol")

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        """Store the four lookup tables on the instance."""
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _resolve_key(name, table):
        """Map a free-text name onto an existing key of ``table``.

        Args:
            name: Query string; spaces are replaced with underscores.
            table: Mapping whose keys are the valid names.

        Returns:
            The normalised name if it is a key of ``table``, otherwise the
            best fuzzy match among the keys.

        Raises:
            IndexError: If the name is unknown and no key is similar enough.
        """
        key = name.replace(" ", "_")
        if key in table:
            return key
        # Match against the actual query, not a fixed placeholder string.
        matches = difflib.get_close_matches(key, table.keys())
        if not matches:
            raise IndexError(f"no entry resembling {key!r} was found")
        return matches[0]

    @staticmethod
    def _limit(entries, n):
        """Return the first ``n`` entries, or all of them when ``n`` is 0."""
        return entries if n == 0 else entries[:n]

    def _collect(self, _aspect, n, field, build, skip=()):
        """Gather de-duplicated scored entries for an aspect and its aliases.

        Args:
            _aspect: Free-text aspect name (resolved via ``_resolve_key``).
            n: Maximum number of entries read per source and kept overall;
                0 means no limit.
            field: Key of the scored list inside each aspect dict.
            build: Callable turning an entry identifier into the payload
                stored in the result tuple.
            skip: Identifiers that must never be reported.

        Returns:
            List of ``(score, payload, source_key)`` tuples sorted by score,
            highest first.
        """
        key = self._resolve_key(_aspect, self.health_aspects)
        aspect = self.health_aspects[key]

        collected = []
        seen = set()
        # The requested aspect is processed first, then each alias in order.
        for source in [key, *aspect["alias"]]:
            for entry in self._limit(self.health_aspects[source][field], n):
                score, ident = entry[0], entry[1]
                if ident in skip or ident in seen:
                    continue
                seen.add(ident)
                collected.append((score, build(ident), source))

        # NOTE(review): the cut to ``n`` happens before sorting, so entries
        # gathered first (requested aspect, then aliases in order) win over
        # later, possibly higher-scored ones -- confirm this is intended.
        if n != 0 and len(collected) > n:
            collected = collected[:n]
        return sorted(collected, key=lambda item: item[0], reverse=True)

    # ------------------------------------------------------------------
    # Products
    # ------------------------------------------------------------------
    def get_products(self, _aspect, n):
        """Return scored products for a health aspect and its aliases.

        Args:
            _aspect: Free-text aspect name.
            n: Maximum number of products; 0 means no limit.

        Returns:
            List of ``(score, product_meta, aspect_key)`` tuples, highest
            score first, where ``product_meta`` is the entry of
            ``self.products`` for that product id.

        Raises:
            IndexError: If the aspect cannot be resolved to a known key.
        """
        return self._collect(
            _aspect, n, "products", lambda product_id: self.products[product_id]
        )

    def get_products_df(self, _aspect, n):
        """Return the result of ``get_products`` as a typed ``DataFrame``.

        Columns: ``product`` (str), ``score`` (int), ``health_aspect`` (str),
        ``rating`` (str) and ``reviews`` (int).
        """
        ranked = self.get_products(_aspect, n)
        product_data = {
            "product": [meta["name"] for _, meta, _ in ranked],
            "score": [score for score, _, _ in ranked],
            "health_aspect": [source for _, _, source in ranked],
            "rating": [meta["rating"] for _, meta, _ in ranked],
            "reviews": [meta["review_count"] for _, meta, _ in ranked],
        }
        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=product_data).astype(datatypes)

    # ------------------------------------------------------------------
    # Aspects
    # ------------------------------------------------------------------
    def get_aspect(self, _aspect):
        """Return the health-aspect entry for a (possibly misspelled) name.

        Raises:
            IndexError: If the aspect cannot be resolved to a known key.
        """
        return self.health_aspects[self._resolve_key(_aspect, self.health_aspects)]

    def get_aspect_meta(self, _aspect):
        """Return condition or benefit metadata for an aspect name.

        Exact matches are looked up in ``conditions`` first, then in
        ``benefits``. The fuzzy fallback only consults ``conditions``.

        Raises:
            IndexError: If nothing matches exactly and no condition key is
                similar enough.
        """
        key = _aspect.replace(" ", "_")
        if key in self.conditions:
            return self.conditions[key]
        if key in self.benefits:
            return self.benefits[key]
        return self.conditions[self._resolve_key(key, self.conditions)]

    # ------------------------------------------------------------------
    # Plotting vectors (2D/3D)
    # ------------------------------------------------------------------
    def tsne_plot(self, dataset):
        """Create a t-SNE model for ``dataset`` and plot it.

        Args:
            dataset: Sequence of items where ``item[0]`` is the label and
                ``item[1]`` the vector to embed.

        Returns:
            A plotly ``Figure``: a 3D scatter when ``dataset`` has more than
            two items, otherwise a 2D scatter.
        """
        labels = [item[0] for item in dataset]
        tokens = [np.array(item[1]) for item in dataset]

        dims = 3 if len(dataset) > 2 else 2
        # NOTE(review): newer scikit-learn releases appear to rename ``n_iter``
        # to ``max_iter`` and to require perplexity < number of samples --
        # confirm against the pinned scikit-learn version.
        tsne_model = TSNE(
            perplexity=40, n_components=dims, init="pca", n_iter=2500, random_state=23
        )
        new_values = tsne_model.fit_transform(tokens)

        # One coordinate list per embedded dimension: x, y and (for 3D) z.
        axes = {
            axis: [value[index] for value in new_values]
            for index, axis in enumerate("xyz"[:dims])
        }
        scatter = go.Scatter3d if dims == 3 else go.Scatter
        trace = scatter(
            text=labels,
            textposition="top right",
            mode="lines+markers+text",
            marker={
                "size": 10,
                "opacity": 0.8,
            },
            **axes,
        )

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )
        return go.Figure(data=[trace], layout=layout)

    # ------------------------------------------------------------------
    # Substances
    # ------------------------------------------------------------------
    def get_substances(self, _aspect, n):
        """Return scored substances for a health aspect and its aliases.

        Substances listed in ``_EXCLUDED_SUBSTANCES`` are never reported.

        Args:
            _aspect: Free-text aspect name.
            n: Maximum number of substances; 0 means no limit.

        Returns:
            List of ``(score, substance_name, aspect_key)`` tuples, highest
            score first.

        Raises:
            IndexError: If the aspect cannot be resolved to a known key.
        """
        return self._collect(
            _aspect,
            n,
            "substance",
            lambda substance_name: substance_name,
            skip=self._EXCLUDED_SUBSTANCES,
        )

    def get_substances_df(self, _aspect, n):
        """Return the result of ``get_substances`` as a typed ``DataFrame``.

        Columns: ``substance`` (str), ``score`` (int), ``health_aspect`` (str).
        """
        ranked = self.get_substances(_aspect, n)
        substance_data = {
            "substance": [name for _, name, _ in ranked],
            "score": [score for score, _, _ in ranked],
            "health_aspect": [source for _, _, source in ranked],
        }
        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=substance_data).astype(datatypes)
class HealthseaPipe:
    """Helpers for working with documents that carry clause annotations."""

    def get_clauses(self, doc):
        """Rebuild each clause of ``doc`` as its own standalone ``Doc``.

        Every item of ``doc._.clauses`` is read as a mapping with
        ``"split_indices"`` (start/end token indices of the clause),
        ``"has_ent"``, and, when an entity is present, ``"ent_indices"``
        (start/end token indices of the entity) and ``"blinder"`` (the
        placeholder text for the entity).

        For clauses with an entity, the first entity token is replaced by the
        blinder text with ``<`` and ``>`` removed, and the remaining entity
        tokens are dropped. All other tokens are copied with their trailing
        whitespace.

        Returns:
            List of new ``Doc`` objects, one per clause, sharing ``doc.vocab``.
        """
        rebuilt = []
        for info in doc._.clauses:
            bounds = info["split_indices"]
            span = doc[bounds[0] : bounds[1]]

            if not info["has_ent"]:
                # No entity to mask: copy the clause tokens verbatim.
                words = [tok.text for tok in span]
                spaces = [tok.whitespace_ for tok in span]
                rebuilt.append(Doc(doc.vocab, words=words, spaces=spaces))
                continue

            words = []
            spaces = []
            for tok in span:
                if tok.i == info["ent_indices"][0]:
                    # The first entity token becomes the blinder placeholder.
                    placeholder = info["blinder"].replace(">", "").replace("<", "")
                    words.append(placeholder)
                    spaces.append(True)
                    continue
                if tok.i in range(info["ent_indices"][0], info["ent_indices"][1]):
                    # Remaining entity tokens are covered by the placeholder.
                    continue
                words.append(tok.text)
                spaces.append(tok.whitespace_)
            rebuilt.append(Doc(doc.vocab, words=words, spaces=spaces))

        return rebuilt