import difflib

import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
from sklearn.manifold import TSNE
from spacy.tokens import Doc


class HealthseaSearch:
    """Query helper over precomputed health-aspect, product and substance data.

    The constructor stores four mappings as-is. From the way they are read
    in this class:

    * ``health_aspects[name]`` is a mapping with the keys ``"products"``,
      ``"substance"`` and ``"alias"``. The first two are sliceable sequences
      of items whose element 0 is a score and element 1 is an identifier;
      ``"alias"`` is an iterable of other keys of ``health_aspects``.
    * ``products[product_id]`` is a mapping with at least the keys
      ``"name"``, ``"rating"`` and ``"review_count"``.
    * ``conditions`` and ``benefits`` map aspect names to metadata objects
      that are returned untouched.
    """

    # Substance identifiers that are never reported by ``get_substances``.
    _EXCLUDED_SUBSTANCES = ("sodium", "sugar", "sugar_alcohol")

    def __init__(self, _health_aspects, _products, _conditions, _benefits):
        """Store the lookup tables; no copying or validation is performed."""
        self.health_aspects = _health_aspects
        self.products = _products
        self.conditions = _conditions
        self.benefits = _benefits

    def __call__(self, query):
        """Return ``query`` unchanged."""
        return query

    @staticmethod
    def _closest_key(name, candidates):
        """Return the candidate key most similar to ``name``.

        Uses ``difflib.get_close_matches`` with its default cutoff.

        Raises:
            IndexError: if no candidate is similar enough. The exception type
                matches what the previous ``matches[0]`` indexing raised, so
                existing callers keep working.
        """
        matches = difflib.get_close_matches(name, candidates, n=1)
        if not matches:
            raise IndexError(f"no entry is a close match for {name!r}")
        return matches[0]

    def _resolve_aspect(self, _aspect):
        """Map a user-supplied aspect name to ``(key, aspect_data)``.

        Spaces are replaced by underscores. When the normalized name is not a
        key of ``self.health_aspects`` the closest existing key is used.
        """
        name = _aspect.replace(" ", "_")
        if name not in self.health_aspects:
            # Bug fix: the lookup used to be performed with the literal
            # string "_aspect" instead of the requested name.
            name = self._closest_key(name, self.health_aspects.keys())
        return name, self.health_aspects[name]

    def _collect_ranked(self, _aspect, n, field, payload, exclude=()):
        """Gather scored entries for an aspect and all of its aliases.

        Args:
            _aspect: aspect name as typed by the user.
            n: maximum number of entries to take per source and overall;
                ``0`` disables both limits.
            field: key of the scoring list inside an aspect mapping
                (``"products"`` or ``"substance"``).
            payload: callable turning an entry identifier into the value that
                is stored in the result tuple.
            exclude: identifiers that are skipped entirely.

        Returns:
            A list of ``(score, payload(identifier), source_aspect_name)``
            tuples, sorted by score in descending order. Each identifier
            appears at most once (first occurrence wins).
        """
        name, aspect = self._resolve_aspect(_aspect)

        collected = []
        seen = set()

        def take_from(source_name, source):
            scoring = source[field]
            if n != 0:
                scoring = scoring[:n]
            for item in scoring:
                identifier = item[1]
                if identifier in exclude or identifier in seen:
                    continue
                collected.append((item[0], payload(identifier), source_name))
                seen.add(identifier)

        take_from(name, aspect)
        for alias in aspect["alias"]:
            take_from(alias, self.health_aspects[alias])

        # NOTE(review): the list is cut to n *before* sorting, so alias
        # entries only survive when the primary aspect yields fewer than n
        # entries. Kept as-is to preserve existing results; confirm whether
        # sort-then-truncate was intended.
        if n != 0 and len(collected) > n:
            collected = collected[:n]
        collected.sort(key=lambda entry: entry[0], reverse=True)
        return collected

    def get_products(self, _aspect, n):
        """Load product metadata for a health aspect and its aliases.

        Args:
            _aspect: aspect name; spaces are treated as underscores and an
                unknown name falls back to the closest known aspect.
            n: maximum number of products (``0`` means no limit).

        Returns:
            List of ``(score, product_meta, aspect_name)`` tuples sorted by
            descending score, where ``product_meta`` is the entry of
            ``self.products`` for that product.

        Raises:
            IndexError: if the name is unknown and nothing is similar enough.
        """
        return self._collect_ranked(
            _aspect, n, "products", self.products.__getitem__
        )

    def get_products_df(self, _aspect, n):
        """Return the result of ``get_products`` as a typed DataFrame.

        Columns: ``product`` (str), ``score`` (int), ``health_aspect`` (str),
        ``rating`` (str) and ``reviews`` (int).
        """
        rows = self.get_products(_aspect, n)
        product_data = {
            "product": [meta["name"] for _, meta, _ in rows],
            "score": [score for score, _, _ in rows],
            "health_aspect": [source for _, _, source in rows],
            "rating": [meta["rating"] for _, meta, _ in rows],
            "reviews": [meta["review_count"] for _, meta, _ in rows],
        }
        datatypes = {
            "product": str,
            "score": int,
            "health_aspect": str,
            "rating": str,
            "reviews": int,
        }
        return pd.DataFrame(data=product_data).astype(datatypes)

    def get_aspect(self, _aspect):
        """Return the health-aspect entry for a name (closest match if unknown).

        Raises:
            IndexError: if the name is unknown and nothing is similar enough.
        """
        return self._resolve_aspect(_aspect)[1]

    def get_aspect_meta(self, _aspect):
        """Return condition or benefit metadata for an aspect name.

        An exact match is looked up in ``self.conditions`` first and then in
        ``self.benefits``. Without an exact match the closest key of
        ``self.conditions`` is used.

        Raises:
            IndexError: if there is no exact match and no condition key is
                similar enough.
        """
        name = _aspect.replace(" ", "_")
        if name in self.conditions:
            return self.conditions[name]
        if name in self.benefits:
            return self.benefits[name]
        # Bug fix: match against the requested name, not the literal
        # "_aspect". NOTE(review): only conditions are searched in the fuzzy
        # fallback, as before - confirm whether benefits should be included.
        return self.conditions[self._closest_key(name, self.conditions.keys())]

    def tsne_plot(self, dataset):
        """Create a t-SNE model for the given vectors and plot it.

        Args:
            dataset: sized iterable of items where element 0 is the label and
                element 1 is the vector.

        Returns:
            A plotly ``Figure``: a 3D scatter when ``dataset`` has more than
            two items, otherwise a 2D scatter.

        NOTE(review): ``perplexity=40`` and the ``n_iter`` keyword are passed
        unchanged; newer scikit-learn releases appear to require perplexity
        to be below the sample count and to have renamed ``n_iter`` - verify
        against the pinned scikit-learn version.
        """
        labels = [item[0] for item in dataset]
        tokens = [np.array(item[1]) for item in dataset]

        use_3d = len(dataset) > 2
        tsne_model = TSNE(
            perplexity=40,
            n_components=3 if use_3d else 2,
            init="pca",
            n_iter=2500,
            random_state=23,
        )
        new_values = tsne_model.fit_transform(tokens)

        coordinates = {
            "x": [value[0] for value in new_values],
            "y": [value[1] for value in new_values],
        }
        if use_3d:
            coordinates["z"] = [value[2] for value in new_values]

        scatter = go.Scatter3d if use_3d else go.Scatter
        trace = scatter(
            text=labels,
            textposition="top right",
            mode="lines+markers+text",
            marker={
                "size": 10,
                "opacity": 0.8,
            },
            **coordinates,
        )

        # Configure the layout.
        layout = go.Layout(
            margin={"l": 0, "r": 0, "b": 0, "t": 0}, font={"color": "#DF55E2"}
        )
        return go.Figure(data=[trace], layout=layout)

    def get_substances(self, _aspect, n):
        """Load substance scores for a health aspect and its aliases.

        Substances listed in ``_EXCLUDED_SUBSTANCES`` are skipped.

        Args:
            _aspect: aspect name; spaces are treated as underscores and an
                unknown name falls back to the closest known aspect.
            n: maximum number of substances (``0`` means no limit).

        Returns:
            List of ``(score, substance, aspect_name)`` tuples sorted by
            descending score.

        Raises:
            IndexError: if the name is unknown and nothing is similar enough.
        """
        return self._collect_ranked(
            _aspect,
            n,
            "substance",
            lambda identifier: identifier,
            exclude=self._EXCLUDED_SUBSTANCES,
        )

    def get_substances_df(self, _aspect, n):
        """Return the result of ``get_substances`` as a typed DataFrame.

        Columns: ``substance`` (str), ``score`` (int), ``health_aspect`` (str).
        """
        rows = self.get_substances(_aspect, n)
        substance_data = {
            "substance": [substance for _, substance, _ in rows],
            "score": [score for score, _, _ in rows],
            "health_aspect": [source for _, _, source in rows],
        }
        datatypes = {"substance": str, "score": int, "health_aspect": str}
        return pd.DataFrame(data=substance_data).astype(datatypes)


class HealthseaPipe:
    """Helpers for reading the clause data attached to a processed ``Doc``."""

    def get_clauses(self, doc):
        """Rebuild every clause in ``doc._.clauses`` as a standalone ``Doc``.

        Each clause entry is read as a mapping with ``"split_indices"``
        (start/end token indices of the clause), ``"has_ent"`` and, when that
        flag is truthy, ``"ent_indices"`` (start/end token indices of the
        entity) and ``"blinder"`` (placeholder text for the entity).

        For clauses with an entity, the entity tokens are replaced by a
        single token holding the blinder text with ``<`` and ``>`` removed.

        Returns:
            List of new ``Doc`` objects sharing ``doc.vocab``, one per clause.
        """
        clauses = []
        for clause in doc._.clauses:
            split_start, split_end = (
                clause["split_indices"][0],
                clause["split_indices"][1],
            )
            clause_slice = doc[split_start:split_end]

            words = []
            spaces = []
            if clause["has_ent"]:
                ent_start, ent_end = (
                    clause["ent_indices"][0],
                    clause["ent_indices"][1],
                )
                for token in clause_slice:
                    if token.i == ent_start:
                        # First entity token: emit the blinder placeholder,
                        # always followed by a space.
                        words.append(
                            clause["blinder"].replace(">", "").replace("<", "")
                        )
                        spaces.append(True)
                    elif not (ent_start <= token.i < ent_end):
                        words.append(token.text)
                        spaces.append(bool(token.whitespace_))
                    # Remaining entity tokens are dropped.
            else:
                for token in clause_slice:
                    words.append(token.text)
                    spaces.append(bool(token.whitespace_))

            clauses.append(Doc(doc.vocab, words=words, spaces=spaces))
        return clauses