Spaces:
Sleeping
Sleeping
File size: 3,047 Bytes
f01bb12 619fe5f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import typing as tp
import pandas as pd
import numpy as np
import faiss
from numpy import typing as ntp
import tensorflow_hub as tfhub
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
# Signature used for the text embedders passed around in this module: takes a
# batch of strings and returns an array-like holding their embeddings.
Embedder = tp.Callable[[list[str]], ntp.ArrayLike]
def load_model(model_name: str):
    """Build an embedding function for the requested model.

    If ``model_name`` contains ``"universal-sentence-encoder"`` the Universal
    Sentence Encoder v4 is loaded from TF Hub (the rest of the name is not
    used); otherwise ``model_name`` is handed to ``SentenceTransformer``.

    Returns:
        A callable taking a list of strings and returning the model output
        for that batch. For the SentenceTransformer branch the output is
        requested as a tensor (``convert_to_tensor=True``).
    """
    if "universal-sentence-encoder" not in model_name:
        st_model = SentenceTransformer(model_name)

        def encode_with_sentence_transformer(input_texts: list[str]):
            return st_model.encode(input_texts, convert_to_tensor=True)

        return encode_with_sentence_transformer

    use_model = tfhub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    def encode_with_use(input_texts: list[str]):
        return use_model(input_texts)

    return encode_with_use
def get_matching_reviews_ids(relevant_products: pd.DataFrame) -> np.ndarray:
    """Flatten the ``reviewID`` column of ``relevant_products`` into one array.

    Each cell may hold a scalar or a sequence of ids; every cell is converted
    to a 1-D array and the pieces are concatenated in row order.

    Args:
        relevant_products: Frame with a ``reviewID`` column.

    Returns:
        A 1-D array of all review ids. When there are no rows an empty array
        is returned instead of raising.
    """
    per_product_ids = [
        np.array(ids).reshape(-1) for ids in relevant_products.reviewID.tolist()
    ]
    if not per_product_ids:
        # np.concatenate refuses an empty sequence; "no products" simply means
        # "no review ids". dtype=object avoids pretending the ids are floats.
        return np.array([], dtype=object)
    return np.concatenate(per_product_ids)
def query_relevant_documents(
    product_model: Embedder,
    indexer: faiss.Index,
    products: pd.DataFrame,
    query_text: str,
    top_k: int = 64,
    max_distance: float = 1.0,
) -> pd.DataFrame:
    """Return the products whose indexed embedding is close to ``query_text``.

    Args:
        product_model: Embedder used to encode the query.
        indexer: Index searched with the encoded query; its result positions
            are used as row positions into ``products``.
        products: Product table, positionally aligned with ``indexer``.
        query_text: Free-text query.
        top_k: Maximum number of neighbours requested from the index.
        max_distance: Only neighbours with a distance strictly below this
            value are kept.

    Returns:
        The matching rows of ``products`` in the order the index returned
        them (possibly empty).
    """
    embedded_query = product_model([query_text])
    # Embedders may hand back framework tensors; the index search below is
    # given a plain float32, C-contiguous NumPy matrix instead.
    if hasattr(embedded_query, "detach"):
        # torch-style tensor: drop autograd tracking and move to host memory
        embedded_query = embedded_query.detach().cpu()
    query_matrix = np.ascontiguousarray(embedded_query, dtype=np.float32)

    dist, idx = indexer.search(query_matrix, top_k)
    # Faiss pads missing neighbours with label -1; without the second
    # condition ``iloc`` would interpret that as "last row".
    keep = (dist < max_distance) & (idx >= 0)
    return products.iloc[idx[keep]]
def get_relevant_reviews(
    relevant_products: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Collect the usable reviews belonging to ``relevant_products``.

    Each ``reviewID`` cell is a ``":"``-separated string of review ids. The
    ids are looked up in ``reviews`` (by index label), duplicate review texts
    are dropped, and reviews whose summary is missing or is just a star
    rating (matching ``\\w+ Star(s)?``) are removed.

    Args:
        relevant_products: Frame with a string ``reviewID`` column.
        reviews: Frame indexed by review id with ``reviewText`` and
            ``summary`` columns.

    Returns:
        The filtered subset of ``reviews``; empty when there are no products.

    Raises:
        KeyError: If a referenced review id is not present in ``reviews``.
    """
    # Splitting per cell (and skipping empty pieces) keeps "no products" from
    # turning into a lookup of the id "".
    review_ids = [
        review_id
        for joined_ids in relevant_products.reviewID
        for review_id in joined_ids.split(":")
        if review_id
    ]
    relevant_reviews = reviews.loc[review_ids].drop_duplicates("reviewText")

    summaries = relevant_reviews.summary
    # na=False makes missing summaries count as "no match" so the mask stays
    # boolean; both conditions are evaluated on the same rows and applied once.
    is_star_rating = summaries.str.match(r"\w+ Star(s)?", na=False)
    return relevant_reviews[summaries.notna() & ~is_star_rating]
def clusterize_reviews(
    relevant_reviews: pd.DataFrame,
    reviews_embedder: Embedder,
    clusterer,
) -> pd.Series:
    """Cluster reviews by the cosine distance between their summary embeddings.

    The ``summary`` column is embedded with ``reviews_embedder``, a pairwise
    cosine-distance matrix (cast to float64) is computed, and ``clusterer`` is
    fitted on that matrix.

    Args:
        relevant_reviews: Frame with a ``summary`` column.
        reviews_embedder: Embedder applied to the list of summaries.
        clusterer: Object with a ``fit`` method whose result exposes
            ``labels_``. It receives a distance matrix, so it presumably must
            be configured for precomputed distances - confirm at call site.

    Returns:
        The ``labels_`` attribute of the fitted clusterer, returned as-is.
        NOTE(review): the annotation says ``pd.Series`` but no Series is built
        here - confirm what callers expect.
    """
    summary_texts = relevant_reviews.summary.tolist()
    summary_vectors = reviews_embedder(summary_texts)
    pairwise = cosine_distances(summary_vectors)
    fitted = clusterer.fit(pairwise.astype(np.float64))
    return fitted.labels_
def get_key_reviews(
    reviews_with_topics, extracted_topics, top_k_topics: int = 5
) -> list[str]:
    """Pick a handful of reviews that stand for the most frequent topics.

    For each of the ``top_k_topics`` most frequent values of the ``topic``
    column, two candidates are taken: the review with the highest ``overall``
    value in that topic, and the first representative example recorded for
    the topic in ``extracted_topics``.

    Args:
        reviews_with_topics: Frame with ``topic``, ``overall`` and
            ``reviewText`` columns.
        extracted_topics: Container indexable by topic label whose items
            expose a ``representative_examples`` sequence.
        top_k_topics: Maximum number of topics to cover.

    Returns:
        De-duplicated review texts in a deterministic order: the top-rated
        review of each topic (most frequent topic first), followed by the
        representative examples in the same topic order.
    """
    topic_counts = reviews_with_topics.topic.value_counts()
    top_k = min(top_k_topics, len(topic_counts))
    top_topics = topic_counts.iloc[:top_k].index

    # After the descending sort the first row of every topic group is that
    # topic's highest-rated review.
    best_per_topic = (
        reviews_with_topics
        .sort_values(['topic', 'overall'], ascending=False)
        .groupby('topic')
        .head(1)
        .set_index('topic')
    )
    top_rated_reviews = best_per_topic.loc[top_topics].reviewText.tolist()
    representative_reviews = [
        extracted_topics[topic].representative_examples[0]
        for topic in top_topics
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order; a set union
    # would make the order depend on string hashing and vary between runs.
    return list(dict.fromkeys([*top_rated_reviews, *representative_reviews]))
def _format_review(x):
    """Render one review as a single ``' - ...'`` bullet line.

    Only the first line of the review is used. Surrounding whitespace is
    trimmed before the line is picked, so a review that starts with blank
    lines yields its first line of text instead of an empty bullet.
    """
    first_line = x.strip().split("\n")[0]
    return f' - {first_line.strip()}'


def key_reviews_to_prompt(reviews):
    """Join the given reviews into a newline-separated bullet list.

    Args:
        reviews: Iterable of review strings.

    Returns:
        One ``' - <first line>'`` entry per review, separated by newlines;
        an empty string when ``reviews`` is empty.
    """
    return '\n'.join(_format_review(review) for review in reviews)
|