demo / src /deploy_utils.py
Santiago Hincapie Potes
augmented generation
619fe5f
raw
history blame
No virus
3.05 kB
import typing as tp
import pandas as pd
import numpy as np
import faiss
from numpy import typing as ntp
import tensorflow_hub as tfhub
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
Embedder = tp.Callable[[list[str]], ntp.ArrayLike]
def load_model(model_name: str):
if "universal-sentence-encoder" in model_name:
model = tfhub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
def inner_forward_fn(input_texts: list[str]):
return model(input_texts)
else:
model = SentenceTransformer(model_name)
def inner_forward_fn(input_texts: list[str]):
return model.encode(input_texts, convert_to_tensor=True)
return inner_forward_fn
def get_matching_reviews_ids(relevant_products: pd.DataFrame):
matching_reviews_ids = np.concatenate(
[np.array(i).reshape(-1) for i in relevant_products.reviewID.tolist()]
)
return matching_reviews_ids
def query_relevant_documents(
product_model: Embedder,
indexer: faiss.Index,
products: pd.DataFrame,
query_text: str,
) -> pd.DataFrame:
embedded_query = product_model([query_text])
dist, idx = indexer.search(embedded_query, 64)
relevant_products = products.iloc[idx[dist < 1]]
return relevant_products
def get_relevant_reviews(
relevant_products: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
review_ids = ":".join(relevant_products.reviewID).split(":")
relevant_reviews = reviews.loc[review_ids].drop_duplicates("reviewText")
summaries = relevant_reviews.summary
relevant_reviews = relevant_reviews[~summaries.isna()]
relevant_reviews = relevant_reviews[~summaries.str.match(r"\w+ Star(s)?")]
return relevant_reviews
def clusterize_reviews(
relevant_reviews: pd.DataFrame,
reviews_embedder: Embedder,
clusterer,
) -> pd.Series:
embedded_reviews = reviews_embedder(relevant_reviews.summary.tolist())
dist_matrix = cosine_distances(embedded_reviews).astype(np.float64)
clusters = clusterer.fit(dist_matrix)
return clusters.labels_
def get_key_reviews(
reviews_with_topics, extracted_topics, top_k_topics: int = 5
) -> list[str]:
hist_of_topics = reviews_with_topics.topic.value_counts()
top_k = min(top_k_topics, len(hist_of_topics))
indices = hist_of_topics.iloc[:top_k].index
top_rated_reviews = set(
reviews_with_topics
.sort_values(['topic', 'overall'], ascending=False)
.groupby('topic')
.head(1)
.set_index('topic')
.loc[indices]
.reviewText
.tolist()
)
representative_reviews = {
extracted_topics[idx].representative_examples[0]
for idx in indices
}
return list(top_rated_reviews | representative_reviews)
def _format_review(x):
single_line = x.split("\n")[0]
return f' - {single_line.strip()}'
def key_reviews_to_prompt(reviews):
return '\n'.join([
_format_review(i) for i in reviews
])