File size: 3,047 Bytes
f01bb12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619fe5f
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import typing as tp
import pandas as pd
import numpy as np
import faiss
from numpy import typing as ntp
import tensorflow_hub as tfhub
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances


# Contract shared by every embedding backend in this module: a callable that
# maps a batch of texts to an array-like of embedding vectors.
Embedder = tp.Callable[[list[str]], ntp.ArrayLike]


def load_model(model_name: str):
    """Build a text-embedding callable for *model_name*.

    Names containing "universal-sentence-encoder" load the TF-Hub USE model;
    anything else is treated as a SentenceTransformers model id.  The returned
    callable maps a batch of strings to their embeddings.
    """
    if "universal-sentence-encoder" in model_name:
        # NOTE(review): the URL pins USE v4 regardless of the exact name given
        # in *model_name* — confirm this is intended.
        use_model = tfhub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        return lambda input_texts: use_model(input_texts)

    st_model = SentenceTransformer(model_name)
    return lambda input_texts: st_model.encode(input_texts, convert_to_tensor=True)


def get_matching_reviews_ids(relevant_products: pd.DataFrame):
    """Flatten the per-product ``reviewID`` lists into one 1-D id array."""
    flat_chunks = [np.asarray(ids).reshape(-1) for ids in relevant_products.reviewID]
    return np.concatenate(flat_chunks)


def query_relevant_documents(
    product_model: Embedder,
    indexer: faiss.Index,
    products: pd.DataFrame,
    query_text: str,
    top_k: int = 64,
    max_distance: float = 1.0,
) -> pd.DataFrame:
    """Embed *query_text* and return the product rows within *max_distance*.

    Parameters
    ----------
    product_model : Embedder
        Callable mapping a batch of texts to embedding vectors.
    indexer : faiss.Index
        Index built over the product embeddings (positionally aligned with
        *products*).
    products : pd.DataFrame
        Product catalogue; rows are selected by position (``iloc``).
    query_text : str
        Free-text search query.
    top_k : int, default 64
        Number of nearest neighbours to retrieve before the distance cut.
    max_distance : float, default 1.0
        Only hits strictly closer than this are kept.
    """
    embedded_query = product_model([query_text])
    dist, idx = indexer.search(embedded_query, top_k)

    # Faiss pads short result lists with -1 indices (and sentinel distances).
    # Positional lookup with iloc[-1] would silently select the *last* product,
    # so drop the padding before applying the distance threshold.
    hits = idx[(dist < max_distance) & (idx >= 0)]
    relevant_products = products.iloc[hits]
    return relevant_products


def get_relevant_reviews(
    relevant_products: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Look up the reviews referenced by *relevant_products* and clean them.

    Drops duplicate review texts, rows without a summary, and boilerplate
    "<N> Star(s)" summaries.  *reviews* must be indexed by review id.
    """
    # Each product row stores colon-separated review ids; flatten to one list.
    review_ids = ":".join(relevant_products.reviewID).split(":")
    relevant_reviews = reviews.loc[review_ids].drop_duplicates("reviewText")

    summaries = relevant_reviews.summary
    # Apply both filters as a single mask aligned with the current index.
    # The original applied them sequentially against a `summaries` captured
    # before the first filter, which leans on pandas implicitly reindexing a
    # stale boolean indexer (deprecated/warned) and on `str.match` producing
    # NaN for missing summaries; `na=False` plus one combined mask avoids both.
    keep = summaries.notna() & ~summaries.str.match(r"\w+ Star(s)?", na=False)
    relevant_reviews = relevant_reviews[keep]

    return relevant_reviews


def clusterize_reviews(
    relevant_reviews: pd.DataFrame,
    reviews_embedder: Embedder,
    clusterer,
) -> pd.Series:
    """Assign a cluster label to every review summary.

    Embeds the summaries, builds a pairwise cosine-distance matrix (float64,
    as some clusterers require), fits *clusterer* on it — expected to follow
    the sklearn convention of ``fit`` exposing ``labels_`` — and returns the
    per-review labels.
    """
    summary_texts = relevant_reviews.summary.tolist()
    embeddings = reviews_embedder(summary_texts)
    distance_matrix = cosine_distances(embeddings).astype(np.float64)
    fitted = clusterer.fit(distance_matrix)
    return fitted.labels_


def get_key_reviews(
    reviews_with_topics, extracted_topics, top_k_topics: int = 5
) -> list[str]:
    """Collect key review texts for the most frequent topics.

    For each of the (at most) *top_k_topics* most common topics, take the
    highest-rated review text plus the topic's first representative example,
    and return their de-duplicated union as a list (order unspecified).
    """
    topic_counts = reviews_with_topics.topic.value_counts()
    n_topics = min(top_k_topics, len(topic_counts))
    top_topics = topic_counts.iloc[:n_topics].index

    # Highest `overall` rating per topic: sort, then keep the first row of
    # each topic group.
    ranked = reviews_with_topics.sort_values(["topic", "overall"], ascending=False)
    best_per_topic = ranked.groupby("topic").head(1).set_index("topic")
    top_rated_reviews = set(best_per_topic.loc[top_topics].reviewText)

    representative_reviews = set()
    for topic_id in top_topics:
        representative_reviews.add(
            extracted_topics[topic_id].representative_examples[0]
        )

    return list(top_rated_reviews | representative_reviews)


def _format_review(x):
    single_line = x.split("\n")[0]
    return f' - {single_line.strip()}'


def key_reviews_to_prompt(reviews):
    """Join *reviews* into a newline-separated bullet list for a prompt."""
    bullet_lines = []
    for review in reviews:
        # Keep only the first line of each review, trimmed, as a bullet.
        first_line = review.split("\n")[0]
        bullet_lines.append(f' - {first_line.strip()}')
    return '\n'.join(bullet_lines)