Santiago Hincapie Potes committed
Commit f01bb12
1 Parent(s): bd231ba
src/deploy_utils.py ADDED
@@ -0,0 +1,98 @@
+ import typing as tp
+ import pandas as pd
+ import numpy as np
+ import faiss
+ from numpy import typing as ntp
+ import tensorflow_hub as tfhub
+ from sentence_transformers import SentenceTransformer
+ from sklearn.metrics.pairwise import cosine_distances
+
+
+ # An embedder maps a batch of texts to an array-like batch of vectors.
+ Embedder = tp.Callable[[list[str]], ntp.ArrayLike]
+
+
+ def load_model(model_name: str) -> Embedder:
+     """Load either the Universal Sentence Encoder or a SentenceTransformer."""
+     if "universal-sentence-encoder" in model_name:
+         model = tfhub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
+
+         def inner_forward_fn(input_texts: list[str]):
+             return model(input_texts)
+
+     else:
+         model = SentenceTransformer(model_name)
+
+         def inner_forward_fn(input_texts: list[str]):
+             return model.encode(input_texts, convert_to_tensor=True)
+
+     return inner_forward_fn
+
+
+ def get_matching_reviews_ids(relevant_products: pd.DataFrame) -> np.ndarray:
+     """Flatten the per-product review id lists into a single 1-D array."""
+     matching_reviews_ids = np.concatenate(
+         [np.array(i).reshape(-1) for i in relevant_products.reviewID.tolist()]
+     )
+
+     return matching_reviews_ids
+
+
+ def query_relevant_documents(
+     product_model: Embedder,
+     indexer: faiss.Index,
+     products: pd.DataFrame,
+     query_text: str,
+ ) -> pd.DataFrame:
+     """Embed the query and keep the neighbours whose distance is below 1."""
+     # faiss expects a contiguous float32 numpy array.
+     embedded_query = np.asarray(product_model([query_text]), dtype=np.float32)
+     dist, idx = indexer.search(embedded_query, 64)
+
+     relevant_products = products.iloc[idx[dist < 1]]
+     return relevant_products
+
+
+ def get_relevant_reviews(
+     relevant_products: pd.DataFrame, reviews: pd.DataFrame
+ ) -> pd.DataFrame:
+     review_ids = ":".join(relevant_products.reviewID).split(":")
+     relevant_reviews = reviews.loc[review_ids].drop_duplicates("reviewText")
+
+     # Drop reviews without a summary, then drop bare "<N> Star(s)" summaries.
+     # The second mask is recomputed from the filtered frame so both masks stay
+     # aligned with the rows they index.
+     relevant_reviews = relevant_reviews[~relevant_reviews.summary.isna()]
+     relevant_reviews = relevant_reviews[
+         ~relevant_reviews.summary.str.match(r"\w+ Star(s)?")
+     ]
+
+     return relevant_reviews
+
+
+ def clusterize_reviews(
+     relevant_reviews: pd.DataFrame,
+     reviews_embedder: Embedder,
+     clusterer,
+ ) -> pd.Series:
+     """Cluster review summaries on their pairwise cosine distances."""
+     embedded_reviews = reviews_embedder(relevant_reviews.summary.tolist())
+     dist_matrix = cosine_distances(embedded_reviews).astype(np.float64)
+     clusters = clusterer.fit(dist_matrix)
+     return clusters.labels_
+
+
+ def get_key_reviews(
+     reviews_with_topics, extracted_topics, top_k_topics: int = 5
+ ) -> list[str]:
+     """Pick the best-rated review and one representative example per top topic."""
+     hist_of_topics = reviews_with_topics.topic.value_counts()
+     top_k = min(top_k_topics, len(hist_of_topics))
+     indices = hist_of_topics.iloc[:top_k].index
+
+     top_rated_reviews = set(
+         reviews_with_topics
+         .sort_values(['topic', 'overall'], ascending=False)
+         .groupby('topic')
+         .head(1)
+         .set_index('topic')
+         .loc[indices]
+         .reviewText
+         .tolist()
+     )
+     representative_reviews = {
+         extracted_topics[idx].representative_examples[0]
+         for idx in indices
+     }
+
+     return list(top_rated_reviews | representative_reviews)
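
For reference, a minimal sketch of how these helpers are meant to compose at serving time. The index file, parquet artifacts, query string, and the AgglomerativeClustering settings below are assumptions for illustration, not part of this commit.

# Hypothetical wiring of src/deploy_utils.py; artifact names and the clusterer are assumed.
import faiss
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from src import deploy_utils

product_model = deploy_utils.load_model("universal-sentence-encoder")
indexer = faiss.read_index("populated.index")            # assumed artifact
products = pd.read_parquet("products.parquet")           # assumed artifact
reviews = pd.read_parquet("reviews.parquet").set_index("reviewID")

relevant_products = deploy_utils.query_relevant_documents(
    product_model, indexer, products, "wireless headphones")
relevant_reviews = deploy_utils.get_relevant_reviews(relevant_products, reviews)

# clusterize_reviews feeds a precomputed cosine-distance matrix to the clusterer.
clusterer = AgglomerativeClustering(
    n_clusters=None, distance_threshold=0.5,
    metric="precomputed", linkage="average")
relevant_reviews["topic"] = deploy_utils.clusterize_reviews(
    relevant_reviews, product_model, clusterer)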
src/modelling/semantic_search/vectordb_utils.py ADDED
@@ -0,0 +1,73 @@
+ import typing as tp
+ import tensorflow as tf
+ import faiss
+ import numpy as np
+ from faiss.contrib.ondisk import merge_ondisk
+ import pathlib
+
+
+ # An embedding model maps a batch of texts to a dense tensor of vectors.
+ Embedding = tp.Callable[[list[str]], tf.Tensor]
+
+
+ def batch_list(
+     input_list: list[tp.Any], batch_size: int
+ ) -> tp.Iterator[list[tp.Any]]:
+     """Yield consecutive slices of `input_list` with at most `batch_size` items."""
+     pointers = [*range(0, len(input_list), batch_size), None]
+     for i, j in zip(pointers, pointers[1:]):
+         yield input_list[i:j]
+
+
+ def train_index(
+     model: Embedding,
+     faiss_index_str: str,
+     text_batch: list[str],
+     trained_index_path: pathlib.Path,
+ ):
+     """Train an empty index built from `faiss_index_str` on one batch of embeddings."""
+     # faiss expects contiguous float32 numpy arrays and string paths.
+     model_output = np.asarray(model(text_batch), dtype=np.float32)
+     index = faiss.index_factory(512, faiss_index_str)
+     index.train(model_output)
+     faiss.write_index(index, str(trained_index_path))
+
+
+ def add_sharded_embeddings(
+     model: Embedding,
+     batched_inputs: list[list[str]],
+     trained_index_path: pathlib.Path,
+     shard_root_dir: pathlib.Path,
+ ):
+     """Write one index shard per input batch, all sharing the trained quantizer."""
+     for idx, batch in enumerate(batched_inputs):
+         index = faiss.read_index(str(trained_index_path))
+         encoded_tensor = np.asarray(model(batch), dtype=np.float32)
+         index.add(encoded_tensor)
+         faiss.write_index(index, str(shard_root_dir / f"shard_{idx}.index"))
+
+
+ def merge_shards(
+     trained_index_path: pathlib.Path,
+     shard_root_dir: pathlib.Path,
+     populated_index_path: pathlib.Path,
+     merged_index_path: pathlib.Path,
+ ):
+     # NOTE: run this on the deploy env!
+     index = faiss.read_index(str(trained_index_path))
+     block_fnames = [str(path) for path in shard_root_dir.iterdir()]
+     merge_ondisk(index, block_fnames, str(merged_index_path))
+
+     faiss.write_index(index, str(populated_index_path))
+
+
+ def load_populated_index(
+     populated_index_path: pathlib.Path, nprobe: int = 16) -> faiss.Index:
+     populated_index = faiss.read_index(str(populated_index_path))
+     populated_index.nprobe = nprobe
+
+     return populated_index
+
+
+ def run_query(
+     populated_index: faiss.Index,
+     model: Embedding,
+     query: str,
+     top_k: int = 8) -> np.ndarray:
+     """Return the ids of the `top_k` nearest vectors to the encoded query."""
+     encoded_query = np.asarray(model([query]), dtype=np.float32)
+     _, idx = populated_index.search(encoded_query, top_k)
+
+     return idx[0]
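
A rough sketch of the intended offline indexing pipeline built from these helpers. It assumes the 512-d Universal Sentence Encoder; the corpus, IVF factory string, batch size, and paths are placeholders, not part of this commit.

# Hypothetical build of the on-disk index; corpus, factory string, and paths are assumed.
import pathlib
import tensorflow_hub as tfhub
from src.modelling.semantic_search import vectordb_utils

model = tfhub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
corpus = [f"product title {i}" for i in range(1_000)]    # placeholder corpus

work_dir = pathlib.Path("index_artifacts")
shard_dir = work_dir / "shards"
shard_dir.mkdir(parents=True, exist_ok=True)

batches = list(vectordb_utils.batch_list(corpus, batch_size=256))
vectordb_utils.train_index(model, "IVF16,Flat", batches[0], work_dir / "trained.index")
vectordb_utils.add_sharded_embeddings(model, batches, work_dir / "trained.index", shard_dir)
vectordb_utils.merge_shards(
    work_dir / "trained.index", shard_dir,
    work_dir / "populated.index", work_dir / "merged.ivfdata")

index = vectordb_utils.load_populated_index(work_dir / "populated.index", nprobe=4)
print(vectordb_utils.run_query(index, model, "product title 42", top_k=2))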
src/modelling/topics/__init__.py ADDED
File without changes
src/modelling/topics/class_tf_idf.py ADDED
@@ -0,0 +1,68 @@
+ from sklearn.feature_extraction.text import TfidfTransformer
+ from sklearn.preprocessing import normalize
+ from sklearn.utils import check_array
+ import numpy as np
+ import scipy.sparse as sp
+
+
+ class ClassTfidfTransformer(TfidfTransformer):
+     """Class-based TF-IDF."""
+
+     def __init__(
+         self,
+         bm25_weighting: bool = False,
+         reduce_frequent_words: bool = False,
+         **kwargs
+     ):
+         self.bm25_weighting = bm25_weighting
+         self.reduce_frequent_words = reduce_frequent_words
+         super(ClassTfidfTransformer, self).__init__(**kwargs)
+
+     def fit(self, X: sp.csr_matrix, multiplier: np.ndarray = None):
+         X = check_array(X, accept_sparse=("csr", "csc"))
+         if not sp.issparse(X):
+             X = sp.csr_matrix(X)
+         dtype = np.float64
+
+         if self.use_idf:
+             _, n_features = X.shape
+
+             # Calculate the frequency of words across all classes
+             df = np.squeeze(np.asarray(X.sum(axis=0)))
+
+             # Calculate the average number of samples as regularization
+             avg_nr_samples = int(X.sum(axis=1).mean())
+
+             # BM25-inspired weighting procedure
+             if self.bm25_weighting:
+                 idf = np.log(1 + ((avg_nr_samples - df + 0.5) / (df + 0.5)))
+
+             # Divide the average number of samples by the word frequency
+             # +1 is added to force values to be positive
+             else:
+                 idf = np.log((avg_nr_samples / df) + 1)
+
+             # Multiplier to increase/decrease certain idf scores
+             if multiplier is not None:
+                 idf = idf * multiplier
+
+             self._idf_diag = sp.diags(
+                 idf,
+                 offsets=0,
+                 shape=(n_features, n_features),
+                 format="csr",
+                 dtype=dtype,
+             )
+
+         return self
+
+     def transform(self, X: sp.csr_matrix):
+         if self.use_idf:
+             X = normalize(X, axis=1, norm="l1", copy=False)
+
+             if self.reduce_frequent_words:
+                 X.data = np.sqrt(X.data)
+
+             X = X * self._idf_diag
+
+         return X
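
A minimal sketch of applying the class-based TF-IDF to per-topic documents, assuming toy review text (the documents below are placeholders); it mirrors how the transformer is fed one concatenated document per topic.

# Class-based TF-IDF on toy per-topic documents (the documents are assumptions).
from sklearn.feature_extraction.text import CountVectorizer
from src.modelling.topics.class_tf_idf import ClassTfidfTransformer

docs_per_topic = [
    "battery lasts long great battery life",     # topic 0: all of its reviews joined
    "arrived broken poor packaging broken box",   # topic 1
]

vectorizer = CountVectorizer(stop_words="english")
bow = vectorizer.fit_transform(docs_per_topic)            # one row per topic
ctfidf = ClassTfidfTransformer(reduce_frequent_words=True).fit_transform(bow)

vocab = vectorizer.get_feature_names_out()
top = ctfidf.toarray().argsort(axis=1)[:, -3:]
for topic, idxs in enumerate(top):
    print(topic, [vocab[i] for i in idxs[::-1]])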
src/modelling/topics/extraction_utils.py ADDED
@@ -0,0 +1,48 @@
+ import pandas as pd
+ import numpy as np
+ import scipy.sparse as sp
+
+
+ def group_reviews_per_topic(
+     reviews: pd.DataFrame, review_text_key: str) -> pd.DataFrame:
+     """Concatenate every review of a topic into one document per topic."""
+     return (
+         reviews
+         .groupby('topic', as_index=False)
+         .agg({review_text_key: ' '.join})
+     )
+
+
+ def mark_empty_docs(text: str) -> str:
+     if not text:
+         return "emptydoc"
+     return text
+
+
+ def check_reviews_schema(reviews: pd.DataFrame):
+     pass
+
+
+ def top_n_idx_sparse(matrix: sp.csr_matrix, n: int) -> np.ndarray:
+     """Return, per row, the column indices of the `n` largest values (padded with None)."""
+     indices = []
+     for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
+         n_row_pick = min(n, ri - le)
+         values = matrix.indices[
+             le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]
+         ]
+         values = [
+             values[index] if len(values) >= index + 1 else None
+             for index in range(n)
+         ]
+         indices.append(values)
+     return np.array(indices)
+
+
+ def top_n_values_sparse(
+     matrix: sp.csr_matrix, indices: np.ndarray) -> np.ndarray:
+     """Return the matrix values that correspond to `indices` for each row (0 for padding)."""
+     top_values = []
+     for row, values in enumerate(indices):
+         scores = np.array([
+             matrix[row, value] if value is not None else 0 for value in values
+         ])
+         top_values.append(scores)
+     return np.array(top_values)
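
A toy demonstration of the sparse top-n helpers; the input matrix below is an assumption and stands in for a c-TF-IDF matrix with one row per topic.

# Toy run of the sparse top-n helpers (the matrix is a placeholder).
import numpy as np
import scipy.sparse as sp
from src.modelling.topics import extraction_utils

m = sp.csr_matrix(np.array([
    [0.0, 0.3, 0.0, 0.9],
    [0.5, 0.0, 0.1, 0.0],
]))
idx = extraction_utils.top_n_idx_sparse(m, n=3)       # column ids, padded with None
vals = extraction_utils.top_n_values_sparse(m, idx)   # matching scores (0 for padding)
print(idx)
print(vals)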
src/modelling/topics/topic_extractor.py ADDED
@@ -0,0 +1,191 @@
+ import dataclasses
+ import collections
+ import typing as tp
+
+ import numpy as np
+ import pandas as pd
+ import scipy.sparse as sp
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+
+ from src.modelling.topics import extraction_utils
+
+
+ TopicID = tp.Union[str, int]
+
+
+ @dataclasses.dataclass
+ class Topic:
+     topic_id: TopicID
+     n_grams: list[tuple[str, float]]
+     representative_examples: list[str]
+     stats: dict[str, float]
+     sentiment: tp.Optional[str] = None  # positive | negative
+     text_label: tp.Optional[str] = None
+
+
+ @dataclasses.dataclass
+ class TopicExtractionConfig:
+     vectorizer_model: CountVectorizer
+     ctfidf_model: TfidfTransformer
+     number_of_grams_per_topic: int = 10
+     number_of_representative_documents: int = 3
+     reduce_topics: tp.Union[int, None] = None
+     review_text_key: str = "reviewText"
+
+     def get_vectorizer_model(self):
+         return self.vectorizer_model
+
+     def get_extraction_model(self):
+         return self.ctfidf_model
+
+
+ class TopicExtractor:
+     """Extract n-grams and representative reviews per topic with class-based TF-IDF."""
+
+     def __init__(self, config: TopicExtractionConfig):
+         self.config = config
+         self.vectorizer_model = self.config.get_vectorizer_model()
+         self.ctfidf_model = self.config.get_extraction_model()
+
+         self.review_text_key = self.config.review_text_key
+         self.c_tf_idf = None
+
+     def __call__(
+         self, reviews: pd.DataFrame
+     ) -> dict[TopicID, Topic]:
+         extraction_utils.check_reviews_schema(reviews)
+         topic_stats = self.compute_topic_stats(reviews)
+         self.extract_topics(reviews)
+
+         representative_examples = self.extract_representative_documents(
+             reviews)
+
+         return {
+             topic_id: Topic(
+                 topic_id=topic_id,
+                 n_grams=self.words_per_topic[topic_id],
+                 representative_examples=example,
+                 stats=topic_stats[topic_id]
+             ) for topic_id, example in representative_examples.items()
+         }
+
+     def extract_topics(self, reviews: pd.DataFrame):
+         reviews_per_topic = extraction_utils.group_reviews_per_topic(
+             reviews, self.review_text_key)
+         self.c_tf_idf, vocab = self.compute_c_tf_idf(reviews_per_topic)
+         self.words_per_topic = self.extract_words_per_topic(reviews, vocab)
+
+     def _prepare_c_tf_idf_text(self, raw_text: pd.Series) -> pd.Series:
+         clean_text = raw_text.str.replace("\n", " ")
+         clean_text = clean_text.str.replace("\t", " ")
+         clean_text = clean_text.str.replace(r"[^A-Za-z0-9 ]+", "", regex=True)
+         clean_text = clean_text.apply(extraction_utils.mark_empty_docs)
+
+         return clean_text
+
+     def compute_topic_stats(self, reviews):
+         return collections.defaultdict(dict)
+
+     def compute_c_tf_idf(
+         self,
+         reviews_per_topic: pd.DataFrame
+     ) -> tuple[sp.csr_matrix, np.ndarray]:
+         """Compute the c-TF-IDF matrix, one row per topic.
+
+         Args:
+             reviews_per_topic: A per-topic dataframe; it must be the output of
+                 `extraction_utils.group_reviews_per_topic`.
+         """
+         clean_reviews = self._prepare_c_tf_idf_text(
+             reviews_per_topic[self.review_text_key])
+
+         # update in place
+         self.vectorizer_model.fit(clean_reviews)
+         vectorized_reviews = self.vectorizer_model.transform(clean_reviews)
+
+         vocab = self.vectorizer_model.get_feature_names_out()
+         c_tf_idf = self.ctfidf_model.fit_transform(vectorized_reviews)
+
+         return c_tf_idf, vocab
+
+     def extract_words_per_topic(
+             self, reviews: pd.DataFrame, vocab: np.ndarray):
+         # Rows of the c-TF-IDF matrix follow the sorted groupby order, so the
+         # labels are sorted here to stay aligned with those rows.
+         labels = sorted(reviews.topic.unique().astype(int))
+
+         indices = extraction_utils.top_n_idx_sparse(
+             self.c_tf_idf, self.config.number_of_grams_per_topic
+         )
+         scores = extraction_utils.top_n_values_sparse(self.c_tf_idf, indices)
+         sorted_indices = np.argsort(scores, 1)
+         indices = np.take_along_axis(indices, sorted_indices, axis=1)
+         scores = np.take_along_axis(scores, sorted_indices, axis=1)
+
+         # Keep the top n-grams per topic based on their c-TF-IDF score
+         topics = {
+             label: [
+                 (vocab[word_index], score)
+                 if word_index is not None and score > 0
+                 else ("", 0.00001)
+                 for word_index, score in
+                 zip(indices[index][::-1], scores[index][::-1])
+             ]
+             for index, label in enumerate(labels)
+         }
+         topics = {
+             label: values[:self.config.number_of_grams_per_topic]
+             for label, values in topics.items()
+         }
+
+         return topics
+
+     def extract_representative_documents(self, reviews):
+         sample_reviews_per_topic = (
+             reviews.groupby('topic')
+             .sample(n=500, replace=True)
+             .drop_duplicates(subset=[self.review_text_key])
+         )
+
+         repr_docs = []
+         repr_docs_indices = []
+         repr_docs_mappings = {}
+         repr_docs_ids = []
+         labels = sorted(list(self.words_per_topic.keys()))
+
+         for index, topic in enumerate(labels):
+             # Slice data
+             selection = sample_reviews_per_topic.loc[
+                 sample_reviews_per_topic.topic == topic, :]
+             selected_docs = selection[self.review_text_key].values
+             selected_full_docs = selection['reviewText'].values
+             selected_docs_ids = selection.index.tolist()
+
+             # Calculate similarity
+             nr_repr_docs = self.config.number_of_representative_documents
+             nr_docs = min(nr_repr_docs, len(selected_docs))
+             bow = self.vectorizer_model.transform(selected_docs)
+             ctfidf = self.ctfidf_model.transform(bow)
+             sim_matrix = cosine_similarity(ctfidf, self.c_tf_idf[index])
+
+             # TODO(shpotes): add diversity
+
+             # extract top n most representative documents
+             indices = np.argpartition(
+                 sim_matrix.reshape(1, -1)[0], -nr_docs)[-nr_docs:]
+             docs = [selected_full_docs[index] for index in indices]
+
+             doc_ids = [
+                 selected_docs_ids[index]
+                 for index, doc in enumerate(selected_docs) if doc in docs
+             ]
+             repr_docs_ids.append(doc_ids)
+             repr_docs.extend(docs)
+             repr_docs_indices.append([
+                 repr_docs_indices[-1][-1] + i + 1 if index != 0 else i
+                 for i in range(nr_docs)
+             ])
+
+         repr_docs_mappings = {
+             topic: repr_docs[i[0]:i[-1]+1]
+             for topic, i in zip(self.words_per_topic.keys(), repr_docs_indices)
+         }
+
+         return repr_docs_mappings
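
A hedged end-to-end sketch of running the extractor on already-clustered reviews. Only the column names ('topic', 'reviewText') come from the code above; the toy dataframe and the config values are assumptions.

# Hypothetical call of TopicExtractor on toy clustered reviews (data is assumed).
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from src.modelling.topics.class_tf_idf import ClassTfidfTransformer
from src.modelling.topics.topic_extractor import TopicExtractionConfig, TopicExtractor

reviews = pd.DataFrame({
    "reviewText": [
        "battery lasts forever, great battery life",
        "the battery died after two days",
        "arrived broken, terrible packaging",
        "box was crushed and the item was broken",
    ],
    "topic": [0, 0, 1, 1],
})

config = TopicExtractionConfig(
    vectorizer_model=CountVectorizer(stop_words="english"),
    ctfidf_model=ClassTfidfTransformer(),
    number_of_grams_per_topic=5,
    number_of_representative_documents=1,
)
topics = TopicExtractor(config)(reviews)
for topic_id, topic in topics.items():
    print(topic_id, topic.n_grams[:3], topic.representative_examples)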
src/streaming_loading.py ADDED
@@ -0,0 +1,45 @@
+ import logging
+ import asyncio
+ import os
+
+ from azure.eventhub.aio import EventHubConsumerClient
+ from google.cloud import storage
+
+ os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/camivasz/almond-datathon-ffcfe3899e67.json"
+
+
+ CONNECTION_STRING = "Endpoint=sb://factored-datathon.servicebus.windows.net/;SharedAccessKeyName=datathon_group_3;SharedAccessKey=JLEggz9GNlDdLvbypDAudzTABp+WnVeIY+AEhBAupi4=;EntityPath=factored_datathon_amazon_reviews_3"
+ EVENT_HUB_LISTEN_POLICY_KEY = "sJJnyi8GGTBAa55jY89kacoT6hXAzWx2B+AEhCPEKYE="
+ CONSUMER_GROUP = 'almond'
+ EVENT_HUB_NAME = "factored_datathon_amazon_reviews_3"
+
+ logger = logging.getLogger("azure.eventhub")
+ logging.basicConfig(level=logging.INFO)
+
+ # Local staging directory for the raw event payloads.
+ os.makedirs("reads", exist_ok=True)
+
+
+ async def on_event(partition_context, event):
+     filename = f"{partition_context.partition_id}_{event.sequence_number}.json"
+     source_file_name = f"reads/{filename}"
+     destination_blob_name = f"patition_0/{filename}"
+
+     # Write the full event body (it is exposed as a generator of byte chunks).
+     with open(source_file_name, 'wb') as fp:
+         fp.write(b"".join(event.body))
+
+     if event.sequence_number > 15391:
+         client_storage = storage.Client()
+         bucket_name = "amazon-reviews-almond-3"
+         bucket = client_storage.bucket(bucket_name)
+         blob = bucket.blob(destination_blob_name)
+         blob.upload_from_filename(source_file_name)
+
+     logger.info("Received event %s from partition %s",
+                 event.sequence_number, partition_context.partition_id)
+     await partition_context.update_checkpoint(event)
+
+
+ async def receive():
+     client = EventHubConsumerClient.from_connection_string(
+         CONNECTION_STRING,
+         CONSUMER_GROUP,
+         eventhub_name=EVENT_HUB_NAME,
+     )
+     async with client:
+         await client.receive(
+             on_event=on_event,
+             starting_position="-1",  # "-1" is from the beginning of the partition.
+         )
+
+
+ if __name__ == '__main__':
+     loop = asyncio.get_event_loop()
+     loop.run_until_complete(receive())
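
The staged payloads are meant to be consumed downstream; a minimal sketch of reading one back, assuming each file holds newline-delimited JSON reviews (the file name and format are placeholders, not verified by this commit).

# Hypothetical downstream read of one staged payload.
import pandas as pd

batch = pd.read_json("reads/0_15392.json", lines=True)
print(batch.head())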