| import pandas as pd |
| from scipy.sparse import csr_matrix |
| from sklearn.base import BaseEstimator |
| from typing import Mapping, List, Tuple |
|
|
|
|
class BaseRepresentation(BaseEstimator):
    """ Base class for models that fine-tune topic representations

    The `extract_topics` method defined here does no fine-tuning of its
    own: it hands back the representations already stored on the topic
    model it receives.
    """

    def extract_topics(
        self,
        topic_model,
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix,
        topics: Mapping[str, List[Tuple[str, float]]],
    ) -> Mapping[str, List[Tuple[str, float]]]:
        """ Return the topic representations stored on the topic model

        Every representation model that inherits from this class gets
        the same four arguments (topic_model, documents, c_tf_idf,
        topics) passed in automatically, so those arguments are the only
        topic-related information such a model can work with.

        This base implementation only reads `topic_model`; `documents`,
        `c_tf_idf` and `topics` are accepted but left unused.

        Arguments:
            topic_model: The BERTopic model, fitted up to the point where
                         topic representations are calculated.
            documents: A dataframe with the columns "Document" and
                       "Topic", holding every document together with
                       its assigned topic.
            c_tf_idf: A c-TF-IDF representation. Typically the same as
                      `topic_model.c_tf_idf_`, except for dynamic,
                      class-based, and hierarchical topic modeling,
                      where it is computed on a subset of the documents.
            topics: A dictionary mapping each topic (key) to its tuples
                    of word and weight (value) as calculated by
                    c-TF-IDF, i.e. the default topics when no
                    representation model is used.

        Returns:
            The value of `topic_model.topic_representations_`.
        """
        # NOTE(review): the `topics` argument is not what gets returned;
        # the model's stored attribute is. Confirm this is intended for
        # the cases where `c_tf_idf` covers only a subset of documents.
        stored_representations = topic_model.topic_representations_
        return stored_representations
|
|