# DTECT/backend/evaluation/CoherenceModel_ttc.py
import logging
import multiprocessing as mp
from collections import namedtuple
import numpy as np
from gensim import interfaces, matutils
from gensim import utils
from gensim.topic_coherence import (
segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation,
)
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments
# Set up logging for this module
logger = logging.getLogger(__name__)
# Define sets for categorizing coherence measures based on their probability estimation method
BOOLEAN_DOCUMENT_BASED = {'u_mass'}
SLIDING_WINDOW_BASED = {'c_v', 'c_uci', 'c_npmi', 'c_w2v'}
# Create a namedtuple to define the structure of a coherence measure pipeline
# Each pipeline consists of a segmentation (seg), probability estimation (prob),
# confirmation measure (conf), and aggregation (aggr) function.
_make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')
# Define the supported coherence measures and their respective pipeline components
COHERENCE_MEASURES = {
'u_mass': _make_pipeline(
segmentation.s_one_pre,
probability_estimation.p_boolean_document,
direct_confirmation_measure.log_conditional_probability,
aggregation.arithmetic_mean
),
'c_v': _make_pipeline(
segmentation.s_one_set,
probability_estimation.p_boolean_sliding_window,
indirect_confirmation_measure.cosine_similarity,
aggregation.arithmetic_mean
),
'c_w2v': _make_pipeline(
segmentation.s_one_set,
probability_estimation.p_word2vec,
indirect_confirmation_measure.word2vec_similarity,
aggregation.arithmetic_mean
),
'c_uci': _make_pipeline(
segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean
),
'c_npmi': _make_pipeline(
segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean
),
}
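# Illustrative sketch of how the four stages compose (this mirrors what
# `CoherenceModel_ttc.get_coherence` does internally for a sliding-window
# measure; shown as comments only, so nothing runs at import time):
#
#     measure = COHERENCE_MEASURES['c_npmi']
#     segmented = measure.seg(topics_as_id_lists)            # 1. segmentation
#     accumulator = measure.prob(                            # 2. probability estimation
#         texts=texts, segmented_topics=segmented,
#         dictionary=dictionary, window_size=10, processes=1)
#     per_topic = measure.conf(segmented, accumulator,       # 3. confirmation
#                              normalize=True)
#     score = measure.aggr(per_topic)                        # 4. aggregation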
# Define default sliding window sizes for different coherence measures
SLIDING_WINDOW_SIZES = {
'c_v': 110,
'c_w2v': 5,
'c_uci': 10,
'c_npmi': 10,
'u_mass': None # u_mass does not use a sliding window
}
class CoherenceModel_ttc(interfaces.TransformationABC):
"""Objects of this class allow for building and maintaining a model for topic coherence.
Examples
    --------
    One way of using this feature is through providing a trained topic model. A dictionary has to be
    explicitly provided if the model does not contain a dictionary already.
.. sourcecode:: pycon
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models.ldamodel import LdaModel
>>> # Assuming CoherenceModel_ttc is imported or defined in the current scope
>>> # from your_module import CoherenceModel_ttc # if saved in a file
>>>
>>> model = LdaModel(common_corpus, 5, common_dictionary)
>>>
>>> cm = CoherenceModel_ttc(model=model, corpus=common_corpus, coherence='u_mass')
>>> coherence = cm.get_coherence() # get coherence value
Another way of using this feature is through providing tokenized topics such as:
.. sourcecode:: pycon
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> # Assuming CoherenceModel_ttc is imported or defined in the current scope
>>> # from your_module import CoherenceModel_ttc # if saved in a file
>>> topics = [
... ['human', 'computer', 'system', 'interface'],
... ['graph', 'minors', 'trees', 'eps']
... ]
>>>
>>> cm = CoherenceModel_ttc(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
>>> coherence = cm.get_coherence() # get coherence value
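    The sliding-window measures take tokenized `texts` instead of a corpus. A minimal sketch
    using gensim's bundled test data:
    .. sourcecode:: pycon
        >>> from gensim.test.utils import common_texts, common_dictionary
        >>> topics = [
        ...     ['human', 'computer', 'system', 'interface'],
        ...     ['graph', 'minors', 'trees', 'eps']
        ... ]
        >>>
        >>> cm = CoherenceModel_ttc(topics=topics, texts=common_texts, dictionary=common_dictionary, coherence='c_v')
        >>> coherence = cm.get_coherence()  # get coherence value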
"""
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None,
window_size=None, keyed_vectors=None, coherence='c_v', topn=20, processes=-1):
"""
Initializes the CoherenceModel_ttc.
Parameters
----------
model : :class:`~gensim.models.basemodel.BaseTopicModel`, optional
Pre-trained topic model. Should be provided if `topics` is not provided.
Supports models that implement the `get_topics` method.
topics : list of list of str, optional
List of tokenized topics. If provided, `dictionary` must also be provided.
texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding-window probability estimator (e.g., `c_v`, `c_uci`, `c_npmi`).
corpus : iterable of list of (int, number), optional
Corpus in Bag-of-Words format.
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Gensim dictionary mapping word IDs to words, used to create the corpus.
If `model.id2word` is present and `dictionary` is None, `model.id2word` will be used.
window_size : int, optional
The size of the window to be used for coherence measures using boolean sliding window as their
probability estimator. For 'u_mass' this doesn't matter.
If None, default window sizes from `SLIDING_WINDOW_SIZES` are used.
keyed_vectors : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
Pre-trained word embeddings (e.g., Word2Vec model) for 'c_w2v' coherence.
coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi', 'c_w2v'}, optional
Coherence measure to be used.
'u_mass' requires `corpus` (or `texts` which will be converted to corpus).
'c_v', 'c_uci', 'c_npmi', 'c_w2v' require `texts`.
topn : int, optional
Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.
processes : int, optional
Number of processes to use for probability estimation phase. Any value less than 1 will be interpreted as
`num_cpus - 1`. Defaults to -1.
"""
# Ensure either a model or explicit topics are provided
if model is None and topics is None:
raise ValueError("One of 'model' or 'topics' has to be provided.")
# If topics are provided, a dictionary is mandatory to convert tokens to IDs
elif topics is not None and dictionary is None:
raise ValueError("Dictionary has to be provided if 'topics' are to be used.")
self.keyed_vectors = keyed_vectors
# Ensure a data source (keyed_vectors, texts, or corpus) is provided for coherence calculation
if keyed_vectors is None and texts is None and corpus is None:
raise ValueError("One of 'texts', 'corpus', or 'keyed_vectors' has to be provided.")
# Determine the dictionary to use
if dictionary is None:
# If no explicit dictionary, try to use the model's dictionary
if isinstance(model.id2word, utils.FakeDict):
# If model's id2word is a FakeDict, it means no proper dictionary is associated
                raise ValueError(
                    "A dictionary has to be provided explicitly, or the topic model's "
                    "'id2word' attribute must be set to a proper dictionary.")
else:
self.dictionary = model.id2word
else:
self.dictionary = dictionary
# Store coherence type and window size
self.coherence = coherence
self.window_size = window_size
        if self.window_size is None:
            # Use the default window size if not specified. `.get` keeps unsupported
            # coherence types from raising a KeyError here; they are rejected with a
            # clearer ValueError below.
            self.window_size = SLIDING_WINDOW_SIZES.get(self.coherence)
# Store texts and corpus
self.texts = texts
self.corpus = corpus
# Validate inputs based on coherence type
if coherence in BOOLEAN_DOCUMENT_BASED:
# For document-based measures (e.g., u_mass), corpus is preferred
if utils.is_corpus(corpus)[0]:
self.corpus = corpus
elif self.texts is not None:
# If texts are provided, convert them to corpus format
self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
else:
raise ValueError(
"Either 'corpus' with 'dictionary' or 'texts' should "
"be provided for %s coherence." % coherence)
elif coherence == 'c_w2v' and keyed_vectors is not None:
# For c_w2v, keyed_vectors are needed
pass
elif coherence in SLIDING_WINDOW_BASED:
# For sliding window-based measures, texts are required
if self.texts is None:
raise ValueError("'texts' should be provided for %s coherence." % coherence)
else:
# Raise error if coherence type is not supported
raise ValueError("%s coherence is not currently supported." % coherence)
self._topn = topn
self._model = model
self._accumulator = None # Cached accumulator for probability estimation
self._topics = None # Store topics internally
self.topics = topics # Call the setter to initialize topics and accumulator state
# Determine the number of processes to use for parallelization
self.processes = processes if processes >= 1 else max(1, mp.cpu_count() - 1)
@classmethod
def for_models(cls, models, dictionary, topn=20, **kwargs):
"""
Initialize a CoherenceModel_ttc with estimated probabilities for all of the given models.
This method extracts topics from each model and then uses `for_topics`.
Parameters
----------
models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
List of models to evaluate coherence of. Each model should implement
the `get_topics` method.
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
            Gensim dictionary mapping word IDs to words.
topn : int, optional
Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.
kwargs : object
Additional arguments passed to the `CoherenceModel_ttc` constructor (e.g., `corpus`, `texts`, `coherence`).
Returns
-------
        CoherenceModel_ttc
CoherenceModel_ttc instance with estimated probabilities for all given models.
Example
-------
.. sourcecode:: pycon
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models.ldamodel import LdaModel
>>> # from your_module import CoherenceModel_ttc
>>>
>>> m1 = LdaModel(common_corpus, 3, common_dictionary)
>>> m2 = LdaModel(common_corpus, 5, common_dictionary)
>>>
>>> cm = CoherenceModel_ttc.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
        >>> # To get per-model coherences:
        >>> # model_coherences = cm.compare_models([m1, m2])
"""
# Extract top words as lists for each model's topics
topics = [cls.top_topics_as_word_lists(model, dictionary, topn) for model in models]
kwargs['dictionary'] = dictionary
kwargs['topn'] = topn
# Use for_topics to initialize the coherence model with these topics
return cls.for_topics(topics, **kwargs)
@staticmethod
def top_topics_as_word_lists(model, dictionary, topn=20):
"""
Get `topn` topics from a model as lists of words.
Parameters
----------
model : :class:`~gensim.models.basemodel.BaseTopicModel`
Pre-trained topic model.
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
            Gensim dictionary mapping word IDs to words.
topn : int, optional
Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.
Returns
-------
list of list of str
Top topics in list-of-list-of-words format.
"""
# Ensure id2token mapping exists in the dictionary
if not dictionary.id2token:
dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}
str_topics = []
for topic_distribution in model.get_topics():
# Get the indices of the topN words based on their probabilities
bestn_indices = matutils.argsort(topic_distribution, topn=topn, reverse=True)
# Convert word IDs back to words using the dictionary
best_words = [dictionary.id2token[_id] for _id in bestn_indices]
str_topics.append(best_words)
return str_topics
@classmethod
def for_topics(cls, topics_as_topn_terms, **kwargs):
"""
Initialize a CoherenceModel_ttc with estimated probabilities for all of the given topics.
This is useful when you have raw topics (list of lists of words) and not a Gensim model object.
Parameters
----------
        topics_as_topn_terms : list of list of list of str
            Each element of the top-level list is the list of topics for one model,
            and each topic is a list of its top-N words,
            e.g. `[[['word1', 'word2'], ['word3', 'word4']]]` for a single model with two topics.
Returns
-------
        CoherenceModel_ttc
CoherenceModel_ttc with estimated probabilities for the given topics.
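        Example
        -------
        A minimal sketch using gensim's bundled test data, with two topic sets of two topics each:
        .. sourcecode:: pycon
            >>> from gensim.test.utils import common_texts, common_dictionary
            >>> topic_sets = [
            ...     [['human', 'computer', 'system'], ['graph', 'trees', 'minors']],
            ...     [['user', 'response', 'time'], ['eps', 'interface', 'survey']]
            ... ]
            >>> cm = CoherenceModel_ttc.for_topics(topic_sets, texts=common_texts,
            ...                                    dictionary=common_dictionary, coherence='c_npmi')
            >>> results = cm.compare_model_topics(topic_sets)  # token lists are converted to IDs internally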
"""
if not topics_as_topn_terms:
raise ValueError("len(topics_as_topn_terms) must be > 0.")
if any(len(topic_list) == 0 for topic_list in topics_as_topn_terms):
raise ValueError("Found an empty topic listing in `topics_as_topn_terms`.")
# Determine the maximum 'topn' value among the provided topics
# This will be used to initialize the CoherenceModel_ttc correctly for probability estimation
actual_topn_in_data = 0
for topic_list in topics_as_topn_terms:
for topic in topic_list:
actual_topn_in_data = max(actual_topn_in_data, len(topic))
        # Estimate probabilities below for *all* words in the provided topics; the
        # `topn` requested via kwargs (capped at the number of words actually
        # available per topic) is applied only afterwards, when coherence is computed.
        topn_for_coherence = min(kwargs.pop('topn', actual_topn_in_data), actual_topn_in_data)
# Flatten all topics into a single "super topic" for initial probability estimation.
# This ensures that all words relevant to *any* topic in the comparison set
# are included in the accumulator.
super_topic = utils.flatten(topics_as_topn_terms)
logger.info(
"Number of relevant terms for all %d models (or topic sets): %d",
len(topics_as_topn_terms), len(super_topic))
# Initialize CoherenceModel_ttc with the super topic to pre-estimate probabilities
# for all relevant words across all models.
# We pass `topics=[super_topic]` and `topn=len(super_topic)` to ensure all words
# are considered during the probability estimation phase.
        cm = cls(topics=[super_topic], topn=len(super_topic), **kwargs)
cm.estimate_probabilities() # Perform the actual probability estimation
        # After estimation, restore `topn` to the value desired for coherence calculation.
        cm.topn = topn_for_coherence
return cm
def __str__(self):
"""Returns a string representation of the coherence measure pipeline."""
return str(self.measure)
@property
def model(self):
"""
Get the current topic model used by the instance.
Returns
-------
:class:`~gensim.models.basemodel.BaseTopicModel`
The currently set topic model.
"""
return self._model
@model.setter
def model(self, model):
"""
Set the topic model for the instance. When a new model is set,
it triggers an update of the internal topics and checks if the accumulator needs recomputing.
Parameters
----------
model : :class:`~gensim.models.basemodel.BaseTopicModel`
The new topic model to set.
"""
self._model = model
if model is not None:
new_topics = self._get_topics() # Get topics from the new model
self._update_accumulator(new_topics) # Check and update accumulator if needed
self._topics = new_topics # Store the new topics
@property
def topn(self):
"""
Get the number of top words (`_topn`) used for coherence calculation.
Returns
-------
int
The number of top words.
"""
return self._topn
@topn.setter
def topn(self, topn):
"""
Set the number of top words (`_topn`) to consider for coherence calculation.
If the new `topn` requires more words than currently loaded topics, and a model is available,
it will attempt to re-extract topics from the model.
Parameters
----------
topn : int
The new number of top words.
"""
# Get the length of the first topic to check current topic length
current_topic_length = len(self._topics[0])
# Determine if the new 'topn' requires more words than currently available in topics
requires_expansion = current_topic_length < topn
if self.model is not None:
self._topn = topn
if requires_expansion:
# If expansion is needed and a model is available, re-extract topics from the model.
# This call to the setter property `self.model = self._model` effectively re-runs
# the logic that extracts topics and updates the accumulator based on the new `_topn`.
self.model = self._model
else:
# If no model is available and expansion is required, raise an error
if requires_expansion:
raise ValueError("Model unavailable and topic sizes are less than topn=%d" % topn)
self._topn = topn # Topics will be truncated by the `topics` getter if needed
@property
def measure(self):
"""
Returns the namedtuple representing the coherence pipeline functions
(segmentation, probability estimation, confirmation, aggregation)
based on the `self.coherence` type.
Returns
-------
namedtuple
Pipeline that contains needed functions/method for calculating coherence.
"""
return COHERENCE_MEASURES[self.coherence]
@property
def topics(self):
"""
Get the current topics. If the internally stored topics have more words
than `self._topn`, they are truncated to `self._topn` words.
Returns
-------
list of list of str
Topics as lists of word tokens.
"""
# If the stored topics contain more words than `_topn`, truncate them
if len(self._topics[0]) > self._topn:
return [topic[:self._topn] for topic in self._topics]
else:
return self._topics
@topics.setter
def topics(self, topics):
"""
Set the topics for the instance. This method converts topic words to their
corresponding dictionary IDs and updates the accumulator state.
Parameters
----------
topics : list of list of str or list of list of int
Topics, either as lists of word tokens or lists of word IDs.
"""
if topics is not None:
new_topics = []
for topic in topics:
# Ensure topic elements are converted to dictionary IDs (numpy array for efficiency)
topic_token_ids = self._ensure_elements_are_ids(topic)
new_topics.append(topic_token_ids)
if self.model is not None:
# Warn if both model and explicit topics are set, as they might be inconsistent
logger.warning(
"The currently set model '%s' may be inconsistent with the newly set topics",
self.model)
elif self.model is not None:
# If topics are None but a model exists, extract topics from the model
new_topics = self._get_topics()
logger.debug("Setting topics to those of the model: %s", self.model)
else:
new_topics = None
# Check if the accumulator needs to be recomputed based on the new topics
self._update_accumulator(new_topics)
self._topics = new_topics # Store the (ID-converted) topics
def _ensure_elements_are_ids(self, topic):
"""
Internal helper to ensure that topic elements are converted to dictionary IDs.
Handles cases where input topic might be tokens or already IDs.
Parameters
----------
topic : list of str or list of int
A single topic, either as a list of word tokens or word IDs.
Returns
-------
:class:`numpy.ndarray`
A numpy array of word IDs for the topic.
        Raises
        ------
        ValueError
            If the topic can be interpreted neither as a list of known tokens nor as a list of valid IDs.
"""
        # Interpret the elements as tokens first; tokens missing from the dictionary are dropped.
        ids_from_tokens = [
            self.dictionary.token2id[token] for token in topic if token in self.dictionary.token2id]
        if ids_from_tokens:
            return np.array(ids_from_tokens)
        # Nothing matched as a token, so assume the elements are already word IDs and
        # keep only those that are valid entries in the dictionary.
        ids_from_ids = [_id for _id in topic if _id in self.dictionary]
        if ids_from_ids:
            return np.array(ids_from_ids)
        raise ValueError(
            "Unable to interpret topic as either a list of tokens or a list of valid IDs "
            "within the dictionary.")
def _update_accumulator(self, new_topics):
"""
Internal helper to determine if the cached `_accumulator` (probability statistics)
needs to be wiped and recomputed due to changes in topics.
"""
if self._relevant_ids_will_differ(new_topics):
logger.debug("Wiping cached accumulator since it does not contain all relevant ids.")
self._accumulator = None
def _relevant_ids_will_differ(self, new_topics):
"""
Internal helper to check if the set of unique word IDs relevant to the new topics
is different from the IDs already covered by the current accumulator.
Parameters
----------
new_topics : list of list of int
The new set of topics (as word IDs).
Returns
-------
bool
True if the relevant IDs will differ, False otherwise.
"""
if self._accumulator is None or not self._topics_differ(new_topics):
return False
# Get unique IDs from the segmented new topics
new_set = unique_ids_from_segments(self.measure.seg(new_topics))
# Check if the current accumulator's relevant IDs are a superset of the new set.
# If not, it means the new topics introduce words not covered, so the accumulator needs updating.
return not self._accumulator.relevant_ids.issuperset(new_set)
def _topics_differ(self, new_topics):
"""
Internal helper to check if the new topics are different from the currently stored topics.
Parameters
----------
new_topics : list of list of int
The new set of topics (as word IDs).
Returns
-------
bool
True if topics are different, False otherwise.
"""
# Compare topic arrays using numpy.array_equal for efficient comparison
return (new_topics is not None
and self._topics is not None
and not np.array_equal(new_topics, self._topics))
def _get_topics(self):
"""
Internal helper function to extract top words (as IDs) from a trained topic model.
"""
return self._get_topics_from_model(self.model, self.topn)
@staticmethod
def _get_topics_from_model(model, topn):
"""
Internal static method to extract top `topn` words (as IDs) from a trained topic model.
Parameters
----------
model : :class:`~gensim.models.basemodel.BaseTopicModel`
Pre-trained topic model (must implement `get_topics` method).
topn : int
Integer corresponding to the number of top words to extract.
Returns
-------
list of :class:`numpy.ndarray`
A list where each element is a numpy array of word IDs representing a topic's top words.
Raises
------
AttributeError
If the provided model does not implement a `get_topics` method.
"""
try:
# Iterate over the topic distributions from the model
# Use matutils.argsort to get the indices (word IDs) of the top `topn` words
            return [
                matutils.argsort(topic, topn=topn, reverse=True)
                for topic in model.get_topics()
            ]
except AttributeError:
raise ValueError(
"This topic model is not currently supported. Supported topic models"
" should implement the `get_topics` method.")
def segment_topics(self):
"""
Segments the current topics using the segmentation function defined by the
chosen coherence measure (`self.measure.seg`).
Returns
-------
list of list of tuple
Segmented topics. The structure depends on the segmentation method (e.g., pairs of word IDs).
"""
# Apply the segmentation function from the pipeline to the current topics
return self.measure.seg(self.topics)
def estimate_probabilities(self, segmented_topics=None):
"""
Accumulates word occurrences and co-occurrences from texts or corpus
using the optimal probability estimation method for the chosen coherence metric.
This operation can be computationally intensive, especially for sliding window methods.
Parameters
----------
segmented_topics : list of list of tuple, optional
Segmented topics. If None, `self.segment_topics()` is called internally.
Returns
-------
:class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator`
An object that holds the accumulated statistics (word frequencies, co-occurrence frequencies).
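        Example
        -------
        A minimal sketch using gensim's bundled test data, estimating once and reusing the cached statistics:
        .. sourcecode:: pycon
            >>> from gensim.test.utils import common_texts, common_dictionary
            >>> topics = [['human', 'computer', 'system'], ['graph', 'trees', 'minors']]
            >>> cm = CoherenceModel_ttc(topics=topics, texts=common_texts,
            ...                         dictionary=common_dictionary, coherence='c_npmi')
            >>> accumulator = cm.estimate_probabilities()  # the expensive step, cached on the instance
            >>> coherence = cm.get_coherence()  # reuses the cached accumulator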
"""
if segmented_topics is None:
segmented_topics = self.segment_topics()
# Choose the appropriate probability estimation method based on the coherence type
if self.coherence in BOOLEAN_DOCUMENT_BASED:
self._accumulator = self.measure.prob(self.corpus, segmented_topics)
else:
kwargs = dict(
texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size,
processes=self.processes)
if self.coherence == 'c_w2v':
kwargs['model'] = self.keyed_vectors # Pass keyed_vectors for word2vec based coherence
self._accumulator = self.measure.prob(**kwargs)
return self._accumulator
def get_coherence_per_topic(self, segmented_topics=None, with_std=False, with_support=False):
"""
Calculates and returns a list of coherence values, one for each topic,
based on the pipeline's confirmation measure.
Parameters
----------
segmented_topics : list of list of tuple, optional
Segmented topics. If None, `self.segment_topics()` is called internally.
with_std : bool, optional
If True, also includes the standard deviation across topic segment sets in addition
to the mean coherence for each topic. Defaults to False.
with_support : bool, optional
If True, also includes the "support" (number of pairwise similarity comparisons)
used to compute each topic's coherence. Defaults to False.
Returns
-------
list of float or list of tuple
A sequence of similarity measures for each topic.
If `with_std` or `with_support` is True, each element in the list will be a tuple
containing the coherence value and the requested additional statistics.
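        Example
        -------
        A minimal sketch using gensim's bundled test data ('u_mass', with per-topic standard deviations):
        .. sourcecode:: pycon
            >>> from gensim.test.utils import common_corpus, common_dictionary
            >>> topics = [['human', 'computer', 'system'], ['graph', 'trees', 'minors']]
            >>> cm = CoherenceModel_ttc(topics=topics, corpus=common_corpus,
            ...                         dictionary=common_dictionary, coherence='u_mass')
            >>> per_topic = cm.get_coherence_per_topic(with_std=True)  # one (mean, std) pair per topic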
"""
measure = self.measure
if segmented_topics is None:
segmented_topics = measure.seg(self.topics)
# Ensure probabilities are estimated before calculating coherence
if self._accumulator is None:
self.estimate_probabilities(segmented_topics)
kwargs = dict(with_std=with_std, with_support=with_support)
if self.coherence in BOOLEAN_DOCUMENT_BASED or self.coherence == 'c_w2v':
# These coherence types don't require specific additional kwargs for confirmation measure
pass
elif self.coherence == 'c_v':
# Specific kwargs for c_v's confirmation measure (cosine_similarity)
kwargs['topics'] = self.topics
kwargs['measure'] = 'nlr' # Normalized Log Ratio
kwargs['gamma'] = 1
else:
# For c_uci and c_npmi, 'normalize' parameter is relevant
kwargs['normalize'] = (self.coherence == 'c_npmi')
return measure.conf(segmented_topics, self._accumulator, **kwargs)
def aggregate_measures(self, topic_coherences):
"""
Aggregates the individual topic coherence measures into a single overall score
using the pipeline's aggregation function (`self.measure.aggr`).
Parameters
----------
topic_coherences : list of float
List of coherence values for each topic.
Returns
-------
float
The aggregated coherence value (e.g., arithmetic mean).
"""
# Apply the aggregation function from the pipeline to the list of topic coherences
return self.measure.aggr(topic_coherences)
def get_coherence(self):
"""
Calculates and returns the overall coherence value for the entire set of topics.
This is the main entry point for getting a single coherence score.
Returns
-------
float
The aggregated coherence value.
"""
# First, get coherence values for each individual topic
confirmed_measures = self.get_coherence_per_topic()
# Then, aggregate these topic-level coherences into a single score
return self.aggregate_measures(confirmed_measures)
def compare_models(self, models):
"""
Compares multiple topic models by their coherence values.
It extracts topics from each model and then calls `compare_model_topics`.
Parameters
----------
models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
A sequence of topic models to compare.
Returns
-------
list of (list of float, float)
A sequence where each element is a pair:
(list of average topic coherences for the model, overall model coherence).
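        Example
        -------
        A minimal sketch comparing two LDA models (see `for_models` for precomputing statistics once across models):
        .. sourcecode:: pycon
            >>> from gensim.test.utils import common_corpus, common_dictionary
            >>> from gensim.models.ldamodel import LdaModel
            >>> m1 = LdaModel(common_corpus, 3, common_dictionary)
            >>> m2 = LdaModel(common_corpus, 5, common_dictionary)
            >>> cm = CoherenceModel_ttc(model=m1, corpus=common_corpus, coherence='u_mass', topn=5)
            >>> (topics1, avg1), (topics2, avg2) = cm.compare_models([m1, m2])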
"""
# Extract topics (as word IDs) for each model using the internal helper
model_topics = [self._get_topics_from_model(model, self.topn) for model in models]
# Delegate to compare_model_topics for the actual coherence comparison
return self.compare_model_topics(model_topics)
def compare_model_topics(self, model_topics):
"""
Performs coherence evaluation for each set of topics provided in `model_topics`.
This method is designed to be efficient by precomputing probabilities once if needed,
and then evaluating coherence for each set of topics.
Parameters
----------
model_topics : list of list of list of int
A list where each element is itself a list of topics (each topic being a list of word IDs)
representing a set of topics (e.g., from a single model).
Returns
-------
list of (list of float, float)
A sequence where each element is a pair:
(list of average topic coherences for the topic set, overall topic set coherence).
Notes
-----
        This method uses a heuristic of evaluating coherence at several `topn` values (e.g., 20, 15, 10, 5)
        and averaging the results, which tends to give a more robust comparison.
"""
# Store original topics and topn to restore them after comparison
orig_topics = self._topics
orig_topn = self.topn
try:
# Perform the actual comparison
coherences = self._compare_model_topics(model_topics)
finally:
# Ensure original topics and topn are restored even if an error occurs
self.topics = orig_topics
self.topn = orig_topn
return coherences
def _compare_model_topics(self, model_topics):
"""
Internal helper to get average topic and model coherences across multiple sets of topics.
Parameters
----------
model_topics : list of list of list of int
A list where each element is a set of topics (list of lists of word IDs).
Returns
-------
list of (list of float, float)
A sequence of pairs:
(average topic coherences across different `topn` values for each topic,
overall model coherence averaged across different `topn` values).
"""
coherences = []
        # Evaluate coherence over a grid of `topn` values and average the results,
        # which gives a more robust estimate. The grid steps down from `self.topn`
        # in increments of 5, stopping above min(self.topn - 1, 4): e.g. topn=20
        # yields [20, 15, 10, 5], while topn < 5 yields just [topn].
        last_topn_value = min(self.topn - 1, 4)
        topn_grid = list(range(self.topn, last_topn_value, -5))
        if not topn_grid or max(topn_grid) < 1:
            # Defensive fallback for degenerate `topn` values (< 1)
            topn_grid = [max(1, min(self.topn, 5))]
for model_num, topics in enumerate(model_topics):
# Set the current topics for the instance to the topics of the model being evaluated
self.topics = topics
coherence_at_n = {} # Dictionary to store coherence results for different `topn` values
for n in topn_grid:
self.topn = n # Set the `topn` for the current evaluation round
topic_coherences = self.get_coherence_per_topic()
# Handle NaN values in topic coherences by imputing with the mean
filled_coherences = np.array(topic_coherences, dtype=float)
                # Replace NaNs with the mean of the non-NaN values; if every value
                # is NaN (np.nanmean then returns NaN itself), fall back to 0.0.
                if np.any(np.isnan(filled_coherences)):
                    mean_val = np.nanmean(filled_coherences)
                    if np.isnan(mean_val):
                        filled_coherences[np.isnan(filled_coherences)] = 0.0
                    else:
                        filled_coherences[np.isnan(filled_coherences)] = mean_val
# Store the topic-level coherences and the aggregated (overall) coherence for this `topn`
coherence_at_n[n] = (topic_coherences, self.aggregate_measures(filled_coherences))
# Unpack the stored coherences for different `topn` values
all_topic_coherences_at_n, all_avg_coherences_at_n = zip(*coherence_at_n.values())
# Calculate the average topic coherence across all `topn` values
# np.vstack stacks lists of topic coherences into a 2D array, then mean(0) computes mean for each topic.
avg_topic_coherences = np.vstack(all_topic_coherences_at_n).mean(axis=0)
# Calculate the overall model coherence by averaging the aggregated coherences from all `topn` values
model_coherence = np.mean(all_avg_coherences_at_n)
            logger.info("Avg coherence for model %d: %.5f", model_num, model_coherence)
coherences.append((avg_topic_coherences.tolist(), model_coherence)) # Convert numpy array back to list for output
return coherences