Spaces:

AdhyaSuman
/

DTECT

Running

App Files Files Community

DTECT / backend /evaluation /CoherenceModel_ttc.py

AdhyaSuman

Initial commit with Git LFS for large files

11c72a2 6 months ago

raw

history blame contribute delete

38.1 kB

	import logging
	import multiprocessing as mp
	from collections import namedtuple

	import numpy as np

	from gensim import interfaces, matutils
	from gensim import utils
	from gensim.topic_coherence import (
	segmentation, probability_estimation,
	direct_confirmation_measure, indirect_confirmation_measure,
	aggregation,
	)
	from gensim.topic_coherence.probability_estimation import unique_ids_from_segments

	# Set up logging for this module
	logger = logging.getLogger(__name__)

	# Define sets for categorizing coherence measures based on their probability estimation method
	BOOLEAN_DOCUMENT_BASED = {'u_mass'}
	SLIDING_WINDOW_BASED = {'c_v', 'c_uci', 'c_npmi', 'c_w2v'}

	# Create a namedtuple to define the structure of a coherence measure pipeline
	# Each pipeline consists of a segmentation (seg), probability estimation (prob),
	# confirmation measure (conf), and aggregation (aggr) function.
	_make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

	# Define the supported coherence measures and their respective pipeline components
	COHERENCE_MEASURES = {
	'u_mass': _make_pipeline(
	segmentation.s_one_pre,
	probability_estimation.p_boolean_document,
	direct_confirmation_measure.log_conditional_probability,
	aggregation.arithmetic_mean
	),
	'c_v': _make_pipeline(
	segmentation.s_one_set,
	probability_estimation.p_boolean_sliding_window,
	indirect_confirmation_measure.cosine_similarity,
	aggregation.arithmetic_mean
	),
	'c_w2v': _make_pipeline(
	segmentation.s_one_set,
	probability_estimation.p_word2vec,
	indirect_confirmation_measure.word2vec_similarity,
	aggregation.arithmetic_mean
	),
	'c_uci': _make_pipeline(
	segmentation.s_one_one,
	probability_estimation.p_boolean_sliding_window,
	direct_confirmation_measure.log_ratio_measure,
	aggregation.arithmetic_mean
	),
	'c_npmi': _make_pipeline(
	segmentation.s_one_one,
	probability_estimation.p_boolean_sliding_window,
	direct_confirmation_measure.log_ratio_measure,
	aggregation.arithmetic_mean
	),
	}

	# Define default sliding window sizes for different coherence measures
	SLIDING_WINDOW_SIZES = {
	'c_v': 110,
	'c_w2v': 5,
	'c_uci': 10,
	'c_npmi': 10,
	'u_mass': None # u_mass does not use a sliding window
	}


	class CoherenceModel_ttc(interfaces.TransformationABC):
	"""Objects of this class allow for building and maintaining a model for topic coherence.

	Examples
	---------
	One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly provided
	if the model does not contain a dictionary already

	.. sourcecode:: pycon

	>>> from gensim.test.utils import common_corpus, common_dictionary
	>>> from gensim.models.ldamodel import LdaModel
	>>> # Assuming CoherenceModel_ttc is imported or defined in the current scope
	>>> # from your_module import CoherenceModel_ttc # if saved in a file
	>>>
	>>> model = LdaModel(common_corpus, 5, common_dictionary)
	>>>
	>>> cm = CoherenceModel_ttc(model=model, corpus=common_corpus, coherence='u_mass')
	>>> coherence = cm.get_coherence() # get coherence value

	Another way of using this feature is through providing tokenized topics such as:

	.. sourcecode:: pycon

	>>> from gensim.test.utils import common_corpus, common_dictionary
	>>> # Assuming CoherenceModel_ttc is imported or defined in the current scope
	>>> # from your_module import CoherenceModel_ttc # if saved in a file
	>>> topics = [
	... ['human', 'computer', 'system', 'interface'],
	... ['graph', 'minors', 'trees', 'eps']
	... ]
	>>>
	>>> cm = CoherenceModel_ttc(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
	>>> coherence = cm.get_coherence() # get coherence value

	"""
	def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None,
	window_size=None, keyed_vectors=None, coherence='c_v', topn=20, processes=-1):
	"""
	Initializes the CoherenceModel_ttc.

	Parameters
	----------
	model : :class:`~gensim.models.basemodel.BaseTopicModel`, optional
	Pre-trained topic model. Should be provided if `topics` is not provided.
	Supports models that implement the `get_topics` method.
	topics : list of list of str, optional
	List of tokenized topics. If provided, `dictionary` must also be provided.
	texts : list of list of str, optional
	Tokenized texts, needed for coherence models that use sliding window based (e.g., `c_v`, `c_uci`, `c_npmi`).
	corpus : iterable of list of (int, number), optional
	Corpus in Bag-of-Words format.
	dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
	Gensim dictionary mapping of id word to create corpus.
	If `model.id2word` is present and `dictionary` is None, `model.id2word` will be used.
	window_size : int, optional
	The size of the window to be used for coherence measures using boolean sliding window as their
	probability estimator. For 'u_mass' this doesn't matter.
	If None, default window sizes from `SLIDING_WINDOW_SIZES` are used.
	keyed_vectors : :class:`~gensim.models.keyedvectors.KeyedVectors`, optional
	Pre-trained word embeddings (e.g., Word2Vec model) for 'c_w2v' coherence.
	coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi', 'c_w2v'}, optional
	Coherence measure to be used.
	'u_mass' requires `corpus` (or `texts` which will be converted to corpus).
	'c_v', 'c_uci', 'c_npmi', 'c_w2v' require `texts`.
	topn : int, optional
	Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.
	processes : int, optional
	Number of processes to use for probability estimation phase. Any value less than 1 will be interpreted as
	`num_cpus - 1`. Defaults to -1.
	"""
	# Ensure either a model or explicit topics are provided
	if model is None and topics is None:
	raise ValueError("One of 'model' or 'topics' has to be provided.")
	# If topics are provided, a dictionary is mandatory to convert tokens to IDs
	elif topics is not None and dictionary is None:
	raise ValueError("Dictionary has to be provided if 'topics' are to be used.")

	self.keyed_vectors = keyed_vectors
	# Ensure a data source (keyed_vectors, texts, or corpus) is provided for coherence calculation
	if keyed_vectors is None and texts is None and corpus is None:
	raise ValueError("One of 'texts', 'corpus', or 'keyed_vectors' has to be provided.")

	# Determine the dictionary to use
	if dictionary is None:
	# If no explicit dictionary, try to use the model's dictionary
	if isinstance(model.id2word, utils.FakeDict):
	# If model's id2word is a FakeDict, it means no proper dictionary is associated
	raise ValueError(
	"The associated dictionary should be provided with the corpus or 'id2word'"
	" for topic model should be set as the associated dictionary.")
	else:
	self.dictionary = model.id2word
	else:
	self.dictionary = dictionary

	# Store coherence type and window size
	self.coherence = coherence
	self.window_size = window_size
	if self.window_size is None:
	# Use default window size if not specified
	self.window_size = SLIDING_WINDOW_SIZES[self.coherence]

	# Store texts and corpus
	self.texts = texts
	self.corpus = corpus

	# Validate inputs based on coherence type
	if coherence in BOOLEAN_DOCUMENT_BASED:
	# For document-based measures (e.g., u_mass), corpus is preferred
	if utils.is_corpus(corpus)[0]:
	self.corpus = corpus
	elif self.texts is not None:
	# If texts are provided, convert them to corpus format
	self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
	else:
	raise ValueError(
	"Either 'corpus' with 'dictionary' or 'texts' should "
	"be provided for %s coherence." % coherence)

	elif coherence == 'c_w2v' and keyed_vectors is not None:
	# For c_w2v, keyed_vectors are needed
	pass
	elif coherence in SLIDING_WINDOW_BASED:
	# For sliding window-based measures, texts are required
	if self.texts is None:
	raise ValueError("'texts' should be provided for %s coherence." % coherence)
	else:
	# Raise error if coherence type is not supported
	raise ValueError("%s coherence is not currently supported." % coherence)

	self._topn = topn
	self._model = model
	self._accumulator = None # Cached accumulator for probability estimation
	self._topics = None # Store topics internally
	self.topics = topics # Call the setter to initialize topics and accumulator state

	# Determine the number of processes to use for parallelization
	self.processes = processes if processes >= 1 else max(1, mp.cpu_count() - 1)

	@classmethod
	def for_models(cls, models, dictionary, topn=20, **kwargs):
	"""
	Initialize a CoherenceModel_ttc with estimated probabilities for all of the given models.
	This method extracts topics from each model and then uses `for_topics`.

	Parameters
	----------
	models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
	List of models to evaluate coherence of. Each model should implement
	the `get_topics` method.
	dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
	Gensim dictionary mapping of id word.
	topn : int, optional
	Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.
	kwargs : object
	Additional arguments passed to the `CoherenceModel_ttc` constructor (e.g., `corpus`, `texts`, `coherence`).

	Returns
	-------
	:class:`~gensim.models.coherencemodel.CoherenceModel`
	CoherenceModel_ttc instance with estimated probabilities for all given models.

	Example
	-------
	.. sourcecode:: pycon

	>>> from gensim.test.utils import common_corpus, common_dictionary
	>>> from gensim.models.ldamodel import LdaModel
	>>> # from your_module import CoherenceModel_ttc
	>>>
	>>> m1 = LdaModel(common_corpus, 3, common_dictionary)
	>>> m2 = LdaModel(common_corpus, 5, common_dictionary)
	>>>
	>>> cm = CoherenceModel_ttc.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
	>>> # To get coherences for each model:
	>>> # model_coherences = cm.compare_model_topics([
	>>> # CoherenceModel_ttc._get_topics_from_model(m1, topn=cm.topn),
	>>> # CoherenceModel_ttc._get_topics_from_model(m2, topn=cm.topn)
	>>> # ])
	"""
	# Extract top words as lists for each model's topics
	topics = [cls.top_topics_as_word_lists(model, dictionary, topn) for model in models]
	kwargs['dictionary'] = dictionary
	kwargs['topn'] = topn
	# Use for_topics to initialize the coherence model with these topics
	return cls.for_topics(topics, **kwargs)

	@staticmethod
	def top_topics_as_word_lists(model, dictionary, topn=20):
	"""
	Get `topn` topics from a model as lists of words.

	Parameters
	----------
	model : :class:`~gensim.models.basemodel.BaseTopicModel`
	Pre-trained topic model.
	dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
	Gensim dictionary mapping of id word.
	topn : int, optional
	Integer corresponding to the number of top words to be extracted from each topic. Defaults to 20.

	Returns
	-------
	list of list of str
	Top topics in list-of-list-of-words format.
	"""
	# Ensure id2token mapping exists in the dictionary
	if not dictionary.id2token:
	dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

	str_topics = []
	for topic_distribution in model.get_topics():
	# Get the indices of the topN words based on their probabilities
	bestn_indices = matutils.argsort(topic_distribution, topn=topn, reverse=True)
	# Convert word IDs back to words using the dictionary
	best_words = [dictionary.id2token[_id] for _id in bestn_indices]
	str_topics.append(best_words)
	return str_topics

	@classmethod
	def for_topics(cls, topics_as_topn_terms, **kwargs):
	"""
	Initialize a CoherenceModel_ttc with estimated probabilities for all of the given topics.
	This is useful when you have raw topics (list of lists of words) and not a Gensim model object.

	Parameters
	----------
	topics_as_topn_terms : list of list of str
	Each element in the top-level list should be a list of top-N words, one per topic.
	For example: `[['word1', 'word2'], ['word3', 'word4']]`.

	Returns
	-------
	:class:`~gensim.models.coherencemodel.CoherenceModel`
	CoherenceModel_ttc with estimated probabilities for the given topics.
	"""
	if not topics_as_topn_terms:
	raise ValueError("len(topics_as_topn_terms) must be > 0.")
	if any(len(topic_list) == 0 for topic_list in topics_as_topn_terms):
	raise ValueError("Found an empty topic listing in `topics_as_topn_terms`.")

	# Determine the maximum 'topn' value among the provided topics
	# This will be used to initialize the CoherenceModel_ttc correctly for probability estimation
	actual_topn_in_data = 0
	for topic_list in topics_as_topn_terms:
	for topic in topic_list:
	actual_topn_in_data = max(actual_topn_in_data, len(topic))

	# Use the provided 'topn' from kwargs, or the determined 'actual_topn_in_data',
	# ensuring it's not greater than the actual data available.
	# This allows for precomputing probabilities for a wider set of words if needed.
	topn_for_prob_estimation = min(kwargs.pop('topn', actual_topn_in_data), actual_topn_in_data)

	# Flatten all topics into a single "super topic" for initial probability estimation.
	# This ensures that all words relevant to any topic in the comparison set
	# are included in the accumulator.
	super_topic = utils.flatten(topics_as_topn_terms)

	logger.info(
	"Number of relevant terms for all %d models (or topic sets): %d",
	len(topics_as_topn_terms), len(super_topic))

	# Initialize CoherenceModel_ttc with the super topic to pre-estimate probabilities
	# for all relevant words across all models.
	# We pass `topics=[super_topic]` and `topn=len(super_topic)` to ensure all words
	# are considered during the probability estimation phase.
	cm = CoherenceModel_ttc(topics=[super_topic], topn=len(super_topic), **kwargs)
	cm.estimate_probabilities() # Perform the actual probability estimation

	# After estimation, set the 'topn' back to the desired value for coherence calculation.
	cm.topn = topn_for_prob_estimation
	return cm

	def __str__(self):
	"""Returns a string representation of the coherence measure pipeline."""
	return str(self.measure)

	@property
	def model(self):
	"""
	Get the current topic model used by the instance.

	Returns
	-------
	:class:`~gensim.models.basemodel.BaseTopicModel`
	The currently set topic model.
	"""
	return self._model

	@model.setter
	def model(self, model):
	"""
	Set the topic model for the instance. When a new model is set,
	it triggers an update of the internal topics and checks if the accumulator needs recomputing.

	Parameters
	----------
	model : :class:`~gensim.models.basemodel.BaseTopicModel`
	The new topic model to set.
	"""
	self._model = model
	if model is not None:
	new_topics = self._get_topics() # Get topics from the new model
	self._update_accumulator(new_topics) # Check and update accumulator if needed
	self._topics = new_topics # Store the new topics

	@property
	def topn(self):
	"""
	Get the number of top words (`_topn`) used for coherence calculation.

	Returns
	-------
	int
	The number of top words.
	"""
	return self._topn

	@topn.setter
	def topn(self, topn):
	"""
	Set the number of top words (`_topn`) to consider for coherence calculation.
	If the new `topn` requires more words than currently loaded topics, and a model is available,
	it will attempt to re-extract topics from the model.

	Parameters
	----------
	topn : int
	The new number of top words.
	"""
	# Get the length of the first topic to check current topic length
	current_topic_length = len(self._topics[0])
	# Determine if the new 'topn' requires more words than currently available in topics
	requires_expansion = current_topic_length < topn

	if self.model is not None:
	self._topn = topn
	if requires_expansion:
	# If expansion is needed and a model is available, re-extract topics from the model.
	# This call to the setter property `self.model = self._model` effectively re-runs
	# the logic that extracts topics and updates the accumulator based on the new `_topn`.
	self.model = self._model
	else:
	# If no model is available and expansion is required, raise an error
	if requires_expansion:
	raise ValueError("Model unavailable and topic sizes are less than topn=%d" % topn)
	self._topn = topn # Topics will be truncated by the `topics` getter if needed

	@property
	def measure(self):
	"""
	Returns the namedtuple representing the coherence pipeline functions
	(segmentation, probability estimation, confirmation, aggregation)
	based on the `self.coherence` type.

	Returns
	-------
	namedtuple
	Pipeline that contains needed functions/method for calculating coherence.
	"""
	return COHERENCE_MEASURES[self.coherence]

	@property
	def topics(self):
	"""
	Get the current topics. If the internally stored topics have more words
	than `self._topn`, they are truncated to `self._topn` words.

	Returns
	-------
	list of list of str
	Topics as lists of word tokens.
	"""
	# If the stored topics contain more words than `_topn`, truncate them
	if len(self._topics[0]) > self._topn:
	return [topic[:self._topn] for topic in self._topics]
	else:
	return self._topics

	@topics.setter
	def topics(self, topics):
	"""
	Set the topics for the instance. This method converts topic words to their
	corresponding dictionary IDs and updates the accumulator state.

	Parameters
	----------
	topics : list of list of str or list of list of int
	Topics, either as lists of word tokens or lists of word IDs.
	"""
	if topics is not None:
	new_topics = []
	for topic in topics:
	# Ensure topic elements are converted to dictionary IDs (numpy array for efficiency)
	topic_token_ids = self._ensure_elements_are_ids(topic)
	new_topics.append(topic_token_ids)

	if self.model is not None:
	# Warn if both model and explicit topics are set, as they might be inconsistent
	logger.warning(
	"The currently set model '%s' may be inconsistent with the newly set topics",
	self.model)
	elif self.model is not None:
	# If topics are None but a model exists, extract topics from the model
	new_topics = self._get_topics()
	logger.debug("Setting topics to those of the model: %s", self.model)
	else:
	new_topics = None

	# Check if the accumulator needs to be recomputed based on the new topics
	self._update_accumulator(new_topics)
	self._topics = new_topics # Store the (ID-converted) topics

	def _ensure_elements_are_ids(self, topic):
	"""
	Internal helper to ensure that topic elements are converted to dictionary IDs.
	Handles cases where input topic might be tokens or already IDs.

	Parameters
	----------
	topic : list of str or list of int
	A single topic, either as a list of word tokens or word IDs.

	Returns
	-------
	:class:`numpy.ndarray`
	A numpy array of word IDs for the topic.

	Raises
	------
	KeyError
	If a token is not found in the dictionary or an ID is not a valid key in id2token.
	"""
	try:
	# Try to convert tokens to IDs. This is the common case if `topic` contains strings.
	return np.array([self.dictionary.token2id[token] for token in topic if token in self.dictionary.token2id])
	except KeyError:
	# If `KeyError` occurs, assume `topic` might already be a list of IDs.
	# Attempt to convert IDs to tokens and then back to IDs, ensuring they are valid dictionary entries.
	# This handles cases where `topic` might contain integer IDs that are not present in the dictionary.
	try:
	# Convert IDs to tokens (via id2token) and then tokens to IDs (via token2id)
	# This filters out invalid IDs.
	return np.array([self.dictionary.token2id[self.dictionary.id2token[_id]]
	for _id in topic if _id in self.dictionary])
	except KeyError:
	raise ValueError("Unable to interpret topic as either a list of tokens or a list of valid IDs within the dictionary.")

	def _update_accumulator(self, new_topics):
	"""
	Internal helper to determine if the cached `_accumulator` (probability statistics)
	needs to be wiped and recomputed due to changes in topics.
	"""
	if self._relevant_ids_will_differ(new_topics):
	logger.debug("Wiping cached accumulator since it does not contain all relevant ids.")
	self._accumulator = None

	def _relevant_ids_will_differ(self, new_topics):
	"""
	Internal helper to check if the set of unique word IDs relevant to the new topics
	is different from the IDs already covered by the current accumulator.

	Parameters
	----------
	new_topics : list of list of int
	The new set of topics (as word IDs).

	Returns
	-------
	bool
	True if the relevant IDs will differ, False otherwise.
	"""
	if self._accumulator is None or not self._topics_differ(new_topics):
	return False

	# Get unique IDs from the segmented new topics
	new_set = unique_ids_from_segments(self.measure.seg(new_topics))
	# Check if the current accumulator's relevant IDs are a superset of the new set.
	# If not, it means the new topics introduce words not covered, so the accumulator needs updating.
	return not self._accumulator.relevant_ids.issuperset(new_set)

	def _topics_differ(self, new_topics):
	"""
	Internal helper to check if the new topics are different from the currently stored topics.

	Parameters
	----------
	new_topics : list of list of int
	The new set of topics (as word IDs).

	Returns
	-------
	bool
	True if topics are different, False otherwise.
	"""
	# Compare topic arrays using numpy.array_equal for efficient comparison
	return (new_topics is not None
	and self._topics is not None
	and not np.array_equal(new_topics, self._topics))

	def _get_topics(self):
	"""
	Internal helper function to extract top words (as IDs) from a trained topic model.
	"""
	return self._get_topics_from_model(self.model, self.topn)

	@staticmethod
	def _get_topics_from_model(model, topn):
	"""
	Internal static method to extract top `topn` words (as IDs) from a trained topic model.

	Parameters
	----------
	model : :class:`~gensim.models.basemodel.BaseTopicModel`
	Pre-trained topic model (must implement `get_topics` method).
	topn : int
	Integer corresponding to the number of top words to extract.

	Returns
	-------
	list of :class:`numpy.ndarray`
	A list where each element is a numpy array of word IDs representing a topic's top words.

	Raises
	------
	AttributeError
	If the provided model does not implement a `get_topics` method.
	"""
	try:
	# Iterate over the topic distributions from the model
	# Use matutils.argsort to get the indices (word IDs) of the top `topn` words
	return [
	matutils.argsort(topic, topn=topn, reverse=True) for topic in
	model.get_topics()
	]
	except AttributeError:
	raise ValueError(
	"This topic model is not currently supported. Supported topic models"
	" should implement the `get_topics` method.")

	def segment_topics(self):
	"""
	Segments the current topics using the segmentation function defined by the
	chosen coherence measure (`self.measure.seg`).

	Returns
	-------
	list of list of tuple
	Segmented topics. The structure depends on the segmentation method (e.g., pairs of word IDs).
	"""
	# Apply the segmentation function from the pipeline to the current topics
	return self.measure.seg(self.topics)

	def estimate_probabilities(self, segmented_topics=None):
	"""
	Accumulates word occurrences and co-occurrences from texts or corpus
	using the optimal probability estimation method for the chosen coherence metric.
	This operation can be computationally intensive, especially for sliding window methods.

	Parameters
	----------
	segmented_topics : list of list of tuple, optional
	Segmented topics. If None, `self.segment_topics()` is called internally.

	Returns
	-------
	:class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator`
	An object that holds the accumulated statistics (word frequencies, co-occurrence frequencies).
	"""
	if segmented_topics is None:
	segmented_topics = self.segment_topics()

	# Choose the appropriate probability estimation method based on the coherence type
	if self.coherence in BOOLEAN_DOCUMENT_BASED:
	self._accumulator = self.measure.prob(self.corpus, segmented_topics)
	else:
	kwargs = dict(
	texts=self.texts, segmented_topics=segmented_topics,
	dictionary=self.dictionary, window_size=self.window_size,
	processes=self.processes)
	if self.coherence == 'c_w2v':
	kwargs['model'] = self.keyed_vectors # Pass keyed_vectors for word2vec based coherence

	self._accumulator = self.measure.prob(**kwargs)

	return self._accumulator

	def get_coherence_per_topic(self, segmented_topics=None, with_std=False, with_support=False):
	"""
	Calculates and returns a list of coherence values, one for each topic,
	based on the pipeline's confirmation measure.

	Parameters
	----------
	segmented_topics : list of list of tuple, optional
	Segmented topics. If None, `self.segment_topics()` is called internally.
	with_std : bool, optional
	If True, also includes the standard deviation across topic segment sets in addition
	to the mean coherence for each topic. Defaults to False.
	with_support : bool, optional
	If True, also includes the "support" (number of pairwise similarity comparisons)
	used to compute each topic's coherence. Defaults to False.

	Returns
	-------
	list of float or list of tuple
	A sequence of similarity measures for each topic.
	If `with_std` or `with_support` is True, each element in the list will be a tuple
	containing the coherence value and the requested additional statistics.
	"""
	measure = self.measure
	if segmented_topics is None:
	segmented_topics = measure.seg(self.topics)

	# Ensure probabilities are estimated before calculating coherence
	if self._accumulator is None:
	self.estimate_probabilities(segmented_topics)

	kwargs = dict(with_std=with_std, with_support=with_support)
	if self.coherence in BOOLEAN_DOCUMENT_BASED or self.coherence == 'c_w2v':
	# These coherence types don't require specific additional kwargs for confirmation measure
	pass
	elif self.coherence == 'c_v':
	# Specific kwargs for c_v's confirmation measure (cosine_similarity)
	kwargs['topics'] = self.topics
	kwargs['measure'] = 'nlr' # Normalized Log Ratio
	kwargs['gamma'] = 1
	else:
	# For c_uci and c_npmi, 'normalize' parameter is relevant
	kwargs['normalize'] = (self.coherence == 'c_npmi')

	return measure.conf(segmented_topics, self._accumulator, **kwargs)

	def aggregate_measures(self, topic_coherences):
	"""
	Aggregates the individual topic coherence measures into a single overall score
	using the pipeline's aggregation function (`self.measure.aggr`).

	Parameters
	----------
	topic_coherences : list of float
	List of coherence values for each topic.

	Returns
	-------
	float
	The aggregated coherence value (e.g., arithmetic mean).
	"""
	# Apply the aggregation function from the pipeline to the list of topic coherences
	return self.measure.aggr(topic_coherences)

	def get_coherence(self):
	"""
	Calculates and returns the overall coherence value for the entire set of topics.
	This is the main entry point for getting a single coherence score.

	Returns
	-------
	float
	The aggregated coherence value.
	"""
	# First, get coherence values for each individual topic
	confirmed_measures = self.get_coherence_per_topic()
	# Then, aggregate these topic-level coherences into a single score
	return self.aggregate_measures(confirmed_measures)

	def compare_models(self, models):
	"""
	Compares multiple topic models by their coherence values.
	It extracts topics from each model and then calls `compare_model_topics`.

	Parameters
	----------
	models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
	A sequence of topic models to compare.

	Returns
	-------
	list of (list of float, float)
	A sequence where each element is a pair:
	(list of average topic coherences for the model, overall model coherence).
	"""
	# Extract topics (as word IDs) for each model using the internal helper
	model_topics = [self._get_topics_from_model(model, self.topn) for model in models]
	# Delegate to compare_model_topics for the actual coherence comparison
	return self.compare_model_topics(model_topics)

	def compare_model_topics(self, model_topics):
	"""
	Performs coherence evaluation for each set of topics provided in `model_topics`.
	This method is designed to be efficient by precomputing probabilities once if needed,
	and then evaluating coherence for each set of topics.

	Parameters
	----------
	model_topics : list of list of list of int
	A list where each element is itself a list of topics (each topic being a list of word IDs)
	representing a set of topics (e.g., from a single model).

	Returns
	-------
	list of (list of float, float)
	A sequence where each element is a pair:
	(list of average topic coherences for the topic set, overall topic set coherence).

	Notes
	-----
	This method uses a heuristic of evaluating coherence at various `topn` values (e.g., 20, 15, 10, 5)
	and averaging the results for robustness, as suggested in some research.
	"""
	# Store original topics and topn to restore them after comparison
	orig_topics = self._topics
	orig_topn = self.topn

	try:
	# Perform the actual comparison
	coherences = self._compare_model_topics(model_topics)
	finally:
	# Ensure original topics and topn are restored even if an error occurs
	self.topics = orig_topics
	self.topn = orig_topn

	return coherences

	def _compare_model_topics(self, model_topics):
	"""
	Internal helper to get average topic and model coherences across multiple sets of topics.

	Parameters
	----------
	model_topics : list of list of list of int
	A list where each element is a set of topics (list of lists of word IDs).

	Returns
	-------
	list of (list of float, float)
	A sequence of pairs:
	(average topic coherences across different `topn` values for each topic,
	overall model coherence averaged across different `topn` values).
	"""
	coherences = []
	# Define a grid of `topn` values to evaluate coherence.
	# This provides a more robust average coherence value.
	# It goes from `self.topn` down to `min(self.topn - 1, 4)` in steps of -5.
	# e.g., if self.topn is 20, grid might be [20, 15, 10, 5].
	# The `min(self.topn - 1, 4)` ensures at least some lower values are included,
	# but also prevents trying `topn` values that are too small or negative.
	last_topn_value = min(self.topn - 1, 4)
	topn_grid = list(range(self.topn, last_topn_value, -5))
	if not topn_grid or max(topn_grid) < 1: # Ensure at least one valid topn if range is empty or too small
	topn_grid = [max(1, min(self.topn, 5))] # Use min of self.topn and 5, ensure at least 1

	for model_num, topics in enumerate(model_topics):
	# Set the current topics for the instance to the topics of the model being evaluated
	self.topics = topics

	coherence_at_n = {} # Dictionary to store coherence results for different `topn` values
	for n in topn_grid:
	self.topn = n # Set the `topn` for the current evaluation round
	topic_coherences = self.get_coherence_per_topic()

	# Handle NaN values in topic coherences by imputing with the mean
	filled_coherences = np.array(topic_coherences, dtype=float)
	# Check for NaN values and replace them with the mean of non-NaN values.
	# np.nanmean handles arrays with all NaNs gracefully by returning NaN.
	if np.any(np.isnan(filled_coherences)):
	mean_val = np.nanmean(filled_coherences)
	if np.isnan(mean_val): # If all are NaN, mean_val will also be NaN. In this case, replace with 0 or a very small number.
	filled_coherences[np.isnan(filled_coherences)] = 0.0 # Or another sensible default
	else:
	filled_coherences[np.isnan(filled_coherences)] = mean_val


	# Store the topic-level coherences and the aggregated (overall) coherence for this `topn`
	coherence_at_n[n] = (topic_coherences, self.aggregate_measures(filled_coherences))

	# Unpack the stored coherences for different `topn` values
	all_topic_coherences_at_n, all_avg_coherences_at_n = zip(*coherence_at_n.values())

	# Calculate the average topic coherence across all `topn` values
	# np.vstack stacks lists of topic coherences into a 2D array, then mean(0) computes mean for each topic.
	avg_topic_coherences = np.vstack(all_topic_coherences_at_n).mean(axis=0)

	# Calculate the overall model coherence by averaging the aggregated coherences from all `topn` values
	model_coherence = np.mean(all_avg_coherences_at_n)

	logging.info("Avg coherence for model %d: %.5f" % (model_num, model_coherence))
	coherences.append((avg_topic_coherences.tolist(), model_coherence)) # Convert numpy array back to list for output

	return coherences