Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

topic_modelling / funcs /presidio_analyzer_custom.py

Sonnyjim

Added clean data options, improved re-representation options and visualisation. General format changes

4effac0 5 months ago

raw

history blame

No virus

4.18 kB

	import gradio as gr
	from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
	from tqdm import tqdm

	from presidio_analyzer import DictAnalyzerResult, RecognizerResult, AnalyzerEngine
	from presidio_analyzer.nlp_engine import NlpArtifacts

	def analyze_iterator_custom(
	    self,
	    texts: Iterable[Union[str, bool, float, int]],
	    language: str,
	    list_length:int,
	    progress=gr.Progress(),
	    **kwargs,
	) -> List[List[RecognizerResult]]:
	    """
	    Analyse every item of an iterable, reporting progress as it goes.

	    :param texts: Iterable of values to analyse. It is passed through
	        `self._validate_types` before processing.
	    :param language: Input language
	    :param list_length: Number of items in `texts`; used as the total
	        for the progress tracker.
	    :param progress: Progress tracker whose `tqdm` method wraps the
	        analysis loop.
	    :param kwargs: Additional parameters forwarded to
	        `AnalyzerEngine.analyze` for every item.
	    :return: A list with one entry per processed text, each entry being
	        the value returned by `AnalyzerEngine.analyze` for that text.
	    """
	    validated_texts = self._validate_types(texts)
	    engine = self.analyzer_engine

	    # Hand the whole iterable to the NLP engine in one batch call
	    artifacts_stream: Iterator[Tuple[str, NlpArtifacts]] = (
	        engine.nlp_engine.process_batch(texts=validated_texts, language=language)
	    )

	    tracked_stream = progress.tqdm(
	        artifacts_stream,
	        total=list_length,
	        desc="Analysing text for personal information",
	        unit="rows",
	    )

	    return [
	        engine.analyze(
	            text=str(raw_text),
	            nlp_artifacts=artifacts,
	            language=language,
	            **kwargs,
	        )
	        for raw_text, artifacts in tracked_stream
	    ]

	def analyze_dict(
	    self,
	    input_dict: Dict[str, Union[Any, Iterable[Any]]],
	    language: str,
	    keys_to_skip: Optional[List[str]] = None,
	    **kwargs,
	) -> Iterator[DictAnalyzerResult]:
	    """
	    Analyze a dictionary of keys (strings) and values/iterable of values.

	    One `DictAnalyzerResult` is yielded per key. Falsy values and keys
	    listed in `keys_to_skip` are yielded unchanged with an empty result
	    list. Values of type str/int/bool/float are analysed as text, nested
	    dicts are analysed recursively, and any other iterable is analysed
	    item by item via `analyze_iterator_custom`.

	    :param input_dict: The input dictionary for analysis
	    :param language: Input language
	    :param keys_to_skip: Keys to ignore during analysis
	    :param kwargs: Additional keyword arguments
	        for the `AnalyzerEngine.analyze` method.
	        Use this to pass arguments to the analyze method,
	        such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
	        See `AnalyzerEngine.analyze` for the full list.
	    :raises ValueError: If a value is neither a supported scalar, a dict,
	        nor an iterable.
	    """

	    # `context` is taken out of kwargs because each branch below passes
	    # its own explicit `context=` argument alongside **kwargs.
	    context = kwargs.pop("context", [])

	    if not keys_to_skip:
	        keys_to_skip = []

	    for key, value in input_dict.items():
	        if not value or key in keys_to_skip:
	            yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
	            continue  # skip this key as requested

	        # Add the key as an additional context (on a copy, so the
	        # caller-supplied list is not mutated)
	        specific_context = context[:]
	        specific_context.append(key)

	        if type(value) in (str, int, bool, float):
	            # NOTE(review): only the key is passed as context here, while the
	            # nested branches pass the caller context plus the key — confirm
	            # whether dropping the caller context is intentional.
	            results: List[RecognizerResult] = self.analyzer_engine.analyze(
	                text=str(value), language=language, context=[key], **kwargs
	            )
	        elif isinstance(value, dict):
	            # Recursively analyse nested dicts
	            new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
	            results = self.analyze_dict(
	                input_dict=value,
	                language=language,
	                context=specific_context,
	                keys_to_skip=new_keys_to_skip,
	                **kwargs,
	            )
	        elif isinstance(value, Iterable):
	            # The progress total needs a length. Iterables without
	            # `__len__` (generators, map objects, ...) are materialised
	            # first instead of failing on len().
	            texts = value if hasattr(value, "__len__") else list(value)

	            results: List[List[RecognizerResult]] = analyze_iterator_custom(
	                self,
	                texts=texts,
	                language=language,
	                context=specific_context,
	                list_length=len(texts),
	                **kwargs,
	            )
	        else:
	            raise ValueError(f"type {type(value)} is unsupported.")

	        yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)