"""Find secret keys in text. # Code forked from # https://github.com/bigcode-project/pii-lib/blob/main/utils/keys_detection.py # under the Apache 2.0 License. """ import os import tempfile from typing import Iterator from detect_secrets import SecretsCollection from detect_secrets.settings import transient_settings from ..data.dataset_utils import lilac_span from ..schema import Item # Secrets detection with detect-secrets tool filters = [ # some filters from # https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters # were removed based on their targets { 'path': 'detect_secrets.filters.heuristic.is_potential_uuid' }, { 'path': 'detect_secrets.filters.heuristic.is_likely_id_string' }, { 'path': 'detect_secrets.filters.heuristic.is_templated_secret' }, { 'path': 'detect_secrets.filters.heuristic.is_sequential_string' }, ] plugins = [ { 'name': 'ArtifactoryDetector' }, { 'name': 'AWSKeyDetector' }, { 'name': 'AzureStorageKeyDetector' }, { 'name': 'CloudantDetector' }, { 'name': 'DiscordBotTokenDetector' }, { 'name': 'GitHubTokenDetector' }, { 'name': 'IbmCloudIamDetector' }, { 'name': 'IbmCosHmacDetector' }, { 'name': 'JwtTokenDetector' }, { 'name': 'MailchimpDetector' }, { 'name': 'NpmDetector' }, { 'name': 'SendGridDetector' }, { 'name': 'SlackDetector' }, { 'name': 'SoftlayerDetector' }, { 'name': 'StripeDetector' }, { 'name': 'TwilioKeyDetector' }, # remove 3 plugins for keyword # {'name': 'BasicAuthDetector'}, # {'name': 'KeywordDetector'}, # {'name': 'PrivateKeyDetector'}, ] def _is_hash(content: str, value: str) -> bool: """Second check if the value is a hash (after gibberish detector).""" try: index = content.index(value) except ValueError: return False lines = content[:index].splitlines() target_line = lines[-1] if len(value) in [32, 40, 64]: # if 'sha' or 'md5' are in content: keywords = ['sha', 'md5', 'hash', 'byte'] if any(x in target_line.lower() for x in keywords): return True return False def _file_has_hashes(content: str, coeff: float = 0.02) -> bool: """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines.""" lines = content.splitlines() count_sha = 0 count_hash = 0 nlines = content.count('\n') threshold = int(coeff * nlines) for line in lines: count_sha += line.lower().count('sha') count_hash += line.lower().count('hash') if count_sha > threshold or count_hash > threshold: return True return False def _get_indexes(text: str, value: str) -> list[tuple[int, int]]: string = text indexes: list[int] = [] new_start = 0 while True: try: start = string.index(value) indexes.append(new_start + start) new_start = new_start + start + len(value) string = text[new_start:] except ValueError: break return [(x, x + len(value)) for x in indexes] def find_secrets(content: str, suffix: str = '.txt') -> Iterator[Item]: """Detect secret keys in content using detect-secrets tool.""" fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w') fp.write(content) fp.close() secrets = SecretsCollection() with transient_settings({'plugins_used': plugins, 'filters_used': filters}): secrets.scan_file(fp.name) os.unlink(fp.name) secrets_set = list(secrets.data.values()) if not secrets_set: return for secret in secrets_set[0]: if not secret.secret_value: continue if _is_hash(content, secret.secret_value) or _file_has_hashes(content): continue indexes = _get_indexes(content, secret.secret_value) for start, end in indexes: yield lilac_span(start, end)