Spaces:

lilacai
/

nikhil_staging

Runtime error

File size: 3,835 Bytes

e9a1c18

"""Find secret keys in text.

# Code forked from
# https://github.com/bigcode-project/pii-lib/blob/main/utils/keys_detection.py
# under the Apache 2.0 License.
"""
import os
import tempfile
from typing import Iterator

from detect_secrets import SecretsCollection
from detect_secrets.settings import transient_settings

from ..data.dataset_utils import lilac_span
from ..schema import Item

# Secrets detection with detect-secrets tool

# Filter configuration handed to detect-secrets as `filters_used`.
# Some of the built-in filters listed at
# https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters
# were removed based on their targets.
filters = [{
  'path': filter_path
} for filter_path in (
  'detect_secrets.filters.heuristic.is_potential_uuid',
  'detect_secrets.filters.heuristic.is_likely_id_string',
  'detect_secrets.filters.heuristic.is_templated_secret',
  'detect_secrets.filters.heuristic.is_sequential_string',
)]
# Plugin configuration handed to detect-secrets as `plugins_used`: one entry per detector name.
# Three plugins were removed for keyword: BasicAuthDetector, KeywordDetector and
# PrivateKeyDetector are intentionally not listed.
plugins = [{
  'name': detector_name
} for detector_name in (
  'ArtifactoryDetector',
  'AWSKeyDetector',
  'AzureStorageKeyDetector',
  'CloudantDetector',
  'DiscordBotTokenDetector',
  'GitHubTokenDetector',
  'IbmCloudIamDetector',
  'IbmCosHmacDetector',
  'JwtTokenDetector',
  'MailchimpDetector',
  'NpmDetector',
  'SendGridDetector',
  'SlackDetector',
  'SoftlayerDetector',
  'StripeDetector',
  'TwilioKeyDetector',
)]


def _is_hash(content: str, value: str) -> bool:
  """Second check if the value is a hash (after gibberish detector)."""
  try:
    index = content.index(value)
  except ValueError:
    return False
  lines = content[:index].splitlines()
  target_line = lines[-1]
  if len(value) in [32, 40, 64]:
    # if 'sha' or 'md5' are in content:
    keywords = ['sha', 'md5', 'hash', 'byte']
    if any(x in target_line.lower() for x in keywords):
      return True
  return False


def _file_has_hashes(content: str, coeff: float = 0.02) -> bool:
  """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines."""
  lines = content.splitlines()
  count_sha = 0
  count_hash = 0
  nlines = content.count('\n')
  threshold = int(coeff * nlines)
  for line in lines:
    count_sha += line.lower().count('sha')
    count_hash += line.lower().count('hash')
    if count_sha > threshold or count_hash > threshold:
      return True
  return False


def _get_indexes(text: str, value: str) -> list[tuple[int, int]]:
  string = text
  indexes: list[int] = []
  new_start = 0
  while True:
    try:
      start = string.index(value)
      indexes.append(new_start + start)
      new_start = new_start + start + len(value)
      string = text[new_start:]
    except ValueError:
      break
  return [(x, x + len(value)) for x in indexes]


def find_secrets(content: str, suffix: str = '.txt') -> Iterator[Item]:
  """Detect secret keys in content using detect-secrets tool.

  The content is written to a temporary file, which is scanned with the module-level
  `plugins` and `filters` and then deleted. Every occurrence in `content` of each detected
  secret value is yielded as a span.

  Args:
    content: The text to scan.
    suffix: Suffix given to the temporary file that is scanned.

  Yields:
    `lilac_span(start, end)` for each occurrence of each detected secret value. Nothing is
    yielded when `_file_has_hashes(content)` is true, and values for which `_is_hash` is true
    are skipped.
  """
  fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
  try:
    # Close the file before scanning so the scanner reads fully flushed content.
    with fp:
      fp.write(content)
    secrets = SecretsCollection()
    with transient_settings({'plugins_used': plugins, 'filters_used': filters}):
      secrets.scan_file(fp.name)
  finally:
    # `delete=False` means cleanup is ours: remove the file even if writing or scanning raises.
    os.unlink(fp.name)
  secrets_set = list(secrets.data.values())
  if not secrets_set:
    return
  # The file-level heuristic does not depend on the individual secret, so evaluate it once
  # instead of once per detected secret.
  if _file_has_hashes(content):
    return
  for secret in secrets_set[0]:
    value = secret.secret_value
    if not value or _is_hash(content, value):
      continue
    for start, end in _get_indexes(content, value):
      yield lilac_span(start, end)