File size: 3,835 Bytes
e9a1c18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""Find secret keys in text.

# Code forked from
# https://github.com/bigcode-project/pii-lib/blob/main/utils/keys_detection.py
# under the Apache 2.0 License.
"""
import os
import tempfile
from typing import Iterator

from detect_secrets import SecretsCollection
from detect_secrets.settings import transient_settings

from ..data.dataset_utils import lilac_span
from ..schema import Item

# Secrets detection with detect-secrets tool

# detect-secrets filters applied to every scan. Only a subset of the built-in filters listed at
# https://github.com/Yelp/detect-secrets/blob/master/docs/filters.md#built-in-filters
# is kept; the others were removed based on their targets.
filters = [
  {'path': filter_path} for filter_path in (
    'detect_secrets.filters.heuristic.is_potential_uuid',
    'detect_secrets.filters.heuristic.is_likely_id_string',
    'detect_secrets.filters.heuristic.is_templated_secret',
    'detect_secrets.filters.heuristic.is_sequential_string',
  )
]
# detect-secrets plugins enabled for every scan. Three plugins are intentionally not enabled
# (the original note reads "remove 3 plugins for keyword"): BasicAuthDetector, KeywordDetector
# and PrivateKeyDetector.
plugins = [
  {'name': detector_name} for detector_name in (
    'ArtifactoryDetector',
    'AWSKeyDetector',
    'AzureStorageKeyDetector',
    'CloudantDetector',
    'DiscordBotTokenDetector',
    'GitHubTokenDetector',
    'IbmCloudIamDetector',
    'IbmCosHmacDetector',
    'JwtTokenDetector',
    'MailchimpDetector',
    'NpmDetector',
    'SendGridDetector',
    'SlackDetector',
    'SoftlayerDetector',
    'StripeDetector',
    'TwilioKeyDetector',
  )
]


def _is_hash(content: str, value: str) -> bool:
  """Second check if the value is a hash (after gibberish detector)."""
  try:
    index = content.index(value)
  except ValueError:
    return False
  lines = content[:index].splitlines()
  target_line = lines[-1]
  if len(value) in [32, 40, 64]:
    # if 'sha' or 'md5' are in content:
    keywords = ['sha', 'md5', 'hash', 'byte']
    if any(x in target_line.lower() for x in keywords):
      return True
  return False


def _file_has_hashes(content: str, coeff: float = 0.02) -> bool:
  """Checks if the file contains literals 'hash' or 'sha' for more than 2% nb_of_lines."""
  lines = content.splitlines()
  count_sha = 0
  count_hash = 0
  nlines = content.count('\n')
  threshold = int(coeff * nlines)
  for line in lines:
    count_sha += line.lower().count('sha')
    count_hash += line.lower().count('hash')
    if count_sha > threshold or count_hash > threshold:
      return True
  return False


def _get_indexes(text: str, value: str) -> list[tuple[int, int]]:
  string = text
  indexes: list[int] = []
  new_start = 0
  while True:
    try:
      start = string.index(value)
      indexes.append(new_start + start)
      new_start = new_start + start + len(value)
      string = text[new_start:]
    except ValueError:
      break
  return [(x, x + len(value)) for x in indexes]


def find_secrets(content: str, suffix: str = '.txt') -> Iterator[Item]:
  """Detect secret keys in content using detect-secrets tool.

  The content is written to a temporary file, that file is scanned with the module-level
  `plugins` and `filters`, and each reported secret value is then located back in `content`.

  Args:
    content: The text to scan.
    suffix: Suffix given to the temporary file that is scanned.

  Yields:
    One `lilac_span(start, end)` per non-overlapping occurrence of each distinct detected
    secret value. Values that look like hashes are skipped, and nothing is yielded when the
    file-level hash heuristic (`_file_has_hashes`) fires.
  """
  fp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False, mode='w')
  try:
    with fp:
      fp.write(content)
    secrets = SecretsCollection()
    with transient_settings({'plugins_used': plugins, 'filters_used': filters}):
      secrets.scan_file(fp.name)
  finally:
    # Remove the temporary file even when writing or scanning raises.
    os.unlink(fp.name)

  secrets_set = list(secrets.data.values())
  if not secrets_set:
    return
  # The file-level heuristic depends only on `content`: evaluate it once rather than once per
  # secret. When it fires, every secret would be skipped anyway.
  if _file_has_hashes(content):
    return
  # Values already emitted; the same value reported more than once would otherwise yield
  # duplicate spans, since every occurrence is located for each report.
  seen_values: set[str] = set()
  for secret in secrets_set[0]:
    value = secret.secret_value
    if not value or value in seen_values:
      continue
    seen_values.add(value)
    if _is_hash(content, value):
      continue
    for start, end in _get_indexes(content, value):
      yield lilac_span(start, end)