File size: 1,958 Bytes
e4f9cbe
 
 
 
 
 
 
 
e9a1c18
 
e4f9cbe
 
 
e9a1c18
 
e4f9cbe
 
 
 
 
 
 
 
 
e9a1c18
e4f9cbe
 
 
 
 
 
 
 
e9a1c18
 
 
 
 
e4f9cbe
 
 
 
 
 
 
 
e9a1c18
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
"""Detect personally identifiable information (emails, IP addresses, secrets) in text."""
import re
from typing import Iterable, Optional

from typing_extensions import override

from ..data.dataset_utils import lilac_span
from ..schema import Field, Item, RichData, SignalInputType, field
from .pii_ip_address import find_ip_addresses
from .pii_secrets import find_secrets
from .signal import TextSignal

# Keys of the item dict yielded by `PIISignal.compute` (and declared in `PIISignal.fields`).
EMAILS_KEY = 'emails'
IPS_KEY = 'ip_addresses'
SECRETS_KEY = 'secrets'

# Email-address pattern; its source describes it as an RFC 5322 email regex:
# https://uibakery.io/regex-library/email-regex-python
# The character classes are written in lowercase only; `re.IGNORECASE` makes them match
# uppercase letters as well.
# NOTE(review): the last bracketed class contains `\x21-\x5a\x53-\x7f`, whose two ranges
# overlap -- confirm against the upstream pattern before editing.
EMAIL_REGEX = re.compile(
  "(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])",
  re.IGNORECASE)


class PIISignal(TextSignal):
  """Find personally identifiable information (emails, phone numbers, secret keys, etc)."""
  # NOTE(review): the summary above mentions phone numbers, but `compute` only emits
  # emails, IP addresses and secrets.
  name = 'pii'
  display_name = 'Personal Information (PII)'

  input_type = SignalInputType.TEXT
  compute_type = SignalInputType.TEXT

  @override
  def fields(self) -> Field:
    """Declare the output schema: a list of string spans for each PII category."""
    pii_keys = (EMAILS_KEY, IPS_KEY, SECRETS_KEY)
    # A fresh ['string_span'] list is built per key so no list object is shared.
    return field(fields={key: ['string_span'] for key in pii_keys})

  @override
  def compute(self, data: Iterable[RichData]) -> Iterable[Optional[Item]]:
    """Yield one result per input, in order.

    Inputs that are not `str` produce `None`. String inputs produce a dict with the email
    spans matched by `EMAIL_REGEX`, plus whatever `find_ip_addresses` and `find_secrets`
    return for that text (materialized as lists).
    """
    for doc in data:
      if isinstance(doc, str):
        # Each regex match becomes a span covering the whole match (group 0).
        email_spans = []
        for match in EMAIL_REGEX.finditer(doc):
          email_spans.append(lilac_span(match.start(), match.end()))
        ip_spans = [*find_ip_addresses(doc)]
        secret_spans = [*find_secrets(doc)]
        yield {
          EMAILS_KEY: email_spans,
          IPS_KEY: ip_spans,
          SECRETS_KEY: secret_spans,
        }
      else:
        # Non-text values carry no PII result.
        yield None