File size: 1,353 Bytes
492deb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Public API: only the Spark UDF wrappers are exported via ``import *``;
# the plain ``mask_*`` Python functions defined in this module are not listed.
__all__ = [
    "mask_email_udf", "mask_name_udf", "mask_date_udf",
    "mask_ssn_udf", "mask_itin_udf", "mask_phone_udf"
]

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re
from datetime import datetime

# Masking functions
def mask_email(value):
    """Mask the local part of an e-mail address, keeping only the domain.

    Args:
        value: The e-mail address to mask; may be ``None``.

    Returns:
        ``"***@<domain>"``, where the domain is everything after the last
        ``@``; ``None`` when *value* is empty, is not a string, or contains
        no ``@``.
    """
    if not isinstance(value, str) or "@" not in value:
        return None
    # Split on the LAST "@": unpacking ``value.split("@")`` into two names
    # raised ValueError for any value containing more than one "@".
    _, _, domain = value.rpartition("@")
    return "***@" + domain

def mask_name(value):
    """Reduce a name to its first character followed by ``***``.

    Args:
        value: The name to mask; may be ``None``.

    Returns:
        The first character of *value* plus ``"***"``, or ``None`` when
        *value* is falsy (``None`` or empty).
    """
    if not value:
        return None
    initial = value[0]
    return initial + "***"

def mask_date(value):
    """Mask the year and month of a ``YYYY-MM-DD`` date string.

    Args:
        value: A date string in ``%Y-%m-%d`` format; may be ``None``.

    Returns:
        ``"***-**-DD"`` with the zero-padded day of month, or ``None`` when
        *value* is not a string or does not parse as a valid date.
    """
    try:
        parsed = datetime.strptime(value, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: non-string input such as None.
        # ValueError: malformed or impossible date.
        # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
        return None
    return parsed.strftime("***-**-%d")

def mask_ssn(value):
    """Mask a US Social Security Number, keeping only the last four digits.

    Args:
        value: A string in ``NNN-NN-NNNN`` form; may be ``None``.

    Returns:
        ``"***-**-NNNN"`` when the whole of *value* matches the pattern,
        otherwise ``None`` (including for non-string input).
    """
    # fullmatch, not match: ``re.match`` only anchors the start, so input such
    # as "123-45-67890" or "123-45-6789 x" was accepted and ``value[-4:]``
    # then returned characters that are not the SSN's last four digits.
    if isinstance(value, str) and re.fullmatch(r"\d{3}-\d{2}-\d{4}", value):
        return "***-**-" + value[-4:]
    return None

def mask_itin(value):
    """Mask an ITIN-formatted number, keeping only the last four digits.

    Args:
        value: A string in ``9NN-7N-NNNN`` form; may be ``None``.

    Returns:
        ``"***-**-NNNN"`` when the whole of *value* matches the pattern,
        otherwise ``None`` (including for non-string input).
    """
    # NOTE(review): only middle groups 70-79 are accepted, as in the original
    # pattern; confirm this against the ITIN ranges the data may contain.
    # fullmatch, not match: with a start-only anchor, trailing characters were
    # accepted and ``value[-4:]`` could return text that is not the last four
    # digits of the number.
    if isinstance(value, str) and re.fullmatch(r"9\d{2}-7\d-\d{4}", value):
        return "***-**-" + value[-4:]
    return None

def mask_phone(value):
    """Mask a 10-digit phone number, keeping only the last four digits.

    Accepted shapes are three digits (optionally parenthesised), three
    digits and four digits, each group optionally separated by ``-``, ``.``
    or a whitespace character, e.g. ``(555) 123-4567`` or ``5551234567``.

    Args:
        value: The phone number string; may be ``None``.

    Returns:
        ``"***-***-NNNN"`` when the whole of *value* matches the pattern,
        otherwise ``None`` (including for non-string input).
    """
    # fullmatch, not match: with a start-only anchor, "555-123-4567 ext 89"
    # was accepted and ``value[-4:]`` returned "t 89" instead of digits.
    if isinstance(value, str) and re.fullmatch(
        r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", value
    ):
        return "***-***-" + value[-4:]
    return None

# UDFs for Spark: each plain-Python masking function above is wrapped with
# ``pyspark.sql.functions.udf`` and declared to return ``StringType``.
# These wrappers are the names exported through ``__all__``.
mask_email_udf = udf(mask_email, StringType())
mask_name_udf = udf(mask_name, StringType())
mask_date_udf = udf(mask_date, StringType())
mask_ssn_udf = udf(mask_ssn, StringType())
mask_itin_udf = udf(mask_itin, StringType())
mask_phone_udf = udf(mask_phone, StringType())