victormiller
committed on
Commit
•
11009b9
1
Parent(s):
c6e3afb
Update common.py
Browse files
common.py
CHANGED
@@ -37,7 +37,12 @@ dask.bag.from_sequence(doc_file_paths)
|
|
37 |
.map_partitions(make_doc_pairs)
|
38 |
.compute()
|
39 |
"""
|
40 |
-
email_code = "
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
global_div = Div(
|
43 |
Section(
|
@@ -98,7 +103,7 @@ global_div = Div(
|
|
98 |
H3("Removing PII"),
|
99 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
100 |
P("We have used the following regular expressions to identify and replace PII:"),
|
101 |
-
Ul(Li("Email:
|
102 |
),
|
103 |
Section(
|
104 |
H2("Normalization Form C (NFC)"),
|
|
|
37 |
.map_partitions(make_doc_pairs)
|
38 |
.compute()
|
39 |
"""
|
40 |
+
# Display text for the email-matching regular expression; it is rendered as a
# list item next to the "Email:" label rather than compiled or executed here.
# NOTE(review): the text still contains `"r"` / `" r"` fragments and an
# unbalanced leading `r"`, which look like leftovers from adjacent raw-string
# literals being pasted together -- confirm against the regex actually used
# for PII removal and clean the snippet up.
email_code = """
|
41 |
+
r"[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9]
|
42 |
+
(?:["r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25
|
43 |
+
[0-5]|2[0-4][0-9]|[" r"01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
|
44 |
+
|[A-Za-z0-9-]*[A-Za-z0-9]:)])
|
45 |
+
"""
|
46 |
|
47 |
global_div = Div(
|
48 |
Section(
|
|
|
103 |
H3("Removing PII"),
|
104 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
105 |
P("We have used the following regular expressions to identify and replace PII:"),
|
106 |
+
# Bullet list of the PII patterns: an "Email:" label, the regex text itself
# (rendered without a bullet marker), and an IP-address entry whose text is
# still a placeholder. A comma separates every child; the original omitted the
# one before the final Li(...), which made the whole expression a SyntaxError.
Ul(Li("Email:"), Li(email_code, style="list-style-type: none"), Li("IP Address: NEED TO UPDATE")),
|
107 |
),
|
108 |
Section(
|
109 |
H2("Normalization Form C (NFC)"),
|