Spaces:
Running
Running
victormiller
committed on
Update common.py
Browse files
common.py
CHANGED
@@ -37,6 +37,7 @@ dask.bag.from_sequence(doc_file_paths)
|
|
37 |
.map_partitions(make_doc_pairs)
|
38 |
.compute()
|
39 |
"""
|
|
|
40 |
|
41 |
global_div = Div(
|
42 |
Section(
|
@@ -97,7 +98,7 @@ global_div = Div(
|
|
97 |
H3("Removing PII"),
|
98 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
99 |
P("We have used the following regular expressions to identify and replace PII:"),
|
100 |
-
Ul(Li("Email:
|
101 |
),
|
102 |
Section(
|
103 |
H2("Normalization Form C (NFC)"),
|
|
|
37 |
.map_partitions(make_doc_pairs)
|
38 |
.compute()
|
39 |
"""
|
40 |
+
# Regular-expression source for email addresses, kept as a plain string so it
# can be displayed on the page as a code sample.
# The pattern is written as adjacent raw-string literals, which Python joins
# into a single string at compile time; the previous version had a stray
# leading quote that turned the literal into `"r"[...` and broke the module.
# NOTE(review): presumably this should mirror the pattern used by the PII
# removal step itself -- confirm against the pipeline code.
email_code = (
    r"[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:(?:[A-Za-z0-9](?:["
    r"A-Za-z0-9-]*[A-Za-z0-9])?\.)+[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|["
    r"01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[A-Za-z0-9-]*[A-Za-z0-9]:)])"
)
|
41 |
|
42 |
global_div = Div(
|
43 |
Section(
|
|
|
98 |
H3("Removing PII"),
|
99 |
P("We have removed two types of PII from the dataset: email address and IP address. Regular expressions are used to identify and replace these PII with a generic placeholder. Below is an example of how we removed email addresses from the dataset:"),
|
100 |
P("We have used the following regular expressions to identify and replace PII:"),
|
101 |
+
Ul(Li("Email:" + Pre(Code(email_code))),Li("IP Address: NEED TO UPDATE")),
|
102 |
),
|
103 |
Section(
|
104 |
H2("Normalization Form C (NFC)"),
|