victormiller
committed on
Commit
•
6084136
1
Parent(s):
b8195ee
Update web.py
Browse files
web.py
CHANGED
@@ -450,10 +450,11 @@ def web_data():
|
|
450 |
),
|
451 |
|
452 |
#DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
|
|
457 |
|
458 |
H4("1.3 URL Filtering"),
|
459 |
P("""
|
@@ -466,12 +467,18 @@ def web_data():
|
|
466 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
467 |
4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
|
468 |
"""),
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
-
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
471 |
P("""
|
472 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
473 |
"""),
|
|
|
474 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
|
|
475 |
DV(
|
476 |
"data/bad_url_doc.jsonl",
|
477 |
3,
|
|
|
450 |
),
|
451 |
|
452 |
#DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
453 |
+
|
454 |
+
Details(
|
455 |
+
Summary("English Documents Scoring Lower than 0.65"),
|
456 |
+
DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
|
457 |
+
),
|
458 |
|
459 |
H4("1.3 URL Filtering"),
|
460 |
P("""
|
|
|
467 |
articles, sex education, technical blogs, etc. Specifically, we randomly took 903M URLs and matched them with
|
468 |
4.6M domain names in the UT1 blocklist. 24 URL domains were detected with more than 4k matches, which are shown below.
|
469 |
"""),
|
470 |
+
|
471 |
+
Details(
|
472 |
+
Summary("24 URL domains with more than 4k matches"),
|
473 |
+
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
474 |
+
),
|
475 |
|
|
|
476 |
P("""
|
477 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
478 |
"""),
|
479 |
+
|
480 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
481 |
+
|
482 |
DV(
|
483 |
"data/bad_url_doc.jsonl",
|
484 |
3,
|