victormiller
committed on
Commit
•
6263148
1
Parent(s):
88c0211
Update web.py
Browse files
web.py
CHANGED
@@ -586,9 +586,12 @@ def web_data():
|
|
586 |
P("""
|
587 |
In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
|
588 |
Overview of all the quality signals that are used for filtering."""),
|
589 |
-
|
590 |
-
|
591 |
-
|
|
|
|
|
|
|
592 |
),
|
593 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
594 |
Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
|
@@ -636,10 +639,13 @@ def web_data():
|
|
636 |
ensures consistency with the overall document character count calculation.
|
637 |
"""),
|
638 |
H5("Our Implementation"),
|
639 |
-
|
640 |
-
"
|
641 |
-
|
642 |
-
|
|
|
|
|
|
|
643 |
),
|
644 |
H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
645 |
P("""
|
@@ -663,10 +669,13 @@ def web_data():
|
|
663 |
only once — tend to be short.
|
664 |
"""),
|
665 |
H5("Our Implementations"),
|
666 |
-
|
667 |
-
"
|
668 |
-
|
669 |
-
|
|
|
|
|
|
|
670 |
),
|
671 |
H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
672 |
P("""
|
@@ -710,10 +719,13 @@ def web_data():
|
|
710 |
works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
|
711 |
90% of lines start with a bullet point.
|
712 |
"""),
|
713 |
-
|
714 |
-
"
|
715 |
-
|
716 |
-
|
|
|
|
|
|
|
717 |
),
|
718 |
H4("3.3 Statistics-based Heuristics"),
|
719 |
P("""
|
@@ -806,7 +818,11 @@ def web_data():
|
|
806 |
Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
|
807 |
text.
|
808 |
"""),
|
809 |
-
|
|
|
|
|
|
|
|
|
810 |
H3("4. Deduplication"),
|
811 |
P("..."), # Add detailed content and images as needed
|
812 |
H3("5. PII Removal"),
|
|
|
586 |
P("""
|
587 |
In this section, we introduce all the quality signals that we have used to filter out low-quality documents.
|
588 |
Overview of all the quality signals that are used for filtering."""),
|
589 |
+
Details(
|
590 |
+
Summary("Overview of all the quality signals that are used for filtering"),
|
591 |
+
DVS(
|
592 |
+
json.load(open("data/all_signals.json")),
|
593 |
+
"Overview of all the quality signals that are used for filtering",
|
594 |
+
),
|
595 |
),
|
596 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
597 |
Most of these quality signals were initially introduced by Gopher [2] and subsequently adopted by later
|
|
|
639 |
ensures consistency with the overall document character count calculation.
|
640 |
"""),
|
641 |
H5("Our Implementation"),
|
642 |
+
Details(
|
643 |
+
Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
|
644 |
+
DV(
|
645 |
+
"data/repeat_line_frac.jsonl",
|
646 |
+
0,
|
647 |
+
"Sample documents filtered by excessive line repetitions / characters in repeated lines",
|
648 |
+
),
|
649 |
),
|
650 |
H5("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
651 |
P("""
|
|
|
669 |
only once — tend to be short.
|
670 |
"""),
|
671 |
H5("Our Implementations"),
|
672 |
+
Details(
|
673 |
+
Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
|
674 |
+
DV(
|
675 |
+
"data/sample_top_ngram.json",
|
676 |
+
0,
|
677 |
+
"Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
|
678 |
+
),
|
679 |
),
|
680 |
H5("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
681 |
P("""
|
|
|
719 |
works ([2], [3], [6]), we remove the documents if more than 30% of the lines end with an ellipsis or more than
|
720 |
90% of lines start with a bullet point.
|
721 |
"""),
|
722 |
+
Details(
|
723 |
+
Summary("Sample documents that are filtered out by line-wise heuristics"),
|
724 |
+
DV(
|
725 |
+
"data/line_info.json",
|
726 |
+
0,
|
727 |
+
"Sample documents that are filtered out by line-wise heuristics",
|
728 |
+
),
|
729 |
),
|
730 |
H4("3.3 Statistics-based Heuristics"),
|
731 |
P("""
|
|
|
818 |
Following C4, we remove any page where the phrase “lorem ipsum” appeared since some pages had placeholder “lorem ipsum”
|
819 |
text.
|
820 |
"""),
|
821 |
+
|
822 |
+
Details(
|
823 |
+
Summary("Sample documents containing 'lorem ipsum'"),
|
824 |
+
DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
|
825 |
+
),
|
826 |
H3("4. Deduplication"),
|
827 |
P("..."), # Add detailed content and images as needed
|
828 |
H3("5. PII Removal"),
|