victormiller
committed on
Update web.py
Browse files
web.py
CHANGED
@@ -750,16 +750,29 @@ def web_data():
|
|
750 |
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
751 |
),
|
752 |
H5("Word Count"),
|
753 |
-
|
754 |
-
|
755 |
-
|
756 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
757 |
|
|
|
|
|
|
|
|
|
|
|
|
|
758 |
Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
|
759 |
DataTrove employs a tokenizer to split texts into words and ignore punctuations, resulting in a higher
|
760 |
word count compared to simple `text.split()`.
|
761 |
We decided to use simple `len(text.split())` to compute the word count.
|
762 |
"""),
|
|
|
763 |
H5("Mean Word Length"),
|
764 |
P("""
|
765 |
There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
|
@@ -782,30 +795,97 @@ def web_data():
|
|
782 |
The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
|
783 |
to split text into sentences.
|
784 |
"""),
|
|
|
|
|
|
|
|
|
|
|
785 |
P("""
|
786 |
However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
|
787 |
we opted to use `nltk.tokenize.sent_tokenize` for more accurate sentence splitting.
|
788 |
"""),
|
|
|
|
|
|
|
|
|
|
|
|
|
789 |
H5("Symbol to Word Ratio"),
|
790 |
P("""
|
791 |
-
Implementations from Dolma
|
792 |
-
Implementations from RedPajama-V2
|
793 |
-
Implementations from DataTrove
|
794 |
-
|
795 |
Following RedPajama-V2 and DataTrove, we use the symbols of ("#", "...", "…").
|
796 |
We calculate the ratio as the number of symbols divided by the total number of words.
|
797 |
"""),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
798 |
H5("Fraction of Alphabetic Words"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
799 |
P("""
|
800 |
-
Implementations from Dolma
|
801 |
-
Implementations from RedPajama-V2
|
802 |
-
Implementations from DataTrove
|
803 |
-
|
804 |
Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
|
805 |
RedPajama-V2 employs regular expressions for this purpose. We opt to use regular expressions since `char.isalpha()`
|
806 |
can also match words in other languages as long as they are not punctuations.
|
807 |
"""),
|
808 |
-
Img(),
|
809 |
H5("Number of Stop Words"),
|
810 |
P("""
|
811 |
The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
|
@@ -841,11 +921,11 @@ def web_data():
|
|
841 |
P("""
|
842 |
After careful filtering, although data quality has improved, a large fraction of the content is repeated across documents. This may be due to the crawler indirectly hitting the same page multiple times, to boilerplate content being repeated (e.g., licences), or even to plagiarism. These duplicates can strongly impact models, favoring memorization instead of generalization.
|
843 |
"""), # Add detailed content and images as needed
|
844 |
-
P("We perform two-level deduplication: local exact deduplication and global fuzzy deduplication")
|
845 |
-
P(B("Local Exact Deduplication"))
|
846 |
-
P("To reduce the expensive cost of global deduplication, we apply a local exact deduplication before it. Specifically, each dump is split into 70 splits. A bloom filter is applied within each split.")
|
847 |
-
P(B("Global Fuzzy Deduplication"))
|
848 |
-
P("NEED TO UPDATE")
|
849 |
H3("5. PII Removal"),
|
850 |
P("..."), # Add detailed content and images as needed
|
851 |
H2("Reference"),
|
|
|
750 |
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
751 |
),
|
752 |
H5("Word Count"),
|
753 |
+
Details(
|
754 |
+
Summary("Implementations from Dolma"),
|
755 |
+
D_code("""
|
756 |
+
""", block="block", language="python"),
|
757 |
+
),
|
758 |
+
Details(
|
759 |
+
Summary("Implementations from RedPajama-V2"),
|
760 |
+
D_code("""
|
761 |
+
""", block="block", language="python"),
|
762 |
+
),
|
763 |
|
764 |
+
Details(
|
765 |
+
Summary("Implementations from DataTrove"),
|
766 |
+
D_code("""
|
767 |
+
""", block="block", language="python"),
|
768 |
+
),
|
769 |
+
P("""
|
770 |
Both Dolma and RedPajama V2 split texts into words using white spaces and newline symbols. However,
|
771 |
DataTrove employs a tokenizer to split texts into words and ignore punctuations, resulting in a higher
|
772 |
word count compared to simple `text.split()`.
|
773 |
We decided to use simple `len(text.split())` to compute the word count.
|
774 |
"""),
|
775 |
+
|
776 |
H5("Mean Word Length"),
|
777 |
P("""
|
778 |
There is minimal variation among existing pipeline implementations. We simply compute the mean word length as follows:
|
|
|
795 |
The only publicly available implementation of this quality signal is from RedPajama V2, which uses regular expressions
|
796 |
to split text into sentences.
|
797 |
"""),
|
798 |
+
Details(
|
799 |
+
Summary("Implementations from RedPajama-V2"),
|
800 |
+
D_code("""
|
801 |
+
""", block="block", language="python"),
|
802 |
+
),
|
803 |
P("""
|
804 |
However, we found that this approach can mistakenly interpret periods in URLs as sentence endings. To address this,
|
805 |
we opted to use `nltk.tokenize.sent_tokenize` for more accurate sentence splitting.
|
806 |
"""),
|
807 |
+
Details(
|
808 |
+
Summary("TxT360 Implementation"),
|
809 |
+
D_code("""
|
810 |
+
""", block="block", language="python"),
|
811 |
+
),
|
812 |
+
|
813 |
H5("Symbol to Word Ratio"),
|
814 |
P("""
|
|
|
|
|
|
|
|
|
815 |
Following RedPajama-V2 and DataTrove, we use the symbols of ("#", "...", "…").
|
816 |
We calculate the ratio as the number of symbols divided by the total number of words.
|
817 |
"""),
|
818 |
+
Details(
|
819 |
+
Summary("Implementations from Dolma"),
|
820 |
+
D_code("""
|
821 |
+
""", block="block", language="python"),
|
822 |
+
),
|
823 |
+
Details(
|
824 |
+
Summary("Implementations from RedPajama-V2"),
|
825 |
+
D_code("""
|
826 |
+
""", block="block", language="python"),
|
827 |
+
),
|
828 |
+
|
829 |
+
Details(
|
830 |
+
Summary("Implementations from DataTrove"),
|
831 |
+
D_code("""
|
832 |
+
""", block="block", language="python"),
|
833 |
+
),
|
834 |
+
Details(
|
835 |
+
Summary("TxT360 Implementation"),
|
836 |
+
D_code("""
|
837 |
+
""", block="block", language="python"),
|
838 |
+
),
|
839 |
+
|
840 |
H5("Fraction of Alphabetic Words"),
|
841 |
+
Details(
|
842 |
+
Summary("Implementations from Dolma"),
|
843 |
+
D_code("""
|
844 |
+
attrs.fraction_of_words_with_alpha_character = sum(
|
845 |
+
1 for word in words if any(c.isalpha() for c in word)
|
846 |
+
) / max(word_count, 1)
|
847 |
+
""", block="block", language="python"),
|
848 |
+
),
|
849 |
+
Details(
|
850 |
+
Summary("Implementations from RedPajama-V2"),
|
851 |
+
D_code("""
|
852 |
+
class RPS_Doc_Frac_No_Alph_Words(RPSBase): # noqa
|
853 |
+
ALPH_REGEX = re.compile(r"[a-zA-Z]")
|
854 |
+
|
855 |
+
__slots__ = ()
|
856 |
+
|
857 |
+
def __call__(self, document: Document) -> SignalType:
|
858 |
+
num_words = document.num_raw_words
|
859 |
+
|
860 |
+
if num_words == 0:
|
861 |
+
return [(0, len(document), None)]
|
862 |
+
|
863 |
+
num_words_with_alpha = float(sum(
|
864 |
+
int(self.ALPH_REGEX.search(word) is not None)
|
865 |
+
for word in document.raw_words
|
866 |
+
))
|
867 |
+
|
868 |
+
score = 1.0 - num_words_with_alpha / num_words
|
869 |
+
score = round(score, PRECISION)
|
870 |
+
return [(0, len(document), score)]
|
871 |
+
""", block="block", language="python"),
|
872 |
+
),
|
873 |
+
Details(
|
874 |
+
Summary("Implementations from DataTrove"),
|
875 |
+
D_code("""
|
876 |
+
# that 80 % of words in a document contain at least one alphabetic character
|
877 |
+
if (
|
878 |
+
self.max_non_alpha_words_ratio
|
879 |
+
and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
|
880 |
+
):
|
881 |
+
return False, "gopher_below_alpha_threshold"
|
882 |
+
""", block="block", language="python"),
|
883 |
+
),
|
884 |
P("""
|
|
|
|
|
|
|
|
|
885 |
Both Dolma and DataTrove use `char.isalpha()` to detect whether a word contains alphabetic characters while
|
886 |
RedPajama-V2 employs regular expressions for this purpose. We opt to use regular expressions since `char.isalpha()`
|
887 |
can also match words in other languages as long as they are not punctuations.
|
888 |
"""),
|
|
|
889 |
H5("Number of Stop Words"),
|
890 |
P("""
|
891 |
The implementations across existing pipelines are largely identical. We adopt them and apply them to our pipeline.
|
|
|
921 |
P("""
|
922 |
After careful filtering, although data quality has improved, a large fraction of the content is repeated across documents. This may be due to the crawler indirectly hitting the same page multiple times, to boilerplate content being repeated (e.g., licences), or even to plagiarism. These duplicates can strongly impact models, favoring memorization instead of generalization.
|
923 |
"""), # Add detailed content and images as needed
|
924 |
+
P("We perform two-level deduplication: local exact deduplication and global fuzzy deduplication"),
|
925 |
+
P(B("Local Exact Deduplication")),
|
926 |
+
P("To reduce the expensive cost of global deduplication, we apply a local exact deduplication before it. Specifically, each dump is split into 70 splits. A bloom filter is applied within each split."),
|
927 |
+
P(B("Global Fuzzy Deduplication")),
|
928 |
+
P("NEED TO UPDATE"),
|
929 |
H3("5. PII Removal"),
|
930 |
P("..."), # Add detailed content and images as needed
|
931 |
H2("Reference"),
|