omwdataset

Runtime error

App Files Files Community

CarisMu committed on Oct 3

Commit

2767124

•

1 Parent(s): e52677f

Update web.py

Browse files

Files changed (1) hide show

web.py +573 -448

web.py CHANGED Viewed

@@ -319,7 +319,10 @@ def web_data():
         Details(
             Summary("Non-English Documents"),
-            DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
@@ -332,7 +335,10 @@ def web_data():
         Details(
             Summary("English Documents Scoring Lower than 0.65"),
-            DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -355,7 +361,10 @@ def web_data():
         Details(
             Summary("24 URL domains with more than 4k matches"),
-            DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
@@ -369,7 +378,10 @@ def web_data():
         """),
         Details(
             Summary("6 url domains that are removed from the blocklist"),
-            DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
@@ -380,11 +392,13 @@ def web_data():
         Details(
             Summary("Sample documents whose urls are blocked by the refined url blocklist"),
-            DV(
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
-            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
@@ -400,9 +414,12 @@ def web_data():
         Details(
             Summary("curated url domains that are excluded from our dataset"),
-            DVS(
                 non_web_urls,
                 "curated url domains that are excluded from our dataset",
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
@@ -414,7 +431,10 @@ def web_data():
         Details(
             Summary("Sample documents whose urls are in our curated url domain list"),
-            DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -444,11 +464,14 @@ def web_data():
         Details(
             Summary("Sample documents with lines that are removed by the rule of terminal punctuation"),
-            DV(
-            "data/sample_terminal_punc.json",
-            0,
-            "Sample documents with lines that are removed by the rule of terminal punctuation",
-            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
@@ -471,10 +494,13 @@ def web_data():
         """),
         Details(
             Summary("Sample documents that are removed by original C4 javascript rule but are kept after our refinement"),
-            DV(
                 "data/sample_java.jsonl",
                 0,
                 "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
@@ -495,10 +521,13 @@ def web_data():
         ),
         Details(
             Summary("Sample documents with lines that are removed by the RefinedWeb rules"),
-            DV(
                 "data/sample_refinedweb_line.json",
                 0,
                 "Sample documents with lines that are removed by the RefinedWeb rules",
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
@@ -517,9 +546,12 @@ def web_data():
         """),
         Details(
             Summary("Sample documents with toxic lines"),
-            DVS(
                 json.load(open("data/toxic_lines.json")),
                 "Sample documents with toxic lines",
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
@@ -535,9 +567,12 @@ def web_data():
         """),
         Details(
             Summary("Overview of all the quality signals that are used for filtering"),
-            DVS(
                 json.load(open("data/all_signals.json")),
                 "Overview of all the quality signals that are used for filtering",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -567,22 +602,25 @@ def web_data():
         """),
         Details(
             Summary("Implementations from Dolma"),
-            D_code("""
-            words = text.split()
-            word_count = len(words)
-            character_count = sum(len(word) for word in words)
-            ...
-            lines = text.split("\n")
-            line_count = len(lines)
-            ...
-            line_counts = Counter(lines)
-            attrs.fraction_of_duplicate_lines = sum(count for line, count in line_counts.items() if count > 1) / max(
-                line_count, 1
-            )
-            attrs.fraction_of_characters_in_duplicate_lines = sum(
-                len(line) * count for line, count in line_counts.items() if count > 1
-            ) / max(character_count, 1)
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -592,37 +630,40 @@ def web_data():
         ),
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            def find_duplicates(x: list[str]) -> tuple[int, int]:
-                unique_x = set()
-                duplicate_chars = 0
-                duplicate_elements = 0
-                for element in x:
-                    if element in unique_x:
-                        duplicate_chars += len(element)
-                        duplicate_elements += 1
-                    else:
-                        unique_x.add(element)
-                return duplicate_elements, duplicate_chars
-            ...
-            self.paragraph_exp = re.compile(r"\n{2,}")
-            self._line_splitter = re.compile("\n+")
-            ...
-            paragraphs = self.paragraph_exp.split(text.strip())
-            paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
-            if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
-                return False, "dup_para_frac"
-            if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
-                return False, "dup_para_char_frac"
-            lines = self._line_splitter.split(text)
-            line_duplicates, char_duplicates = find_duplicates(lines)
-            if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
-                return False, "dup_line_frac"
-            if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
-                return False, "dup_line_char_frac"
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -654,22 +695,25 @@ def web_data():
         H3("TxT360 Implementation"),
         Details(
             Summary("TxT360 Implementation"),
-            D_code("""
-            words = text.split()
-            word_count = len(words)
-            character_count = sum(len(word) for word in words)
-            ...
-            lines = text.split("\n")
-            line_count = len(lines)
-            line_counts = Counter(lines)
-            attrs.fraction_of_duplicate_lines = (
-                sum((count - 1) for line, count in line_counts.items() if count > 1) / line_count
-            )
-            attrs.fraction_of_characters_in_duplicate_lines = (
-                sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
-                line_counts.items() if count > 1) / character_count
-            """, block="block", language="python"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -679,10 +723,13 @@ def web_data():
         ),
         Details(
             Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
-            DV(
                 "data/repeat_line_frac.jsonl",
                 0,
                 "Sample documents filtered by excessive line repetitions / characters in repeated lines",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -698,21 +745,24 @@ def web_data():
         """),
         Details(
             Summary("Implementations from Dolma"),
-            D_code("""
-            def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
-                return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
-            ...
-            all_counts = all_ngram_counts(words)
-            count_most_common_ngrams = (2, 3, 4)
-            for n, ngram_counts in all_counts:
-                if not ngram_counts:
-                    continue
-                if n in count_most_common_ngrams:
-                    most_common_ngram, count = ngram_counts.most_common(1)[0]
-                    value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
-                    attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -722,7 +772,8 @@ def web_data():
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
                 class Base_RPS_Frac_Chars_In_Top_NGram(RPSBase):  # noqa
                     ## Base class for calculating the fraction of characters in the top N-gram. This operates on the lower-cased, punctation removed content.
                     NGRAM_SIZE: int = None
@@ -756,7 +807,9 @@ def web_data():
                         score = sum(len(w) for w in ngram) * count / total_chars
                         score = round(score, PRECISION)
                         return [(0, len(document), score)]
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -767,25 +820,28 @@ def web_data():
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            def get_n_grams(words: list[str], n: int) -> list[str]:
-                return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
-            def find_top_duplicate(x: list[str]) -> int:
-                counter = Counter()
-                for element in x:
-                    counter[element] += 1
-                top_n_gram = counter.most_common(1)[0]
-                return len(top_n_gram[0]) * top_n_gram[1]
-            ...
-            for n, n_frac in self.top_n_grams:
-                n_grams = get_n_grams(words, n)
-                if not n_grams:
-                    continue
-                top_char_length = find_top_duplicate(n_grams)
-                if top_char_length / len(text) > n_frac:
-                    return False, f"top_n_gram"
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -805,20 +861,23 @@ def web_data():
         """),
         Details(
             Summary("TxT360 Implementation"),
-            D_code("""
-            def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
-                return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
-            ...
-            all_counts = all_ngram_counts_new(words)
-            count_most_common_ngrams = (2, 3, 4)
-            for n, ngram_counts in all_counts:
-                if not ngram_counts:
-                    continue
-                if n in count_most_common_ngrams:
-                    most_common_ngram, count = Counter(ngram_counts).most_common(1)[0]
-                    value = count * sum(len(w) for w in most_common_ngram) / character_count
-                    attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
-            """, block="block", language="python"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -828,10 +887,13 @@ def web_data():
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
-            DV(
                 "data/sample_top_ngram.json",
                 0,
                 "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -848,23 +910,26 @@ def web_data():
         """),
         Details(
             Summary("Implementations from Dolma"),
-            D_code("""
-            def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
-                return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
-            ...
-            all_counts = all_ngram_counts(words)
-            for n, ngram_counts in all_counts:
-                if not ngram_counts:
-                    continue
-                if n in count_most_common_ngrams:
-                    ...
-                else:
-                    ng_char_count = sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items())
-                    value = sum(
-                        count * sum(len(w) for w in ng) for ng, count in ngram_counts.items() if count > 1
-                    ) / max(ng_char_count, 1)
-                    attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -874,56 +939,59 @@ def web_data():
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
-            class Base_RPS_Frac_Chars_In_Dupe_NGrams(RPSBase):  # noqa
-                ## Base class for calculating the fraction of characters in duplicate word N-grams. This operates on the lower-cased, punctation removed content. The function also ensures that characters in overlapping ngrams are only counted once.
-                NGRAM_SIZE: int = None
-                __slots__ = []
-                def __call__(self, document: Document) -> SignalType:
-                    if self.NGRAM_SIZE is None:
-                        raise NotImplementedError(
-                            "NGRAM_SIZE must be set in the subclass"
                         )
-                    if len(document.normalized_words) < self.NGRAM_SIZE:
-                        return [(0, len(document), 0.0)]
-                    # fetch the ngrams from the document if they exist, otherwise
-                    # compute them
-                    doc_n_grams = (
-                            getattr(document, f"norm_self.NGRAM_SIZEgrams", None)
-                            or
-                            tuple(form_ngrams(
-                                iter(document.normalized_words), self.NGRAM_SIZE
-                            ))
-                    )
-                    # keep only ngrams which occur at least twice
-                    ngram_dupes =
-                        ngram for ngram, count in Counter(doc_n_grams).items() if count > 1
-                    duplicated_grams = np.zeros(len(document.normalized_words), dtype=int)
-                    i = 0
-                    for ngram in doc_n_grams:
-                        if ngram in ngram_dupes:
-                            duplicated_grams[i: i + self.NGRAM_SIZE] = 1
-                        i += 1
-                    word_lengths = np.array(list(map(len, document.normalized_words)))
-                    chars_duped = np.sum(word_lengths * duplicated_grams)
-                    total_chars = np.sum(word_lengths)
-                    if total_chars == 0:
-                        return [(0, len(document), 0.0)]
-                    score = float(chars_duped / total_chars)
-                    score = round(score, PRECISION)
-                    return [(0, len(document), score)]
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -934,27 +1002,30 @@ def web_data():
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            def find_all_duplicate(words: list[str], n: int) -> int:
-                n_words = len(words)
-                unique = set()
-                repeated_chars, idx = 0, 0
-                while idx < n_words - n + 1:
-                    n_gram = "".join(words[idx : idx + n])
-                    if n_gram in unique:
-                        repeated_chars += len(n_gram)
-                        idx += n
-                    else:
-                        unique.add(n_gram)
-                        idx += 1
-                assert repeated_chars <= len("".join(words))
-                return repeated_chars
-            ...
-            for n, n_frac in self.dup_n_grams:
-                n_duplicates_char = find_all_duplicate(words, n)
-                if n_duplicates_char / len(text) > n_frac:
-                    return False, f"duplicated_n_grams"
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -979,41 +1050,44 @@ def web_data():
         """),
         Details(
             Summary("TxT360 Implementation"),
-            D_code("""
-            def get_dup_ngram_frac(n, doc_n_grams, text):
-                # fetch the ngrams from the document if they exist, otherwise compute them
-                # doc_n_grams = list(zip(*[words[i:] for i in range(n)]))
-                duplicated_grams = np.zeros(len(text.split()), dtype=int)
-                unique_ngrams = set()
-                for i, ngram in enumerate(doc_n_grams):
-                    if ngram in unique_ngrams:
-                        duplicated_grams[i: i + n] = 1
                     else:
-                        unique_ngrams.add(ngram)
-                word_lengths = np.array(list(map(len, text.split())))
-                chars_duped = np.sum(word_lengths * duplicated_grams)
-                total_chars = np.sum(word_lengths)
-                return float(chars_duped / total_chars)
-            def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
-                return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
-            ...
-            all_counts = all_ngram_counts_new(words)
-            count_most_common_ngrams = (2, 3, 4)
-            for n, ngram_counts in all_counts:
-                if not ngram_counts:
-                    continue
-                if n in count_most_common_ngrams:
-                    ...
-                else:
-                    score = get_dup_ngram_frac(n, ngram_counts, text)
-                    attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
-            """, block="block", language="python"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -1046,10 +1120,13 @@ def web_data():
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)"),
-            DV(
                 "data/sample_dup_ngram.json",
                 0,
                 "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -1067,22 +1144,25 @@ def web_data():
         """),
         Details(
             Summary("Ellipsis Symbol Identification Implemetations"),
-            P("Dolma: "),
-            D_code("""
-            ELLIPSIS_SYMBOLS = ("…")
-            """, block="block", language="python"),
-            P("RedPajamaV2: "),
-            D_code("""
-            ELLIPSIS_SYMBOLS = ("...", "…")
-            """, block="block", language="python"),
-            P("DataTrove: "),
-            D_code("""
-            ELLIPSIS_SYMBOLS = ("...", "…")
-            """, block="block", language="python"),
-            P("TxT360: "),
-            D_code("""
-            ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1092,47 +1172,50 @@ def web_data():
         ),
         Details(
             Summary("Bullet Point Identification Implemetations"),
-            P("Dolma: "),
-            D_code("""
-            BULLET_POINTS = ("*", "-"
-            """, block="block", language="python"),
-            P("RedPajamaV2: "),
-            D_code("""
-            BULLET_POINT_SYMBOLS = (
-                "•",  # bullet point
-                "‣",  # triangular bullet point
-                "▶",  # black right pointing triangle
-                "◀",  # black left pointing triangle
-                "◦",  # white bullet point
-                "■",  # black square
-                "□",  # white square
-                "▪",  # black small square
-                "▫",  # white small square
-                "–",  # en dash
-            )
-            """, block="block", language="python"),
-            P("DataTrove: "),
-            D_code("""
-            BULLET_POINT_SYMBOLS = ("•" , "-")
-            """, block="block", language="python"),
-            P("TxT360: "),
-            D_code("""
-            BULLET_POINT_SYMBOLS = (
-                "•",  # • bullet point
-                "‣",  # ‣ triangular bullet point
-                "▶",  # ▶ black right pointing triangle
-                "◀",  # ◀ black left pointing triangle
-                "◦",  # ◦ white bullet point
-                "■",  # ■ black square
-                "□",  # □ white square
-                "▪",  # ▪ black small square
-                "▫",  # ▫ white small square
-                "-",  # - en dash
-                "–",  # – dash
-                "—",  # — zh dash
-                "*",  # * star
-            )
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1144,10 +1227,13 @@ def web_data():
         Details(
             Summary("Sample documents that are filtered out by line-wise heuristics"),
-            DV(
                 "data/line_info.json",
                 0,
                 "Sample documents that are filtered out by line-wise heuristics",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -1186,35 +1272,38 @@ def web_data():
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
-            # the normalized content: lowercased and punctuation removed
-            self._normalized_content = normalize(content)
-            self._normalized_words = tuple(self._normalized_content.split())
-            self._num_normalized_words = len(self._normalized_words)
-            ...
-            def normalize(
-                   text: str,
-                   remove_punct: bool = True,
-                   lowercase: bool = True,
-                   nfd_unicode: bool = True,
-                   white_space: bool = True
-            ) -> str:
-               #Normalize the text by lowercasing and removing punctuation.
-               # remove punctuation
-               if remove_punct:
-                   text = text.translate(TRANSLATION_TABLE_PUNCTUATION)
-               # lowercase
-               if lowercase:
-                   text = text.lower()
-               if white_space:
-                   text = text.strip()
-                   text = re.sub(r"\s+", " ", text)
-               # NFD unicode normalization
-               if nfd_unicode:
-                   text = unicodedata.normalize("NFD", text)
-               return text
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1225,13 +1314,16 @@ def web_data():
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            words = self.tokenizer.word_tokenize(text)
-            n_words = len(words)
-            non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
-            n_non_symbol_words_words = len(non_symbol_words)
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1270,18 +1362,21 @@ def web_data():
         """),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
-            class RPS_Doc_Num_Sentences(RPSBase):  # noqa
-             ##The number of sentences in the content. This is calculated using the regex r'[^.!?]+[.!?]*'
-            SENT_PATTERN = re.compile(r'[^.!?]+[.!?]*', flags=re.UNICODE)
-            __slots__ = ()
-            def __call__(self, document: Document) -> SignalType:
-                ##count the number of sentences in the content using regex
-                score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
-                return [(0, len(document), score)]
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1295,15 +1390,18 @@ def web_data():
         """),
         Details(
             Summary("TxT360 Implementation"),
-            D_code("""
-            from nltk.tokenize import sent_tokenize
-            ...
-            def count_sentences(text):
-                sentences = sent_tokenize(text)
-                return len(sentences)
-            ...
-            attrs.num_of_sentences = count_sentences(text)
-            """, block="block", language="python"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -1319,13 +1417,16 @@ def web_data():
         """),
         Details(
             Summary("Implementations from Dolma"),
-            D_code("""
-            SYMBOLS = ("#", "…")
-            ...
-            attrs.symbol_to_word_ratio = sum(1 for word in words if any(s in word for s in SYMBOLS)) / max(
-                        word_count, 1
-                    )
-            """, block="block", language="python"),
              style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1335,29 +1436,32 @@ def web_data():
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
-            class RPS_Doc_Symbol_To_Word_Ratio(RPSBase):  # noqa
-    ##The ratio of symbols to words in the content. This is analogous to
-    ##the signal used in Gopher. Symbols are defined "#", "...", and "…".
-                SYMBOLS = ("#", "...", "…")
-                __slots__ = ()
-                def __call__(self, document: Document) -> SignalType:
-                    num_words = document.num_raw_words
-                    if num_words == 0:
-                        return [(0, len(document), None)]
-                    # count the number of symbols in the content
-                    num_symbols = float(sum(
-                        document.raw_content.count(x) for x in self.SYMBOLS
-                    ))
-                    score = num_symbols / num_words
-                    score = round(score, PRECISION)
-                    return [(0, len(document), score)]
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1368,12 +1472,15 @@ def web_data():
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            if self.max_symbol_word_ratio and text.count("#") / n_words > self.max_symbol_word_ratio:
-                return False, "gopher_too_many_hashes"
-            if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
-                return False, "gopher_too_many_ellipsis"
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1383,13 +1490,16 @@ def web_data():
         ),
         Details(
             Summary("TxT360 Implementation"),
-            D_code("""
-            SYMBOLS = ("#", "...", "…")
-            ...
-            symbol_pattern = re.compile("|".join(re.escape(symbol) for symbol in SYMBOLS))
-            ...
-            attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
-            """, block="block", language="python"),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
@@ -1401,11 +1511,14 @@ def web_data():
         H3("Fraction of Alphabetic Words"),
         Details(
             Summary("Implementations from Dolma"),
-            D_code("""
-            attrs.fraction_of_words_with_alpha_character = sum(
-            1 for word in words if any(c.isalpha() for c in word)
-        ) / max(word_count, 1)
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1415,27 +1528,30 @@ def web_data():
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
-            D_code("""
-            class RPS_Doc_Frac_No_Alph_Words(RPSBase):  # noqa
-                ALPH_REGEX = re.compile(r"[a-zA-Z]")
-                __slots__ = ()
-                def __call__(self, document: Document) -> SignalType:
-                    num_words = document.num_raw_words
-                    if num_words == 0:
-                        return [(0, len(document), None)]
-                    num_words_with_alpha = float(sum(
-                        int(self.ALPH_REGEX.search(word) is not None)
-                        for word in document.raw_words
-                    ))
-                    score = 1.0 - num_words_with_alpha / num_words
-                    score = round(score, PRECISION)
-                    return [(0, len(document), score)]
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1445,14 +1561,17 @@ def web_data():
         ),
         Details(
             Summary("Implementations from DataTrove"),
-            D_code("""
-            # that 80 % of words in a document contain at least one alphabetic character
-            if (
-                self.max_non_alpha_words_ratio
-                and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
-            ):
-                return False, "gopher_below_alpha_threshold"
-            """, block="block", language="python"),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
@@ -1480,10 +1599,13 @@ def web_data():
         H3("TxT360 Implementation"),
         Details(
             Summary("Sample documents that are filtered out by statistics-based heuristics"),
-            DV(
                 "data/sample_doc_stat.json",
                 0,
                 "Sample documents that are filtered out by statistics-based heuristics",
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
@@ -1500,7 +1622,10 @@ def web_data():
         Details(
             Summary("Sample documents containing 'lorem ipsum'"),
-            DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;

         Details(
             Summary("Non-English Documents"),
+            Div(
+                DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
         Details(
             Summary("English Documents Scoring Lower than 0.65"),
+            Div(
+                DV("data/sample_en_low.json", 3, "Sample documents that are classified as English but with score less than 0.65"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         Details(
             Summary("24 URL domains with more than 4k matches"),
+            Div (
+                DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
         """),
         Details(
             Summary("6 url domains that are removed from the blocklist"),
+            Div (
+                DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
         Details(
             Summary("Sample documents whose urls are blocked by the refined url blocklist"),
+            Div(
+                DV(
             "data/bad_url_doc.jsonl",
             3,
             "Sample documents whose urls are blocked by the refined url blocklist",
+            ), style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
         Details(
             Summary("curated url domains that are excluded from our dataset"),
+            Div (
+                DVS(
                 non_web_urls,
                 "curated url domains that are excluded from our dataset",
+            ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
         Details(
             Summary("Sample documents whose urls are in our curated url domain list"),
+            Div (
+                DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         Details(
             Summary("Sample documents with lines that are removed by the rule of terminal punctuation"),
+            Div (
+                DV(
+                "data/sample_terminal_punc.json",
+                0,
+                "Sample documents with lines that are removed by the rule of terminal punctuation",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;
         """),
         Details(
             Summary("Sample documents that are removed by original C4 javascript rule but are kept after our refinement"),
+            Div (
+                DV(
                 "data/sample_java.jsonl",
                 0,
                 "Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
         ),
         Details(
             Summary("Sample documents with lines that are removed by the RefinedWeb rules"),
+            Div (
+                DV(
                 "data/sample_refinedweb_line.json",
                 0,
                 "Sample documents with lines that are removed by the RefinedWeb rules",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
         """),
         Details(
             Summary("Sample documents with toxic lines"),
+            Div (
+                DVS(
                 json.load(open("data/toxic_lines.json")),
                 "Sample documents with toxic lines",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
         """),
         Details(
             Summary("Overview of all the quality signals that are used for filtering"),
+            Div (
+                DVS(
                 json.load(open("data/all_signals.json")),
                 "Overview of all the quality signals that are used for filtering",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         """),
         Details(
             Summary("Implementations from Dolma"),
+            Div(
+                D_code("""
+                words = text.split()
+                word_count = len(words)
+                character_count = sum(len(word) for word in words)
+                ...
+                lines = text.split("\n")
+                line_count = len(lines)
+                ...
+                line_counts = Counter(lines)
+                attrs.fraction_of_duplicate_lines = sum(count for line, count in line_counts.items() if count > 1) / max(
+                    line_count, 1
+                )
+                attrs.fraction_of_characters_in_duplicate_lines = sum(
+                    len(line) * count for line, count in line_counts.items() if count > 1
+                ) / max(character_count, 1)
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                def find_duplicates(x: list[str]) -> tuple[int, int]:
+                    unique_x = set()
+                    duplicate_chars = 0
+                    duplicate_elements = 0
+                    for element in x:
+                        if element in unique_x:
+                            duplicate_chars += len(element)
+                            duplicate_elements += 1
+                        else:
+                            unique_x.add(element)
+                    return duplicate_elements, duplicate_chars
+                ...
+                self.paragraph_exp = re.compile(r"\n{2,}")
+                self._line_splitter = re.compile("\n+")
+                ...
+                paragraphs = self.paragraph_exp.split(text.strip())
+                paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs)
+                if self.dup_para_frac and paragraphs_duplicates / len(paragraphs) > self.dup_para_frac:
+                    return False, "dup_para_frac"
+                if self.dup_para_char_frac and char_duplicates / len(text) > self.dup_para_char_frac:
+                    return False, "dup_para_char_frac"
+                lines = self._line_splitter.split(text)
+                line_duplicates, char_duplicates = find_duplicates(lines)
+                if self.dup_line_frac and line_duplicates / len(lines) > self.dup_line_frac:
+                    return False, "dup_line_frac"
+                if self.dup_line_char_frac and char_duplicates / len(text) > self.dup_line_char_frac:
+                    return False, "dup_line_char_frac"
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         H3("TxT360 Implementation"),
         Details(
             Summary("TxT360 Implementation"),
+            Div(
+                D_code("""
+                words = text.split()
+                word_count = len(words)
+                character_count = sum(len(word) for word in words)
+                ...
+                lines = text.split("\n")
+                line_count = len(lines)
+                line_counts = Counter(lines)
+                attrs.fraction_of_duplicate_lines = (
+                    sum((count - 1) for line, count in line_counts.items() if count > 1) / line_count
+                )
+                attrs.fraction_of_characters_in_duplicate_lines = (
+                    sum(sum(len(w) for w in line.split()) * (count - 1) for line, count in
+                    line_counts.items() if count > 1) / character_count
+                )
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         ),
         Details(
             Summary("Sample documents filtered by excessive line repetitions / characters in repeated lines"),
+            Div(
+                DV(
                 "data/repeat_line_frac.jsonl",
                 0,
                 "Sample documents filtered by excessive line repetitions / characters in repeated lines",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         """),
         Details(
             Summary("Implementations from Dolma"),
+            Div(
+                D_code("""
+                def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
+                    return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
+                ...
+                all_counts = all_ngram_counts(words)
+                count_most_common_ngrams = (2, 3, 4)
+                for n, ngram_counts in all_counts:
+                    if not ngram_counts:
+                        continue
+                    if n in count_most_common_ngrams:
+                        most_common_ngram, count = ngram_counts.most_common(1)[0]
+                        value = count * sum(len(w) for w in most_common_ngram) / max(character_count, 1)
+                        attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
                 class Base_RPS_Frac_Chars_In_Top_NGram(RPSBase):  # noqa
                     ## Base class for calculating the fraction of characters in the top N-gram. This operates on the lower-cased, punctuation removed content.
                     NGRAM_SIZE: int = None
                         score = sum(len(w) for w in ngram) * count / total_chars
                         score = round(score, PRECISION)
                         return [(0, len(document), score)]
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                def get_n_grams(words: list[str], n: int) -> list[str]:
+                    return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]
+                def find_top_duplicate(x: list[str]) -> int:
+                    counter = Counter()
+                    for element in x:
+                        counter[element] += 1
+                    top_n_gram = counter.most_common(1)[0]
+                    return len(top_n_gram[0]) * top_n_gram[1]
+                ...
+                for n, n_frac in self.top_n_grams:
+                    n_grams = get_n_grams(words, n)
+                    if not n_grams:
+                        continue
+                    top_char_length = find_top_duplicate(n_grams)
+                    if top_char_length / len(text) > n_frac:
+                        return False, f"top_n_gram"
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         """),
         Details(
             Summary("TxT360 Implementation"),
+            Div(
+                D_code("""
+                def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
+                    return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
+                ...
+                all_counts = all_ngram_counts_new(words)
+                count_most_common_ngrams = (2, 3, 4)
+                for n, ngram_counts in all_counts:
+                    if not ngram_counts:
+                        continue
+                    if n in count_most_common_ngrams:
+                        most_common_ngram, count = Counter(ngram_counts).most_common(1)[0]
+                        value = count * sum(len(w) for w in most_common_ngram) / character_count
+                        attrs.fraction_of_characters_in_most_common_ngram.append((n, value))
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)"),
+            Div(
+                DV(
                 "data/sample_top_ngram.json",
                 0,
                 "Sample documents filtered by the fraction of characters in the most common n-grams (n=2,3,4)",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         """),
         Details(
             Summary("Implementations from Dolma"),
+            Div(
+                D_code("""
+                def all_ngram_counts(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
+                    return [(n, Counter(list(zip(*[words[i:] for i in range(n)])))) for n in range(2, 11)]
+                ...
+                all_counts = all_ngram_counts(words)
+                for n, ngram_counts in all_counts:
+                    if not ngram_counts:
+                        continue
+                    if n in count_most_common_ngrams:
+                        ...
+                    else:
+                        ng_char_count = sum(count * sum(len(w) for w in ng) for ng, count in ngram_counts.items())
+                        value = sum(
+                            count * sum(len(w) for w in ng) for ng, count in ngram_counts.items() if count > 1
+                        ) / max(ng_char_count, 1)
+                        attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
+                class Base_RPS_Frac_Chars_In_Dupe_NGrams(RPSBase):  # noqa
+                    ## Base class for calculating the fraction of characters in duplicate word N-grams. This operates on the lower-cased, punctuation removed content. The function also ensures that characters in overlapping ngrams are only counted once.
+                    NGRAM_SIZE: int = None
+                    __slots__ = []
+                    def __call__(self, document: Document) -> SignalType:
+                        if self.NGRAM_SIZE is None:
+                            raise NotImplementedError(
+                                "NGRAM_SIZE must be set in the subclass"
+                            )
+                        if len(document.normalized_words) < self.NGRAM_SIZE:
+                            return [(0, len(document), 0.0)]
+                        # fetch the ngrams from the document if they exist, otherwise
+                        # compute them
+                        doc_n_grams = (
+                                getattr(document, f"norm_self.NGRAM_SIZEgrams", None)
+                                or
+                                tuple(form_ngrams(
+                                    iter(document.normalized_words), self.NGRAM_SIZE
+                                ))
                         )
+                        # keep only ngrams which occur at least twice
+                        ngram_dupes =
+                            ngram for ngram, count in Counter(doc_n_grams).items() if count > 1
+                        duplicated_grams = np.zeros(len(document.normalized_words), dtype=int)
+                        i = 0
+                        for ngram in doc_n_grams:
+                            if ngram in ngram_dupes:
+                                duplicated_grams[i: i + self.NGRAM_SIZE] = 1
+                            i += 1
+                        word_lengths = np.array(list(map(len, document.normalized_words)))
+                        chars_duped = np.sum(word_lengths * duplicated_grams)
+                        total_chars = np.sum(word_lengths)
+                        if total_chars == 0:
+                            return [(0, len(document), 0.0)]
+                        score = float(chars_duped / total_chars)
+                        score = round(score, PRECISION)
+                        return [(0, len(document), score)]
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                def find_all_duplicate(words: list[str], n: int) -> int:
+                    n_words = len(words)
+                    unique = set()
+                    repeated_chars, idx = 0, 0
+                    while idx < n_words - n + 1:
+                        n_gram = "".join(words[idx : idx + n])
+                        if n_gram in unique:
+                            repeated_chars += len(n_gram)
+                            idx += n
+                        else:
+                            unique.add(n_gram)
+                            idx += 1
+                    assert repeated_chars <= len("".join(words))
+                    return repeated_chars
+                ...
+                for n, n_frac in self.dup_n_grams:
+                    n_duplicates_char = find_all_duplicate(words, n)
+                    if n_duplicates_char / len(text) > n_frac:
+                        return False, f"duplicated_n_grams"
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         """),
         Details(
             Summary("TxT360 Implementation"),
+            Div(
+                D_code("""
+                def get_dup_ngram_frac(n, doc_n_grams, text):
+                    # fetch the ngrams from the document if they exist, otherwise compute them
+                    # doc_n_grams = list(zip(*[words[i:] for i in range(n)]))
+                    duplicated_grams = np.zeros(len(text.split()), dtype=int)
+                    unique_ngrams = set()
+                    for i, ngram in enumerate(doc_n_grams):
+                        if ngram in unique_ngrams:
+                            duplicated_grams[i: i + n] = 1
+                        else:
+                            unique_ngrams.add(ngram)
+                    word_lengths = np.array(list(map(len, text.split())))
+                    chars_duped = np.sum(word_lengths * duplicated_grams)
+                    total_chars = np.sum(word_lengths)
+                    return float(chars_duped / total_chars)
+                def all_ngram_counts_new(words) -> List[Tuple[int, CounterType[Tuple[str, ...]]]]:
+                    return [(n, list(zip(*[words[i:] for i in range(n)]))) for n in range(2, 11)]
+                ...
+                all_counts = all_ngram_counts_new(words)
+                count_most_common_ngrams = (2, 3, 4)
+                for n, ngram_counts in all_counts:
+                    if not ngram_counts:
+                        continue
+                    if n in count_most_common_ngrams:
+                        ...
                     else:
+                        score = get_dup_ngram_frac(n, ngram_counts, text)
+                        attrs.fraction_of_characters_in_duplicate_ngrams.append((n, score))
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         ),
         Details(
             Summary("Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)"),
+            Div(
+                DV(
                 "data/sample_dup_ngram.json",
                 0,
                 "Sample documents filtered by the fraction of characters in duplicated n-grams (n=5,...,10)",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         """),
         Details(
             Summary("Ellipsis Symbol Identification Implementations"),
+            Div(
+                P("Dolma: "),
+                D_code("""
+                ELLIPSIS_SYMBOLS = ("…")
+                """, block="block", language="python"),
+                P("RedPajamaV2: "),
+                D_code("""
+                ELLIPSIS_SYMBOLS = ("...", "…")
+                """, block="block", language="python"),
+                P("DataTrove: "),
+                D_code("""
+                ELLIPSIS_SYMBOLS = ("...", "…")
+                """, block="block", language="python"),
+                P("TxT360: "),
+                D_code("""
+                ELLIPSIS_SYMBOLS = ("...", "…", "[...]", "[…]")
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Bullet Point Identification Implementations"),
+            Div(
+                P("Dolma: "),
+                D_code("""
+                BULLET_POINTS = ("*", "-")
+                """, block="block", language="python"),
+                P("RedPajamaV2: "),
+                D_code("""
+                BULLET_POINT_SYMBOLS = (
+                    "•",  # bullet point
+                    "‣",  # triangular bullet point
+                    "▶",  # black right pointing triangle
+                    "◀",  # black left pointing triangle
+                    "◦",  # white bullet point
+                    "■",  # black square
+                    "□",  # white square
+                    "▪",  # black small square
+                    "▫",  # white small square
+                    "–",  # en dash
+                )
+                """, block="block", language="python"),
+                P("DataTrove: "),
+                D_code("""
+                BULLET_POINT_SYMBOLS = ("•" , "-")
+                """, block="block", language="python"),
+                P("TxT360: "),
+                D_code("""
+                BULLET_POINT_SYMBOLS = (
+                    "•",  # • bullet point
+                    "‣",  # ‣ triangular bullet point
+                    "▶",  # ▶ black right pointing triangle
+                    "◀",  # ◀ black left pointing triangle
+                    "◦",  # ◦ white bullet point
+                    "■",  # ■ black square
+                    "□",  # □ white square
+                    "▪",  # ▪ black small square
+                    "▫",  # ▫ white small square
+                    "-",  # - hyphen
+                    "–",  # – en dash
+                    "—",  # — zh dash
+                    "*",  # * star
+                )
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         Details(
             Summary("Sample documents that are filtered out by line-wise heuristics"),
+            Div(
+                DV(
                 "data/line_info.json",
                 0,
                 "Sample documents that are filtered out by line-wise heuristics",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
+                # the normalized content: lowercased and punctuation removed
+                self._normalized_content = normalize(content)
+                self._normalized_words = tuple(self._normalized_content.split())
+                self._num_normalized_words = len(self._normalized_words)
+                ...
+                def normalize(
+                       text: str,
+                       remove_punct: bool = True,
+                       lowercase: bool = True,
+                       nfd_unicode: bool = True,
+                       white_space: bool = True
+                ) -> str:
+                   #Normalize the text by lowercasing and removing punctuation.
+                   # remove punctuation
+                   if remove_punct:
+                       text = text.translate(TRANSLATION_TABLE_PUNCTUATION)
+                   # lowercase
+                   if lowercase:
+                       text = text.lower()
+                   if white_space:
+                       text = text.strip()
+                       text = re.sub(r"\s+", " ", text)
+                   # NFD unicode normalization
+                   if nfd_unicode:
+                       text = unicodedata.normalize("NFD", text)
+                   return text
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                words = self.tokenizer.word_tokenize(text)
+                n_words = len(words)
+                non_symbol_words = [w for w in words if any(ch not in PUNCTUATION_SET for ch in w)]
+                n_non_symbol_words_words = len(non_symbol_words)
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         """),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
+                class RPS_Doc_Num_Sentences(RPSBase):  # noqa
+                 ##The number of sentences in the content. This is calculated using the regex r'[^.!?]+[.!?]*'
+                SENT_PATTERN = re.compile(r'[^.!?]+[.!?]*', flags=re.UNICODE)
+                __slots__ = ()
+                def __call__(self, document: Document) -> SignalType:
+                    ##count the number of sentences in the content using regex
+                    score = float(len(self.SENT_PATTERN.findall(document.raw_content)))
+                    return [(0, len(document), score)]
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         """),
         Details(
             Summary("TxT360 Implementation"),
+            Div(
+                D_code("""
+                from nltk.tokenize import sent_tokenize
+                ...
+                def count_sentences(text):
+                    sentences = sent_tokenize(text)
+                    return len(sentences)
+                ...
+                attrs.num_of_sentences = count_sentences(text)
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         """),
         Details(
             Summary("Implementations from Dolma"),
+            Div(
+                D_code("""
+                SYMBOLS = ("#", "…")
+                ...
+                attrs.symbol_to_word_ratio = sum(1 for word in words if any(s in word for s in SYMBOLS)) / max(
+                            word_count, 1
+                        )
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
              style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
+                class RPS_Doc_Symbol_To_Word_Ratio(RPSBase):  # noqa
+                ##The ratio of symbols to words in the content. This is analogous to
+                ##the signal used in Gopher. Symbols are defined "#", "...", and "…".
+                    SYMBOLS = ("#", "...", "…")
+                    __slots__ = ()
+                    def __call__(self, document: Document) -> SignalType:
+                        num_words = document.num_raw_words
+                        if num_words == 0:
+                            return [(0, len(document), None)]
+                        # count the number of symbols in the content
+                        num_symbols = float(sum(
+                            document.raw_content.count(x) for x in self.SYMBOLS
+                        ))
+                        score = num_symbols / num_words
+                        score = round(score, PRECISION)
+                        return [(0, len(document), score)]
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                if self.max_symbol_word_ratio and text.count("#") / n_words > self.max_symbol_word_ratio:
+                    return False, "gopher_too_many_hashes"
+                if self.max_symbol_word_ratio and (text.count("...") + text.count("…")) / n_words > self.max_symbol_word_ratio:
+                    return False, "gopher_too_many_ellipsis"
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("TxT360 Implementation"),
+            Div(
+                D_code("""
+                SYMBOLS = ("#", "...", "…")
+                ...
+                symbol_pattern = re.compile("|".join(re.escape(symbol) for symbol in SYMBOLS))
+                ...
+                attrs.symbol_to_word_ratio = sum(1 for word in words if symbol_pattern.search(word)) / word_count
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #EAFFF1; /* Light green background */
             padding: 15px;
         H3("Fraction of Alphabetic Words"),
         Details(
             Summary("Implementations from Dolma"),
+            Div(
+                D_code("""
+                attrs.fraction_of_words_with_alpha_character = sum(
+                1 for word in words if any(c.isalpha() for c in word)
+            ) / max(word_count, 1)
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from RedPajama-V2"),
+            Div(
+                D_code("""
+                class RPS_Doc_Frac_No_Alph_Words(RPSBase):  # noqa
+                    ALPH_REGEX = re.compile(r"[a-zA-Z]")
+                    __slots__ = ()
+                    def __call__(self, document: Document) -> SignalType:
+                        num_words = document.num_raw_words
+                        if num_words == 0:
+                            return [(0, len(document), None)]
+                        num_words_with_alpha = float(sum(
+                            int(self.ALPH_REGEX.search(word) is not None)
+                            for word in document.raw_words
+                        ))
+                        score = 1.0 - num_words_with_alpha / num_words
+                        score = round(score, PRECISION)
+                        return [(0, len(document), score)]
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         ),
         Details(
             Summary("Implementations from DataTrove"),
+            Div(
+                D_code("""
+                # that 80 % of words in a document contain at least one alphabetic character
+                if (
+                    self.max_non_alpha_words_ratio
+                    and sum([any((c.isalpha() for c in w)) for w in words]) / n_words < self.max_non_alpha_words_ratio
+                ):
+                    return False, "gopher_below_alpha_threshold"
+                """, block="block", language="python"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FFFAEA; /* Light yellow background */
             padding: 15px;
         H3("TxT360 Implementation"),
         Details(
             Summary("Sample documents that are filtered out by statistics-based heuristics"),
+            Div(
+                DV(
                 "data/sample_doc_stat.json",
                 0,
                 "Sample documents that are filtered out by statistics-based heuristics",
+                ),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
             ),
             style="""
             background-color: #EAFFF1; /* Light green background */
         Details(
             Summary("Sample documents containing 'lorem ipsum'"),
+            Div(
+                DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
+                style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; "  # Styling for the DV2 part
+            ),
             style="""
             background-color: #FAEAEA; /* Light pink background */
             padding: 15px;