fh-new-vm1

Sleeping

File size: 5,810 Bytes

from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from rich import print
import curated
import web
import common
import results

# Table 1 data: one row per dataset, one column per data source.
# Rows follow the order of the "Dataset" column; every other list is aligned
# with it position by position (TxT360 first, The Pile last).
#
# Cell values are rendered verbatim into the published HTML table, so each
# cell must hold a final, reader-facing value: either a short description
# ("99 Snapshots", "English Only"), the marker "Included", or "-" which
# appears to denote a source the dataset does not cover.
dataset_comparison = pd.DataFrame(
    {
        "Dataset": [
            "TxT360",
            "FineWeb",
            "RefinedWeb",
            "RedPajama-v2",
            "C4",
            "Dolma",
            "RedPajama-v1",
            "The Pile",
        ],
        "CommonCrawl": [
            "99 Snapshots",
            "96 Snapshots",
            "90 Snapshots",
            "84 Snapshots",
            "1 Snapshots",
            "24 Snapshots",
            "5 Snapshots",
            "0.6% of 74 Snapshots",
        ],
        "Papers": [
            "5 Sources",
            "-",
            "-",
            "-",
            "-",
            "1 Source",
            "1 Source",
            "4 Sources",
        ],
        "Wikipedia": [
            "310+ Languages",
            "-",
            "-",
            "-",
            "-",
            # Dolma and RedPajama-v1 use the same presence marker as the
            # other columns instead of an unresolved review question.
            "Included",
            "Included",
            "English Only",
        ],
        "FreeLaw": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "DM Math": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "USPTO": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "PG-19": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
        "HackerNews": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "Ubuntu IRC": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "EuroParl": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
        ],
        "StackExchange": [
            "Included",
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
        ],
        "Code": [
            # TxT360 keeps the plain "-" marker; the trailing review note
            # that used to follow it is not reader-facing content.
            "-",
            "-",
            "-",
            "-",
            "-",
            "Included",
            "Included",
            "Included",
        ],
    }
)

# Render Table 1 (dataset comparison) as borderless HTML without the
# DataFrame's row index, then wrap it in a Div with a 40px margin.
table_html = dataset_comparison.to_html(border=0, index=False)
table_div = Div(
    # The markup comes from pandas; NotStr is used so it is embedded as HTML
    # rather than as escaped text (per FastHTML's NotStr — confirm in docs).
    NotStr(table_html),
    style="margin: 40px;",
)

# Table 2 data: one row per TxT360 data source.
# Written row-wise so that each source's size, token count and cut-off date
# sit next to each other; "-" is used where no cut-off date is listed.
dataset_sources = pd.DataFrame(
    [
        # "2024-30" is presumably a CommonCrawl snapshot id — TODO confirm.
        ("CommonCrawl", "11 TB", "5.71T", "2024-30"),
        ("Papers", "712 GB", "154.96B", "Q4 2023"),
        ("Wikipedia", "210 GB", "4.75B", "-"),
        ("Freelaw", "23 GB", "7.34B", "Q1 2024"),
        ("DM Math", "22 GB", "5.23B", "-"),
        ("USPTO", "45 GB", "4.95B", "Q4 2023"),
        ("PG-19", "11 GB", "2.94B", "-"),
        ("HackerNews", "4.1 GB", "1.08B", "Q4 2023"),
        ("Ubuntu IRC", "4.7 GB", "1.54B", "Q4 2023"),
        ("Europarl", "6.1 GB", "1.96B", "-"),
        ("StackExchange", "45 GB", "8.37B", "Q4 2023"),
    ],
    columns=["Data Source", "Raw Data Size", "Token Count", "Cut-Off Date"],
)

# Render Table 2 (per-source statistics) the same way as Table 1:
# borderless HTML without the row index, wrapped in a Div with a 40px margin.
# Note that this rebinds the module-level name `table_html`.
table_html = dataset_sources.to_html(border=0, index=False)
table_div1 = Div(
    # NotStr is used so the pandas markup is embedded as HTML rather than as
    # escaped text (per FastHTML's NotStr — confirm in docs).
    NotStr(table_html),
    style="margin: 40px;",
)

def overview():
    """Build the overview section of the page.

    The section contains, in order: a heading, a sub-heading, the caption
    for Table 1, the module-level ``table_div``, the caption for Table 2 and
    the module-level ``table_div1``.

    Returns:
        A ``Div`` wrapping a single ``Section`` whose id is ``"inner-text"``.
    """
    heading = H2("Combining the Best of Web and Curated Sources")
    question = H3("Why combine the web and highly curated sources? Isn't the web-only data enough?")
    comparison_caption = P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources.")
    statistics_caption = P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented.")

    # Each caption is placed directly before the table it describes.
    section = Section(
        heading,
        question,
        comparison_caption,
        table_div,
        statistics_caption,
        table_div1,
        id="inner-text",
    )
    return Div(section)