# NOTE: extraction residue from the hosting page header ("Spaces: Sleeping"); not part of the program.
from fasthtml.common import * | |
from fasthtml.components import * | |
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline | |
from plotly import graph_objects as go | |
from fh_plotly import plotly2fasthtml | |
import pandas as pd | |
import json | |
from rich import print | |
import curated | |
import web | |
import common | |
import results | |
# Create the FastHTML application object and its route decorator.
# The extra headers load the Distill article template, htmx, Plotly, the
# local stylesheet, and the Markdown / syntax-highlighting helpers.
app, rt = fast_app(
    # NOTE(review): debug mode is enabled unconditionally; confirm this is
    # intended outside local development.
    debug=True,
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        # NOTE(review): unpinned "@next" build; fast_app may already inject
        # its own htmx script by default -- confirm both are wanted.
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        # NOTE(review): the "plotly-latest" CDN alias is reportedly frozen at
        # an old 1.x release -- consider pinning an explicit version.
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)
def main():
    """Build the landing page: front matter, title banner, and article body.

    The article holds a table of contents followed by the element returned
    by ``intro()``. Every table-of-contents entry carries htmx attributes
    that issue a GET request and place the response in ``#inner-text``,
    which is the id of the element ``intro()`` renders.

    Returns:
        The ``Div`` wrapping the whole page.
    """
    content_pane = "#inner-text"

    # (label, url) pairs for the sub-entries listed under "TxT360". The same
    # url is used for both the plain href and the htmx request.
    intro_sections = (
        ("Introduction", "/intro#section1"),
        ("Background", "/intro#section2"),
        ("Main Content", "/intro#section3"),
        ("Conclusion", "/intro#section4"),
    )
    # (label, route) pairs for the remaining top-level entries.
    other_pages = (
        ("Web Data", "/webdata"),
        ("Curated Sources", "/curated"),
        ("Common Steps", "/common"),
        ("TxT360 Results", "/results"),
    )

    section_items = [
        Li(A(label, href=url, hx_get=url, hx_target=content_pane))
        for label, url in intro_sections
    ]
    # Here the htmx attributes sit on the wrapping Div, not on the anchor;
    # the anchor itself only links to the content pane.
    page_entries = [
        Div(
            A(label, href="#inner-text"),
            hx_get=route,
            hx_target=content_pane,
        )
        for label, route in other_pages
    ]

    table_of_contents = Nav(
        H3("Table of Contents"),
        Div(
            A("TxT360", href="#_self"),
            hx_get="/intro",
            hx_target=content_pane,
        ),
        Div(Ul(*section_items)),
        *page_entries,
        role="navigation",
        cls="l-text figcaption",
    )

    title_block = D_title(
        H1(
            "TxT360: the most comprehensive, highest quality, and production ready pretraining dataset",
            cls="l-body",
            style="text-align: center;",
        ),
        Div(
            Img(src="images/llm360_logo.png"),
            id="title-plot",
            cls="main-plot-container l-page",
        ),
    )

    return Div(
        D_front_matter(),
        title_block,
        D_article(
            D_contents(table_of_contents),
            intro(),
        ),
    )
def intro():
    """Build the introductory text shown in the main content pane.

    Returns:
        A ``Div`` with id ``inner-text`` containing four sections with ids
        ``section1`` .. ``section4`` (Introduction, Background, Main Content,
        Conclusion). These ids are the fragments used by the table of
        contents links in ``main()``.
    """
    # NOTE(review): the numbered points below live inside a single <p>, so
    # the line breaks are not preserved by the browser -- confirm whether a
    # real list element is wanted.
    introduction_text = """Pretraining performant large language models (LLMs) requires trillions of tokens of high quality data. Many prior work, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.
        We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:
        1. Curates commonly used pretraining datasets, including all CommonCrawl
        2. Employs carefully selected filters designed for each data source
        3. Provides only unique data elements via globally deduplicated across all datasets
        4. Retains all deduplication metadata for custom upweighting
        5. Is Production ready! Download here [link to HF repo]
        """

    background_text = """ The quality and size of a pre-training dataset
        play a crucial role in the performance of large
        language models (LLMs). The community has
        introduced a variety of datasets for this purpose,
        including purely web-based datasets like RefinedWeb
        [1], RedPajama-Data-V2 [2], DCLM [3], and
        FineWeb [4], as well as comprehensive datasets
        derived from multiple highly-curated data sources
        such as The Pile [5], RedPajama-Data-V1 [6], and
        Dolma [7] . It is commonly known that web-based
        datasets provide a vast quantity of data, while
        highly-curated multi-source datasets consistently
        deliver high quality and diversity, both critical
        for effective LLM pre-training. However, despite
        the advancements in both types of data, each type
        of dataset has its limitations. For instance, the
        processing scripts for the web dataset, RefinedWeb,
        known for its high quality, are not public, and
        only about 10% of the entire dataset has been
        disclosed. Conversely, the web component of
        existing highly-curated multi-source datasets is
        relatively small compared to purely web-based
        datasets, limiting their coverage and diversity
        compared to the scale of information from the
        internet. By integrating the extensive reach of
        web data with the exceptional quality of curated
        sources, TxT360 is crafted to meet and surpass the
        rigorous standards required for state-of-the-art
        LLM pre-training. """

    # NOTE(review): this text describes FineWeb / FineWeb-Edu rather than
    # TxT360 and looks like placeholder copy -- confirm before publishing.
    main_content_text = """The performance of a large language model (LLM)
        depends heavily on the quality and size of its
        pretraining dataset. However, the pretraining
        datasets for state-of-the-art open LLMs like Llama
        3 and Mixtral are not publicly available and very
        little is known about how they were created.
        Reading time: 45 min. For the best reading
        experience, we recommend not using a mobile phone.
        Recently, we released 🍷 FineWeb, a new,
        large-scale (15-trillion tokens, 44TB disk space)
        dataset for LLM pretraining. FineWeb is derived
        from 96 CommonCrawl snapshots and produces
        better-performing LLMs than other open pretraining
        datasets. To bring more clarity in machine learning
        and advance the open understanding of how to train
        good quality large language models, we carefully
        documented and ablated all of the design choices
        used in FineWeb, including in-depth investigations
        of deduplication and filtering strategies. The
        present long form report is a deep dive in how to
        create a large and high-quality web-scale dataset
        for LLM pretraining. The dataset itself, 🍷
        FineWeb, is available here. We are extremely
        thankful to the whole distill.pub team (Christopher
        Olah, Shan Carter, Ludwig Schubert in particular)
        for creating the template on which we based this
        blog post. Thanks also for inspiring us with
        exquisitely crafted articles and blog posts. In
        this report we also introduce 📚 FineWeb-Edu, a
        subset of FineWeb constructed using scalable
        automated high-quality annotations for educational
        value, and which outperforms all openly accessible
        web-datasets on a number of educational benchmarks
        such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
        is available in two sizes/filtering-level: 1.3
        trillion (very high educational content) and 5.4
        trillion (high educational content) tokens (all
        tokens are measured with GPT2 tokenizer). You can
        download it here. Both datasets are released under
        the permissive ODC-By 1.0 license TLDR: This blog
        covers a discussion on processing and evaluating
        data quality at scale, the 🍷 FineWeb recipe
        (listing and explaining all of our design choices),
        and the process followed to create its 📚
        FineWeb-Edu subset."""

    conclusion_text = """This is the conclusion section where we
        summarize the key points discussed in the blog post
        and provide final thoughts."""

    return Div(
        Section(
            H2("Introduction"),
            P(introduction_text),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(background_text),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            P(main_content_text),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P(conclusion_text),
            id="section4",
        ),
        id="inner-text",
    )
# Register the page handlers. The table of contents built in main() requests
# "/intro", "/webdata", "/curated", "/common" and "/results", so each of those
# paths needs a handler; "/" serves the full page itself.
rt("/")(main)
rt("/intro")(intro)
rt("/curated")(curated.curated)
rt("/webdata")(web.web_data)
rt("/common")(common.common_steps)
rt("/results")(results.results)

# Start the development server.
serve()