# NOTE: page-capture residue ("Spaces: Sleeping / Sleeping"); not part of the program.
import json

from fasthtml.common import *
from fasthtml.components import *
from fasthtml.components import D_title, D_article, D_front_matter, D_contents, D_byline
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
from rich import print

import curated
import web
import common
import results
# Create the FastHTML application object and its route helper.
#
# `hdrs` lists the elements passed to FastHTML as extra page headers: charset
# and viewport metadata, the distill.pub article template script, htmx,
# Plotly, the local stylesheet, and the Markdown / syntax-highlighting helpers.
#
# NOTE(review): `htmx.org@next` and `plotly-latest` are floating CDN tags, so
# the scripts actually served can change without a code change here; consider
# pinning explicit versions.
# NOTE(review): `debug=True` is hard-coded; confirm this is intended for the
# deployed app.
app, rt = fast_app(
    debug=True,
    pico=False,
    hdrs=(
        Meta(charset="UTF-8"),
        Meta(name="viewport", content="width=device-width, initial-scale=1.0"),
        Script(src="https://distill.pub/template.v2.js"),
        Script(src="https://unpkg.com/htmx.org@next/dist/htmx.min.js"),
        Script(src="https://cdn.plot.ly/plotly-latest.min.js"),
        Link(rel="stylesheet", href="style.css"),
        MarkdownJS(),
        HighlightJS(langs=["python", "javascript", "html", "css"]),
    ),
)
@app.get("/")
def main():
    """Build the landing page.

    The page is a single ``Div`` holding, in order: the distill front matter,
    the title block (headline plus logo image), and the article. The article
    contains the table of contents followed by the content returned by
    ``intro()``, which carries the ``inner-text`` id that every table-of-
    contents entry uses as its ``hx_target``.

    Returns:
        The root ``Div`` component for the page.
    """
    return Div(
        D_front_matter(),
        D_title(
            H1(
                "TxT360: fully open and transparent fusion of web and curated corpora for pre-training large language models",
                cls="l-body",
                style="text-align: center;",
            ),
            Div(
                Img(src="images/llm360_logo.png"),
                id="title-plot",
                cls="main-plot-container l-page",
            ),
        ),
        D_article(
            D_contents(
                Nav(
                    H3("Table of Contents"),
                    # Top-level entry: the htmx attributes sit on the wrapping
                    # Div, not on the anchor itself.
                    Div(
                        A("TxT360", href="#_self"),
                        hx_get="/intro",
                        hx_target="#inner-text",
                    ),
                    # Sub-entries for the sections of the intro content; each
                    # anchor requests /intro and names the section id as the
                    # URL fragment.
                    Div(
                        Ul(
                            Li(
                                A(
                                    "Introduction",
                                    href="/intro#section1",
                                    hx_get="/intro#section1",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Background",
                                    href="/intro#section2",
                                    hx_get="/intro#section2",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Main Content",
                                    href="/intro#section3",
                                    hx_get="/intro#section3",
                                    hx_target="#inner-text",
                                )
                            ),
                            Li(
                                A(
                                    "Conclusion",
                                    href="/intro#section4",
                                    hx_get="/intro#section4",
                                    hx_target="#inner-text",
                                )
                            ),
                        ),
                    ),
                    # Remaining entries request the other pages of the site
                    # and target the same #inner-text element.
                    Div(
                        A("Web Data", href="#inner-text"),
                        hx_get="/webdata",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("Curated Sources", href="#inner-text"),
                        hx_get="/curated",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("Common Steps", href="#inner-text"),
                        hx_get="/common",
                        hx_target="#inner-text",
                    ),
                    Div(
                        A("TxT360 Results", href="#inner-text"),
                        hx_get="/results",
                        hx_target="#inner-text",
                    ),
                    role="navigation",
                    cls="l-text figcaption",
                ),
            ),
            # Initial article content; this is the element with id="inner-text".
            intro(),
        ),
    )
@app.get("/intro")
def intro():
    """Build the introductory article content.

    Returns a ``Div`` with id ``inner-text`` containing four ``Section``
    elements with ids ``section1`` .. ``section4`` (Introduction, Background,
    Main Content, Conclusion). Each section is a heading followed by a single
    paragraph of text.

    Returns:
        The ``Div`` component holding the four sections.
    """
    # NOTE(review): the "Main Content" and "Conclusion" paragraphs read like
    # placeholder copy (the former describes FineWeb, not TxT360) -- confirm
    # with the content owners before publishing.
    return Div(
        Section(
            H2("Introduction"),
            P("""We are excited to introduce TxT360, a
                large-scale, comprehensive, and fully transparent
                dataset designed for Large Language Model (LLM)
                pre-training. TxT360 is engineered to strike a
                balance between the quantity and quality of
                pre-training data, pushing the limit on both
                fronts. This comprehensive dataset encompasses both
                expansive web-based data and highly curated data
                sources, making it one of the most robust LLM
                pre-training corpora available today. Our web data
                component includes 99 snapshots from Common Crawl,
                amassing 5.7 trillion tokens and occupying 11 TB of
                disk space in jsonl.gz format. On the curated side,
                TxT360 integrates one of the most extensive
                collections of high-quality sources across multiple
                domains, ensuring diverse and rich content referred
                to as curated sources, 14 sources across 10
                domains. To maintain the highest quality, we
                meticulously pre-processed the web data to filter
                out low-quality content and conducted thorough
                reviews of the curated sources. This process not
                only unified their formats but also identified and
                rectified any anomalies. Not only do we 100%
                open-source our processing scripts, but we also
                release the details of our data reviews, revealing
                the decision-making processes behind data selection
                and quality assurance. This level of transparency
                allows researchers and practitioners to fully
                understand the dataset’s composition and make
                informed decisions when using TxT360 for training.
                Additionally, TxT360 includes detailed
                documentation and analysis of the data, covering
                distribution statistics, domain coverage, and
                processing pipeline, which helps users navigate and
                utilize the dataset effectively. Overall, TxT360
                represents a significant step forward in the
                availability and transparency of large-scale
                training data for language models, setting a new
                standard for dataset quality and openness."""),
            id="section1",
        ),
        Section(
            H2("Background"),
            P(
                """ The quality and size of a pre-training dataset
                play a crucial role in the performance of large
                language models (LLMs). The community has
                introduced a variety of datasets for this purpose,
                including purely web-based datasets like RefinedWeb
                [1], RedPajama-Data-V2 [2], DCLM [3], and
                FineWeb [4], as well as comprehensive datasets
                derived from multiple highly-curated data sources
                such as The Pile [5], RedPajama-Data-V1 [6], and
                Dolma [7] . It is commonly known that web-based
                datasets provide a vast quantity of data, while
                highly-curated multi-source datasets consistently
                deliver high quality and diversity, both critical
                for effective LLM pre-training. However, despite
                the advancements in both types of data, each type
                of dataset has its limitations. For instance, the
                processing scripts for the web dataset, RefinedWeb,
                known for its high quality, are not public, and
                only about 10% of the entire dataset has been
                disclosed. Conversely, the web component of
                existing highly-curated multi-source datasets is
                relatively small compared to purely web-based
                datasets, limiting their coverage and diversity
                compared to the scale of information from the
                internet. By integrating the extensive reach of
                web data with the exceptional quality of curated
                sources, TxT360 is crafted to meet and surpass the
                rigorous standards required for state-of-the-art
                LLM pre-training. """
            ),
            id="section2",
        ),
        Section(
            H2("Main Content"),
            P("""The performance of a large language model (LLM)
                depends heavily on the quality and size of its
                pretraining dataset. However, the pretraining
                datasets for state-of-the-art open LLMs like Llama
                3 and Mixtral are not publicly available and very
                little is known about how they were created.
                Reading time: 45 min. For the best reading
                experience, we recommend not using a mobile phone.
                Recently, we released 🍷 FineWeb, a new,
                large-scale (15-trillion tokens, 44TB disk space)
                dataset for LLM pretraining. FineWeb is derived
                from 96 CommonCrawl snapshots and produces
                better-performing LLMs than other open pretraining
                datasets. To bring more clarity in machine learning
                and advance the open understanding of how to train
                good quality large language models, we carefully
                documented and ablated all of the design choices
                used in FineWeb, including in-depth investigations
                of deduplication and filtering strategies. The
                present long form report is a deep dive in how to
                create a large and high-quality web-scale dataset
                for LLM pretraining. The dataset itself, 🍷
                FineWeb, is available here. We are extremely
                thankful to the whole distill.pub team (Christopher
                Olah, Shan Carter, Ludwig Schubert in particular)
                for creating the template on which we based this
                blog post. Thanks also for inspiring us with
                exquisitely crafted articles and blog posts. In
                this report we also introduce 📚 FineWeb-Edu, a
                subset of FineWeb constructed using scalable
                automated high-quality annotations for educational
                value, and which outperforms all openly accessible
                web-datasets on a number of educational benchmarks
                such as MMLU, ARC, and OpenBookQA. 📚 FineWeb-Edu
                is available in two sizes/filtering-level: 1.3
                trillion (very high educational content) and 5.4
                trillion (high educational content) tokens (all
                tokens are measured with GPT2 tokenizer). You can
                download it here. Both datasets are released under
                the permissive ODC-By 1.0 license TLDR: This blog
                covers a discussion on processing and evaluating
                data quality at scale, the 🍷 FineWeb recipe
                (listing and explaining all of our design choices),
                and the process followed to create its 📚
                FineWeb-Edu subset."""),
            id="section3",
        ),
        Section(
            H2("Conclusion"),
            P("""This is the conclusion section where we
                summarize the key points discussed in the blog post
                and provide final thoughts."""),
            id="section4",
        ),
        id="inner-text",
    )
# Register the page handlers implemented in the sibling modules. `rt(path)`
# is applied as a plain function call here rather than as a decorator because
# the handlers are defined in other modules.
rt("/curated")(curated.curated)
# `{target}` is a path parameter handed to the handler.
rt("/curated/{target}")(curated.update)
rt("/webdata")(web.web_data)
rt("/webdata/{target}")(web.update)
rt("/common")(common.common_steps)
rt("/results")(results.results)

# Hand control to FastHTML's server launcher once all routes are registered.
serve()