victormiller committed · Commit 7533364
1 parent: e4314a9

Update results.py

results.py CHANGED (+61 -1)
@@ -614,6 +614,62 @@ fig.update_layout(
 # Show the figure
 llama_graph6 = fig
 
+##txt360 vs fineweb comparison table
+dataset_comparison = pd.DataFrame(
+    {
+        "Metric": [
+            "BoolQ",
+            "PIQA",
+            "HellaSwag",
+            "Winogrande",
+            "MMLU",
+            "Natural Questions",
+            "TriviaQA",
+            "GSM8K",
+            "MATH",
+            "MedQA",
+        ],
+        "TxT360 - Upsampling": [
+            "70.31",
+            "80.36",
+            "73.54",
+            "68.43",
+            "30.26",
+            "22.22",
+            "58.52",
+            "3.41",
+            "28.04",
+            "25.61",
+        ],
+        "FineWeb-1.5T": [
+            "71.5",
+            "82.1",
+            "79.46",
+            "70.96",
+            "28.5",
+            "19.03",
+            "36.65",
+            "5.31",
+            "29.65",
+            "27.26",
+        ],
+        "TxT360 Difference": [
+            "(1.19)",
+            "(1.74)",
+            "(5.92)",
+            "(2.53)",
+            "1.76",
+            "3.19",
+            "21.87",
+            "(1.9)",
+            "(1.61)",
+            "(1.65)",
+        ],
+    }
+)
+
+table_html = dataset_comparison.to_html(index=False, border=0)
+table_div_1 = Div(NotStr(table_html), style="margin: 40px;")
 
 intro_div = Div(
     H2("Perplexity Evaluation on Duplicate Data"),
@@ -621,13 +677,16 @@ intro_div = Div(
     P("We took one of the model-based data quality evaluation strategies adopted by [DataComp-LM](https://arxiv.org/abs/2406.11794), which used perplexity filtering as a candidate for quality filtering. DataComp-LM followed [CCNet’s](https://arxiv.org/abs/1911.00359) practice of using a 5-gram Kneser-Ney model, as implemented in the [KenLM](https://github.com/kpu/kenlm) library, for efficient perplexity calculation. Following this practice, we estimated data quality by taking a KenLM model (from [edugp/kenlm](https://huggingface.co/edugp/kenlm)) trained on English Wikipedia data and computing perplexity on data with different duplication patterns. Lower perplexity is regarded as a signal of higher quality."),
     H3("Sampling Strategy"),
     P("We started from a processed Common Crawl (CC) ablation dataset divided by the number of duplicates of each document. For each CC dump, we have different buckets, each holding chunks of documents with different duplicate-count ranges (1-1, 2-5, 6-10, 11-100, 101-1000, 1001-30000000). We sampled the first 10k documents from each chunk, along with their metadata."),
-
 )
 
 upsampling_exp = Div(
     H2("Upsampling Experiment: TxT360 vs FineWeb"),
     H3("Experiment Setup"),
     P("We performed a comparison of 1.5T tokens from FineWeb and 1.5T tokens of TxT360 across 10 diverse evaluations. Our FineWeb evaluation is based on a random sample of 1.5T tokens from FineWeb (base). For TxT360, we also randomly sample 1.5T tokens, upsampling data instances with more duplicates. Concretely, the upsampling weight is set to 3 for data points with duplicates in the range from 2 to 5, 5 for the range from 5 to 100, 8 for the range from 101 to 1000, and 10 for more than 1000 duplicates."),
+    table_div_1,
+    P("To account for differing dataset sizes, the evaluation scores represent the final evaluation score after the entire dataset has been processed."),
+    H3("Training Evaluations"),
+    P("We also conducted full-scale training runs on TxT360 and FineWeb-1.5T. Below are plots of the training and validation loss curves for each dataset. TxT360 achieves lower training and validation loss than FineWeb-1.5T."),
 )
 
 perp1_div = Div(
@@ -707,6 +766,7 @@ def results():
     return Div(
         Section(
             intro_div,
+            upsampling_exp,
             perp1_div,
             llama_div,
             P("test plotly"),
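The new table code in the first hunk renders a pandas DataFrame inside a FastHTML `Div`. The key detail is `NotStr`, which marks the pre-rendered `to_html` output as raw HTML so it is inserted verbatim rather than escaped. A minimal standalone sketch of the same pattern, assuming the components come from `fasthtml.common` (the DataFrame contents here are placeholders):

```python
import pandas as pd
from fasthtml.common import Div, NotStr

# Placeholder data; the diff builds the full TxT360-vs-FineWeb table this way.
df = pd.DataFrame({"Metric": ["BoolQ"], "TxT360 - Upsampling": ["70.31"]})

# to_html() produces a plain <table>; NotStr stops FastHTML from escaping it.
table_div = Div(NotStr(df.to_html(index=False, border=0)), style="margin: 40px;")
```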
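The perplexity paragraph in the second hunk describes scoring documents with a 5-gram Kneser-Ney model via KenLM. A minimal sketch of that scoring step, assuming the `kenlm` Python bindings are installed and a Wikipedia-trained model file (e.g. from edugp/kenlm) has been downloaded; the path is hypothetical, and real use would also apply whatever tokenization and normalization the model was trained with:

```python
import kenlm

# Hypothetical local path to a 5-gram model trained on English Wikipedia.
model = kenlm.Model("wikipedia_en.arpa.bin")

def doc_perplexity(text: str) -> float:
    # KenLM works in log10 probabilities; Model.perplexity converts the
    # total score to perplexity over the tokens, including the implicit
    # sentence-boundary symbols.
    return model.perplexity(text)

# Lower perplexity is read as a signal of higher quality.
print(doc_perplexity("the quick brown fox jumps over the lazy dog"))
```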
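The sampling strategy takes the first 10k documents from each duplicate-count bucket. A single-pass sketch of that selection, with an assumed per-document field name (`dup_count`) standing in for whatever the actual metadata uses:

```python
# Duplicate-count ranges quoted in the sampling-strategy paragraph (inclusive).
BUCKETS = [(1, 1), (2, 5), (6, 10), (11, 100), (101, 1000), (1001, 30_000_000)]

def sample_buckets(docs, k=10_000):
    """Keep the first k documents (with their metadata) per bucket."""
    samples = {bucket: [] for bucket in BUCKETS}
    for doc in docs:
        for lo, hi in BUCKETS:
            if lo <= doc["dup_count"] <= hi:
                if len(samples[(lo, hi)]) < k:
                    samples[(lo, hi)].append(doc)
                break  # each document falls in exactly one bucket
    return samples
```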
|