# omwdataset / .gitattributes
# Last commit (omkarenator): add better findcc figure
# Commit: dcb73ca
# Git LFS tracking rules.
#
# Every rule below sets the same four attributes:
#   filter=lfs  route file content through the "lfs" clean/smudge filter
#   diff=lfs    use the "lfs" diff driver
#   merge=lfs   use the "lfs" merge driver
#   -text       unset "text" so no end-of-line normalization is applied
#
# Patterns without a slash match that file name in any directory; patterns
# containing a slash are anchored to the directory holding this file.

# --- Figures referenced by bare file name ---
dist_assets_images_fineweb-recipe.png filter=lfs diff=lfs merge=lfs -text
pipeline.png filter=lfs diff=lfs merge=lfs -text
prep-across-diff-dump-dup-counts-global.png filter=lfs diff=lfs merge=lfs -text
prep-across-diff-years-global.png filter=lfs diff=lfs merge=lfs -text
prep-diff-buckets-global.png filter=lfs diff=lfs merge=lfs -text
prep-diff-buckets-local.png filter=lfs diff=lfs merge=lfs -text
prep-vs-dump-dup-global.png filter=lfs diff=lfs merge=lfs -text
# NOTE(review): "perp-" here vs. "prep-" in the sibling names; confirm this
# spelling matches the file on disk, otherwise the rule matches nothing.
perp-across-diff-buckets-global.png filter=lfs diff=lfs merge=lfs -text
prep-across-diff-buckets-local.png filter=lfs diff=lfs merge=lfs -text
prep-across-diff-docs-dup-count-global.png filter=lfs diff=lfs merge=lfs -text
cc.png filter=lfs diff=lfs merge=lfs -text
image3.png filter=lfs diff=lfs merge=lfs -text
image7.png filter=lfs diff=lfs merge=lfs -text
prep-diff-dump-dump-counts-local.png filter=lfs diff=lfs merge=lfs -text
prep-vs-dump-dup-local.png filter=lfs diff=lfs merge=lfs -text
100k.png filter=lfs diff=lfs merge=lfs -text
image9.png filter=lfs diff=lfs merge=lfs -text
prep-across-diff-year-global-dup-buckets.png filter=lfs diff=lfs merge=lfs -text

# --- Files under data/ ---
# NOTE(review): several entries below are .py sources. With these attributes
# they are stored as LFS objects and diffed through the lfs driver; confirm
# that is intended rather than a side effect of tracking everything in data/.
data/sample_doc_stat.json filter=lfs diff=lfs merge=lfs -text
data/sample_en_low.json filter=lfs diff=lfs merge=lfs -text
data/toxic_lines.json filter=lfs diff=lfs merge=lfs -text
data/web_filter_pipeline.json filter=lfs diff=lfs merge=lfs -text
# "[[:space:]]" stands in for each space in the file name, because whitespace
# would otherwise end the pattern.
# NOTE(review): the name looks like a saved browser page; confirm it belongs
# in the repository.
data/mbzuai-llm-us-east-1[[:space:]]-[[:space:]]S3[[:space:]]bucket[[:space:]]_[[:space:]]S3[[:space:]]_[[:space:]]us-east-1.mhtml filter=lfs diff=lfs merge=lfs -text
data/url_blocklist.py filter=lfs diff=lfs merge=lfs -text
data/sample_top_ngram.json filter=lfs diff=lfs merge=lfs -text
data/dataset_inclusion.csv filter=lfs diff=lfs merge=lfs -text
data/line_info.json filter=lfs diff=lfs merge=lfs -text
data/non_web_urls.py filter=lfs diff=lfs merge=lfs -text
data/web_pipeline_comparison.csv filter=lfs diff=lfs merge=lfs -text
data/all_signals.json filter=lfs diff=lfs merge=lfs -text
data/dataset_inclusion_size.csv filter=lfs diff=lfs merge=lfs -text
data/sample_java.jsonl filter=lfs diff=lfs merge=lfs -text
data/sample_warc.json filter=lfs diff=lfs merge=lfs -text
data/sample_wet.json filter=lfs diff=lfs merge=lfs -text
# NOTE(review): if curated_samples is a directory, this rule does not apply
# to the files inside it (use "data/curated_samples/**" for that); verify.
data/curated_samples filter=lfs diff=lfs merge=lfs -text
data/dataset_details.csv filter=lfs diff=lfs merge=lfs -text
data/sample.py filter=lfs diff=lfs merge=lfs -text
data/sample_dup_ngram.json filter=lfs diff=lfs merge=lfs -text
data/sample_non_en.json filter=lfs diff=lfs merge=lfs -text
data/sample_terminal_punc.json filter=lfs diff=lfs merge=lfs -text
# data/__pycache__ is deliberately not listed: a pattern naming a directory
# does not apply to the files inside it, and interpreter bytecode caches
# should be excluded via .gitignore rather than stored in LFS.
data/bad_url_doc.jsonl filter=lfs diff=lfs merge=lfs -text
data/cluster_dist.json filter=lfs diff=lfs merge=lfs -text
data/lorem_ipsum.json filter=lfs diff=lfs merge=lfs -text
data/repeat_line_frac.jsonl filter=lfs diff=lfs merge=lfs -text
data/sample_url_exclusion.json filter=lfs diff=lfs merge=lfs -text
data/meta_non_web.py filter=lfs diff=lfs merge=lfs -text
data/sample_bad_urls.py filter=lfs diff=lfs merge=lfs -text
data/sample_refinedweb_line.json filter=lfs diff=lfs merge=lfs -text

# --- Files under images/ ---
images/llm360_logo.png filter=lfs diff=lfs merge=lfs -text
images/findcc.svg filter=lfs diff=lfs merge=lfs -text