TxT360

Sleeping

App Files Files Community

victormiller committed on Oct 2, 2024

Commit

5e5aef1

•

1 Parent(s): e8dab56

Update curated.py

Browse files

Files changed (1) hide show

curated.py +363 -13

curated.py CHANGED Viewed

@@ -74,7 +74,7 @@ wikipedia_filter = pd.DataFrame(
             "Percent Removed After Unigram Probability Filter": [
                 "0.00%",
             ],
-            "Lines Remaining After Local Dedup": [
                 "",
             ],
             "Total Percentage Remaining": [
@@ -86,6 +86,356 @@ wikipedia_filter = pd.DataFrame(
 table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
 table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
 filtering_process = Div(
     Section(
@@ -139,7 +489,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
     ),
     Section(
         H3("S2ORC"),
@@ -174,7 +524,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
         ),
     ),
     Section(
         H3("PubMed"),
@@ -203,7 +553,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
     ),
     Section(
         H3("Phil Papers"),
@@ -226,7 +576,7 @@ filtering_process = Div(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
     ),
     Section(
         H3("Europarl"),
@@ -248,7 +598,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("HackerNews"),
@@ -273,7 +623,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("USPTO"),
@@ -297,7 +647,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("FreeLaw"),
@@ -325,7 +675,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("StackExchange"),
@@ -358,7 +708,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("Ubuntu IRC"),
@@ -382,7 +732,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
     Section(
         H3("DM Maths"),
@@ -403,7 +753,7 @@ filtering_process = Div(
         Ol(
             Li("None"),
         ),
     ),
     Section(
         H3("PG19"),
@@ -425,7 +775,7 @@ filtering_process = Div(
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
     ),
 )

             "Percent Removed After Unigram Probability Filter": [
                 "0.00%",
             ],
+            "Percent Removed After Local Dedup": [
                 "",
             ],
             "Total Percentage Remaining": [
 table_html_wikipedia = wikipedia_filter.to_html(index=False, border=0)
 table_div_wikipedia = Div(NotStr(table_html_wikipedia), style="margin: 40px;")
+freelaw_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "FreeLaw" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real FreeLaw values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_freelaw = freelaw_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_freelaw = Div(NotStr(table_html_freelaw), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+dmm_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "DM Maths" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real DM Maths values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_dmm = dmm_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_dmm = Div(NotStr(table_html_dmm), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+uspto_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "USPTO" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real USPTO values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_uspto = uspto_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_uspto = Div(NotStr(table_html_uspto), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+pg19_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "PG19" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real PG19 values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_pg19 = pg19_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_pg19 = Div(NotStr(table_html_pg19), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+hn_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "HackerNews" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real HackerNews values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_hn = hn_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_hn = Div(NotStr(table_html_hn), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+uirc_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "Ubuntu IRC" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real Ubuntu IRC values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_uirc = uirc_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_uirc = Div(NotStr(table_html_uirc), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+up_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "Europarl" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real Europarl values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_up = up_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_up = Div(NotStr(table_html_up), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+se_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "StackExchange" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real StackExchange values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_se = se_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_se = Div(NotStr(table_html_se), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+arx_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the first Section of filtering_process (heading not visible in this diff - presumably ArXiv; confirm).
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real values for this dataset.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_arx = arx_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_arx = Div(NotStr(table_html_arx), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+s2o_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "S2ORC" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real S2ORC values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_s2o = s2o_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_s2o = Div(NotStr(table_html_s2o), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+med_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "PubMed" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real PubMed values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_med = med_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_med = Div(NotStr(table_html_med), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
+phil_filter = pd.DataFrame(  # One-row filtering-statistics table; its Div is placed in the "Phil Papers" section of filtering_process.
+        {
+            "Dataset": [
+                "Wikipedia",  # NOTE(review): label and all figures match the other tables added in this commit - presumably copied placeholders; confirm the real Phil Papers values.
+            ],
+            "Lines Downloaded": [
+                "61614907",
+            ],
+            "Percent Removed After Language Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Min Word Count Filter": [
+                "1.86%",
+            ],
+            "Percent Removed After Unigram Probability Filter": [
+                "0.00%",
+            ],
+            "Percent Removed After Local Dedup": [
+                "",  # empty string: no local-dedup figure is recorded here
+            ],
+            "Total Percentage Remaining": [
+                "98.14%",
+            ],
+        }
+    )
+table_html_phil = phil_filter.to_html(index=False, border=0)  # HTML table markup without row index labels
+table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")  # wrap the markup in a Div with a 40px margin
 filtering_process = Div(
     Section(
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
+        table_div_arx,
     ),
     Section(
         H3("S2ORC"),
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup"),
         ),
+        table_div_s2o,
     ),
     Section(
         H3("PubMed"),
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
+        table_div_med,
     ),
     Section(
         H3("Phil Papers"),
         Ol(
             Li("This data was part of paper domain which are combined together and minhash was generated and deduped together with all the datasets after doing local dedup."),
         ),
+        table_div_phil,
     ),
     Section(
         H3("Europarl"),
         Ol(
             Li("After local dedup, remaining europarl was deduped again with all the datasets combined"),
         ),
+        table_div_up,
     ),
     Section(
         H3("HackerNews"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_hn,
     ),
     Section(
         H3("USPTO"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_uspto,
     ),
     Section(
         H3("FreeLaw"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_freelaw,
     ),
     Section(
         H3("StackExchange"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_se,
     ),
     Section(
         H3("Ubuntu IRC"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_uirc,
     ),
     Section(
         H3("DM Maths"),
         Ol(
             Li("None"),
         ),
+        table_div_dmm,
     ),
     Section(
         H3("PG19"),
         Ol(
             Li("After local dedup, remaining data was deduped again with all the datasets combined"),
         ),
+        table_div_pg19,
     ),
 )