fh-new-vm1

Sleeping

App Files Files Community

victormiller committed on Sep 24

Commit

1703b06

•

1 Parent(s): 230ca5c

Update overview

Browse files

Files changed (1) hide show

overview +62 -0

overview CHANGED Viewed

@@ -149,12 +149,74 @@ dataset_comparison = pd.DataFrame(
 table_html = dataset_comparison.to_html(index=False, border=0)
 table_div = Div(NotStr(table_html), style="margin: 40px;")
 def overview():
     return Div(Section(
             H2("Combining the Best of Web and Curated Sources"),
             H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
             P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
             table_div,
             id="section5",
         ),
         id="inner-text",

 table_html = dataset_comparison.to_html(index=False, border=0)
 table_div = Div(NotStr(table_html), style="margin: 40px;")
+dataset_sources = pd.DataFrame(  # per-source statistics table; one row per data source, four string columns
+        {
+            "Data Source": [  # source names; the other three lists are aligned with this one by position (11 entries each)
+                "CommonCrawl",
+                "Papers",
+                "Wikipedia",
+                "Freelaw",
+                "DM Math",
+                "USPTO",
+                "PG-19",
+                "HackerNews",
+                "Ubuntu IRC",
+                "Europarl",
+                "StackExchange",
+            ],
+            "Raw Data Size": [  # sizes kept as display strings with their unit (TB / GB), not as numbers
+                "11 TB",
+                "712 GB",
+                "210 GB",
+                "23 GB",
+                "22 GB",
+                "45 GB",
+                "11 GB",
+                "4.1 GB",
+                "4.7 GB",
+                "6.1 GB",
+                "45 GB",
+            ],
+            "Token Count": [  # token counts as display strings with a T / B suffix
+                "5.71T",
+                "154.96B",
+                "4.75B",
+                "7.34B",
+                "5.23B",
+                "4.95B",
+                "2.94B",
+                "1.08B",
+                "1.54B",
+                "1.96B",
+                "8.37B",
+            ],
+            "Cut-Off Date": [  # free-form strings; formats differ between rows
+                "2024-30",  # NOTE(review): looks like a CommonCrawl snapshot id (year-week) rather than a quarter — confirm
+                "Q4 2023",
+                "-",  # "-" presumably means no cut-off date is recorded for this source — confirm
+                "Q1 2024",
+                "-",
+                "Q4 2023",
+                "-",
+                "Q4 2023",
+                "Q4 2023",
+                "-",
+                "Q4 2023",
+            ],
+        }
+    )
+table_html = dataset_sources.to_html(index=False, border=0)  # index=False omits the row-index column; border=0 sets the <table> border attribute to 0
+table_div1 = Div(NotStr(table_html), style="margin: 40px;")  # 40px-margin wrapper used by overview() under the "Table 2" caption; NotStr presumably keeps the HTML unescaped — confirm
 def overview():
     return Div(Section(
             H2("Combining the Best of Web and Curated Sources"),
             H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
             P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
             table_div,
+            P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented.")
+            table_div1
             id="section5",
         ),
         id="inner-text",