victormiller committed on
Commit
1703b06
1 Parent(s): 230ca5c

Update overview

Browse files
Files changed (1) hide show
  1. overview +62 -0
overview CHANGED
@@ -149,12 +149,74 @@ dataset_comparison = pd.DataFrame(
149
  table_html = dataset_comparison.to_html(index=False, border=0)
150
  table_div = Div(NotStr(table_html), style="margin: 40px;")
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def overview():
153
  return Div(Section(
154
  H2("Combining the Best of Web and Curated Sources"),
155
  H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
156
  P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
157
  table_div,
 
 
158
  id="section5",
159
  ),
160
  id="inner-text",
 
149
  table_html = dataset_comparison.to_html(index=False, border=0)
150
  table_div = Div(NotStr(table_html), style="margin: 40px;")
151
 
152
+ dataset_sources = pd.DataFrame(
153
+ {
154
+ "Data Source": [
155
+ "CommonCrawl",
156
+ "Papers",
157
+ "Wikipedia",
158
+ "Freelaw",
159
+ "DM Math",
160
+ "USPTO",
161
+ "PG-19",
162
+ "HackerNews",
163
+ "Ubuntu IRC",
164
+ "Europarl",
165
+ "StackExchange",
166
+ ],
167
+ "Raw Data Size": [
168
+ "11 TB",
169
+ "712 GB",
170
+ "210 GB",
171
+ "23 GB",
172
+ "22 GB",
173
+ "45 GB",
174
+ "11 GB",
175
+ "4.1 GB",
176
+ "4.7 GB",
177
+ "6.1 GB",
178
+ "45 GB",
179
+ ],
180
+ "Token Count": [
181
+ "5.71T",
182
+ "154.96B",
183
+ "4.75B",
184
+ "7.34B",
185
+ "5.23B",
186
+ "4.95B",
187
+ "2.94B",
188
+ "1.08B",
189
+ "1.54B",
190
+ "1.96B",
191
+ "8.37B",
192
+ ],
193
+ "Cut-Off Date": [
194
+ "2024-30",
195
+ "Q4 2023",
196
+ "-",
197
+ "Q1 2024",
198
+ "-",
199
+ "Q4 2023",
200
+ "-",
201
+ "Q4 2023",
202
+ "Q4 2023",
203
+ "-",
204
+ "Q4 2023",
205
+ ],
206
+ }
207
+ )
208
+
209
+ table_html = dataset_sources.to_html(index=False, border=0)
210
+ table_div1 = Div(NotStr(table_html), style="margin: 40px;")
211
+
212
  def overview():
213
  return Div(Section(
214
  H2("Combining the Best of Web and Curated Sources"),
215
  H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
216
  P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
217
  table_div,
218
+ P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented."),
219
+ table_div1,
220
  id="section5",
221
  ),
222
  id="inner-text",