Spaces:
Sleeping
Sleeping
victormiller
committed on
Commit
•
1703b06
1
Parent(s):
230ca5c
Update overview
Browse files
overview
CHANGED
@@ -149,12 +149,74 @@ dataset_comparison = pd.DataFrame(
|
|
149 |
table_html = dataset_comparison.to_html(index=False, border=0)
|
150 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
def overview():
|
153 |
return Div(Section(
|
154 |
H2("Combining the Best of Web and Curated Sources"),
|
155 |
H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
|
156 |
P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
157 |
table_div,
|
|
|
|
|
158 |
id="section5",
|
159 |
),
|
160 |
id="inner-text",
|
|
|
149 |
table_html = dataset_comparison.to_html(index=False, border=0)
|
150 |
table_div = Div(NotStr(table_html), style="margin: 40px;")
|
151 |
|
152 |
+
dataset_sources = pd.DataFrame(
|
153 |
+
{
|
154 |
+
"Data Source": [
|
155 |
+
"CommonCrawl",
|
156 |
+
"Papers",
|
157 |
+
"Wikipedia",
|
158 |
+
"Freelaw",
|
159 |
+
"DM Math",
|
160 |
+
"USPTO",
|
161 |
+
"PG-19",
|
162 |
+
"HackerNews",
|
163 |
+
"Ubuntu IRC",
|
164 |
+
"Europarl",
|
165 |
+
"StackExchange",
|
166 |
+
],
|
167 |
+
"Raw Data Size": [
|
168 |
+
"11 TB",
|
169 |
+
"712 GB",
|
170 |
+
"210 GB",
|
171 |
+
"23 GB",
|
172 |
+
"22 GB",
|
173 |
+
"45 GB",
|
174 |
+
"11 GB",
|
175 |
+
"4.1 GB",
|
176 |
+
"4.7 GB",
|
177 |
+
"6.1 GB",
|
178 |
+
"45 GB",
|
179 |
+
],
|
180 |
+
"Token Count": [
|
181 |
+
"5.71T",
|
182 |
+
"154.96B",
|
183 |
+
"4.75B",
|
184 |
+
"7.34B",
|
185 |
+
"5.23B",
|
186 |
+
"4.95B",
|
187 |
+
"2.94B",
|
188 |
+
"1.08B",
|
189 |
+
"1.54B",
|
190 |
+
"1.96B",
|
191 |
+
"8.37B",
|
192 |
+
],
|
193 |
+
"Cut-Off Date": [
|
194 |
+
"2024-30",
|
195 |
+
"Q4 2023",
|
196 |
+
"-",
|
197 |
+
"Q1 2024",
|
198 |
+
"-",
|
199 |
+
"Q4 2023",
|
200 |
+
"-",
|
201 |
+
"Q4 2023",
|
202 |
+
"Q4 2023",
|
203 |
+
"-",
|
204 |
+
"Q4 2023",
|
205 |
+
],
|
206 |
+
}
|
207 |
+
)
|
208 |
+
|
209 |
+
table_html = dataset_sources.to_html(index=False, border=0)
|
210 |
+
table_div1 = Div(NotStr(table_html), style="margin: 40px;")
|
211 |
+
|
212 |
def overview():
|
213 |
return Div(Section(
|
214 |
H2("Combining the Best of Web and Curated Sources"),
|
215 |
H3("Why combine the web and highly curated sources? Isn't the web-only data enough?"),
|
216 |
P("Table 1: TxT360 combines both the web data and highly-curated sources, which none of the existing datasets have covered. The following table shows TxT360 and other well-known datasets on the coverage and size of data sources."),
|
217 |
table_div,
|
218 |
+
P("Table 2: Statistics of TxT360. The basic statistics of TxT360 are presented."),
|
219 |
+
table_div1,
|
220 |
id="section5",
|
221 |
),
|
222 |
id="inner-text",
|