victormiller committed on
Commit
24b53c0
1 Parent(s): ee3ad0f

Update curated.py

Browse files
Files changed (1) hide show
  1. curated.py +32 -3
curated.py CHANGED
@@ -9,12 +9,41 @@ from rich import print
9
  import uuid
10
  import plotly.express as px
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
13
  copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
14
 
15
  local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
16
 
17
-
18
  treemap_data = {
19
  'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
20
  'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
@@ -467,7 +496,7 @@ def curated(request):
467
  table_html = preprocessing_steps.to_html(index=False, border=0)
468
  table_div = Div(NotStr(table_html), style="margin: 40px;")
469
  data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
470
-
471
  return Div(
472
  H2("Curated Sources: Overview"),
473
  overview_text,
@@ -475,7 +504,7 @@ def curated(request):
475
  plotly2fasthtml(treemap_chart),
476
  table_desc,
477
  H2("Curated Sources: Data Gathering and Filtering"),
478
- H3("Data Acquisition"),
479
  data_preparation_div,
480
  H3("Data Filtering"),
481
  data_preprocessing_div,
 
9
  import uuid
10
  import plotly.express as px
11
 
12
+ filtering_process = Div(
13
+ Section(
14
+ H3("Title"),
15
+ H4("Download and Extraction"),
16
+ Ol(
17
+ Li("one"),
18
+ Li("two"),
19
+ ),
20
+ H4("Filtering"),
21
+ Ol(
22
+ Li("one"),
23
+ Li("two"),
24
+ ),
25
+ H4("Local Deduplication Process"),
26
+ Ol(
27
+ Li("one"),
28
+ Li("two"),
29
+ ),
30
+ H4("Global Deduplication Process"),
31
+ Ol(
32
+ Li("one"),
33
+ Li("two"),
34
+ ),
35
+
36
+ ),
37
+ )
38
+
39
+
40
+
41
+
42
  overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
43
  copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
44
 
45
  local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
46
 
 
47
  treemap_data = {
48
  'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
49
  'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
 
496
  table_html = preprocessing_steps.to_html(index=False, border=0)
497
  table_div = Div(NotStr(table_html), style="margin: 40px;")
498
  data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
499
+
500
  return Div(
501
  H2("Curated Sources: Overview"),
502
  overview_text,
 
504
  plotly2fasthtml(treemap_chart),
505
  table_desc,
506
  H2("Curated Sources: Data Gathering and Filtering"),
507
+ filtering_process,
508
  data_preparation_div,
509
  H3("Data Filtering"),
510
  data_preprocessing_div,