omwdataset

Running

App Files Community

victormiller committed on 16 days ago

Commit

24b53c0

•

1 Parent(s): ee3ad0f

Update curated.py

Browse files

Files changed (1) hide show

curated.py +32 -3

curated.py CHANGED Viewed

@@ -9,12 +9,41 @@ from rich import print
 import uuid
 import plotly.express as px
 overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
 copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
 local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
 treemap_data = {
   'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
   'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
@@ -467,7 +496,7 @@ def curated(request):
     table_html = preprocessing_steps.to_html(index=False, border=0)
     table_div = Div(NotStr(table_html), style="margin: 40px;")
     data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
     return Div(
             H2("Curated Sources: Overview"),
             overview_text,
@@ -475,7 +504,7 @@ def curated(request):
             plotly2fasthtml(treemap_chart),
             table_desc,
             H2("Curated Sources: Data Gathering and Filtering"),
-            H3("Data Acquisition"),
             data_preparation_div,
             H3("Data Filtering"),
             data_preprocessing_div,

 import uuid
 import plotly.express as px
+filtering_process = Div(
+    Section(
+        H3("Title"),
+        H4("Download and Extraction"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Filtering"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Local Deduplication Process"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+        H4("Global Deduplication Process"),
+        Ol(
+            Li("one"),
+            Li("two"),
+        ),
+    ),
+)
 overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
 copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
 local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
 treemap_data = {
   'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
   'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
     table_html = preprocessing_steps.to_html(index=False, border=0)
     table_div = Div(NotStr(table_html), style="margin: 40px;")
     data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
     return Div(
             H2("Curated Sources: Overview"),
             overview_text,
             plotly2fasthtml(treemap_chart),
             table_desc,
             H2("Curated Sources: Data Gathering and Filtering"),
+            filtering_process,
             data_preparation_div,
             H3("Data Filtering"),
             data_preprocessing_div,