Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Update curated.py
Browse files- curated.py +32 -3
 
    	
        curated.py
    CHANGED
    
    | 
         @@ -9,12 +9,41 @@ from rich import print 
     | 
|
| 9 | 
         
             
            import uuid
         
     | 
| 10 | 
         
             
            import plotly.express as px
         
     | 
| 11 | 
         | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 12 | 
         
             
            overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
         
     | 
| 13 | 
         
             
            copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
         
     | 
| 14 | 
         | 
| 15 | 
         
             
            local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
         
     | 
| 16 | 
         | 
| 17 | 
         
            -
             
     | 
| 18 | 
         
             
            treemap_data = {
         
     | 
| 19 | 
         
             
              'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
         
     | 
| 20 | 
         
             
              'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
         
     | 
| 
         @@ -467,7 +496,7 @@ def curated(request): 
     | 
|
| 467 | 
         
             
                table_html = preprocessing_steps.to_html(index=False, border=0)
         
     | 
| 468 | 
         
             
                table_div = Div(NotStr(table_html), style="margin: 40px;")
         
     | 
| 469 | 
         
             
                data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
         
     | 
| 470 | 
         
            -
             
     | 
| 471 | 
         
             
                return Div(
         
     | 
| 472 | 
         
             
                        H2("Curated Sources: Overview"),
         
     | 
| 473 | 
         
             
                        overview_text,
         
     | 
| 
         @@ -475,7 +504,7 @@ def curated(request): 
     | 
|
| 475 | 
         
             
                        plotly2fasthtml(treemap_chart),
         
     | 
| 476 | 
         
             
                        table_desc,
         
     | 
| 477 | 
         
             
                        H2("Curated Sources: Data Gathering and Filtering"),
         
     | 
| 478 | 
         
            -
                         
     | 
| 479 | 
         
             
                        data_preparation_div,
         
     | 
| 480 | 
         
             
                        H3("Data Filtering"),
         
     | 
| 481 | 
         
             
                        data_preprocessing_div,
         
     | 
| 
         | 
|
| 9 | 
         
             
            import uuid
         
     | 
| 10 | 
         
             
            import plotly.express as px
         
     | 
| 11 | 
         | 
| 12 | 
         
            +
            filtering_process = Div(
         
     | 
| 13 | 
         
            +
                Section(
         
     | 
| 14 | 
         
            +
                    H3("Title"),
         
     | 
| 15 | 
         
            +
                    H4("Download and Extraction"),
         
     | 
| 16 | 
         
            +
                    Ol(
         
     | 
| 17 | 
         
            +
                        Li("one"),
         
     | 
| 18 | 
         
            +
                        Li("two"),
         
     | 
| 19 | 
         
            +
                    ),
         
     | 
| 20 | 
         
            +
                    H4("Filtering"),
         
     | 
| 21 | 
         
            +
                    Ol(
         
     | 
| 22 | 
         
            +
                        Li("one"),
         
     | 
| 23 | 
         
            +
                        Li("two"),
         
     | 
| 24 | 
         
            +
                    ),
         
     | 
| 25 | 
         
            +
                    H4("Local Deduplication Process"),
         
     | 
| 26 | 
         
            +
                    Ol(
         
     | 
| 27 | 
         
            +
                        Li("one"),
         
     | 
| 28 | 
         
            +
                        Li("two"),
         
     | 
| 29 | 
         
            +
                    ),
         
     | 
| 30 | 
         
            +
                    H4("Global Deduplication Process"),
         
     | 
| 31 | 
         
            +
                    Ol(
         
     | 
| 32 | 
         
            +
                        Li("one"),
         
     | 
| 33 | 
         
            +
                        Li("two"),
         
     | 
| 34 | 
         
            +
                    ),
         
     | 
| 35 | 
         
            +
                    
         
     | 
| 36 | 
         
            +
                ),
         
     | 
| 37 | 
         
            +
            )
         
     | 
| 38 | 
         
            +
             
     | 
| 39 | 
         
            +
             
     | 
| 40 | 
         
            +
             
     | 
| 41 | 
         
            +
             
     | 
| 42 | 
         
             
            overview_text = P("Curated sources comprise high-quality datasets that contain domain-specificity. These sources, such as Arxiv, Wikipedia, and Stack Exchange, provide valuable data that is excluded from the web dataset mentioned above. Analyzing and processing non-web data can yield insights and opportunities for various applications. Details about each of the sources are provided below. ")
         
     | 
| 43 | 
         
             
            copyright_disclaimer = P("We respect the copyright of the data sources and have not included the controversial data that was used in Pile like YouTube and Opensubtitles, Reddit threads, and books.")
         
     | 
| 44 | 
         | 
| 45 | 
         
             
            local_dedup_text = P("Each curated data source has been prepared using its specific rules and has been locally deduped using min-hash near deduplication. Details about the dataset are shown below in the table:")
         
     | 
| 46 | 
         | 
| 
         | 
|
| 47 | 
         
             
            treemap_data = {
         
     | 
| 48 | 
         
             
              'Source': ['ArXiv', 'PubMed Central', 'PubMed Abstract', 'S2ORC Full Text', 'S2ORC Abstract', 'PhilPapers', 'Wikipedia', 'StackExchange', 'EuroParl', 'Ubuntu IRC', 'Freelaw', 'PG19', 'USPTO', 'HackerNews', 'DM Maths'],
         
     | 
| 49 | 
         
             
              'Category': ['Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Papers', 'Internet', 'Conversational', 'Legal/Formal', 'Conversational', 'Legal/Formal', 'Books', 'Legal/Formal', 'Conversational', 'Reasoning'],
         
     | 
| 
         | 
|
| 496 | 
         
             
                table_html = preprocessing_steps.to_html(index=False, border=0)
         
     | 
| 497 | 
         
             
                table_div = Div(NotStr(table_html), style="margin: 40px;")
         
     | 
| 498 | 
         
             
                data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div)
         
     | 
| 499 | 
         
            +
                
         
     | 
| 500 | 
         
             
                return Div(
         
     | 
| 501 | 
         
             
                        H2("Curated Sources: Overview"),
         
     | 
| 502 | 
         
             
                        overview_text,
         
     | 
| 
         | 
|
| 504 | 
         
             
                        plotly2fasthtml(treemap_chart),
         
     | 
| 505 | 
         
             
                        table_desc,
         
     | 
| 506 | 
         
             
                        H2("Curated Sources: Data Gathering and Filtering"),
         
     | 
| 507 | 
         
            +
                        filtering_process,
         
     | 
| 508 | 
         
             
                        data_preparation_div,
         
     | 
| 509 | 
         
             
                        H3("Data Filtering"),
         
     | 
| 510 | 
         
             
                        data_preprocessing_div,
         
     |