victormiller
committed on
Update curated.py
Browse files- curated.py +5 -280
curated.py
CHANGED
@@ -46,19 +46,12 @@ treemap_data = {
|
|
46 |
'Deep Mind Maths dataset with generated questions.'
|
47 |
]
|
48 |
}
|
49 |
-
# Calculate percentage for each data source
|
50 |
total_count = sum(treemap_data['Count'])
|
51 |
treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
|
52 |
-
|
53 |
-
# Create treemap
|
54 |
fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
|
55 |
-
|
56 |
-
# Set the size of the chart
|
57 |
-
|
58 |
-
|
59 |
-
# Display treemap; if you want to update the size, call fig.update_layout(width=800, height=600)
|
60 |
treemap_chart = fig
|
61 |
|
|
|
62 |
wikipedia_filter = pd.DataFrame(
|
63 |
{
|
64 |
"Dataset": [
|
@@ -438,291 +431,23 @@ phil_filter = pd.DataFrame(
|
|
438 |
|
439 |
table_html_phil = phil_filter.to_html(index=False, border=0)
|
440 |
table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")
|
|
|
441 |
|
442 |
-
data_sources = [
|
443 |
-
"Freelaw",
|
444 |
-
"Wikipedia",
|
445 |
-
"PhilPapers",
|
446 |
-
"Arxiv",
|
447 |
-
"S2ORC",
|
448 |
-
"S2ORC Abstract",
|
449 |
-
"Pubmed",
|
450 |
-
"USPTO",
|
451 |
-
"Hackernews",
|
452 |
-
"Ubuntu IRC",
|
453 |
-
"StackExchange",
|
454 |
-
"DM Maths",
|
455 |
-
"PG19",
|
456 |
-
"Europarl",
|
457 |
-
]
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
def get_wiki_data(data_source: str = "Wikipedia", doc_id: int = 3, target: str = "foo"):
|
462 |
-
doc_id = max(0, min(int(doc_id), 9))
|
463 |
-
|
464 |
-
if data_source == "Wikipedia":
|
465 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
466 |
-
open("data/curated_samples/wiki.json")
|
467 |
-
)
|
468 |
-
else:
|
469 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
470 |
-
|
471 |
-
raw_json = raw_sample_doc[doc_id]
|
472 |
-
extracted_json = extracted_sample_doc[doc_id]
|
473 |
-
return view_data(
|
474 |
-
raw_json,
|
475 |
-
extracted_json,
|
476 |
-
doc_id=doc_id,
|
477 |
-
data_source="Wikipedia",
|
478 |
-
data_sources="Wikipedia",
|
479 |
-
target=target,
|
480 |
-
)
|
481 |
-
|
482 |
-
wiki_examples = Div(
|
483 |
-
Div(
|
484 |
-
get_wiki_data(target=gen_random_id()),
|
485 |
-
style="border: 1px solid #ccc; padding: 20px;",
|
486 |
-
),
|
487 |
-
)
|
488 |
|
|
|
489 |
wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
|
490 |
-
|
491 |
-
|
492 |
-
def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
|
493 |
-
doc_id = max(0, min(int(doc_id), 9))
|
494 |
-
|
495 |
-
if data_source == "Freelaw":
|
496 |
-
raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
|
497 |
-
extracted_sample_doc = json.load(
|
498 |
-
open("data/curated_samples/freelaw_extract.json")
|
499 |
-
)
|
500 |
-
else:
|
501 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
502 |
-
|
503 |
-
raw_json = raw_sample_doc[doc_id]
|
504 |
-
extracted_json = extracted_sample_doc[doc_id]
|
505 |
-
return view_data(
|
506 |
-
raw_json,
|
507 |
-
extracted_json,
|
508 |
-
doc_id=doc_id,
|
509 |
-
data_source="Freelaw",
|
510 |
-
data_sources="Freelaw",
|
511 |
-
target=target,
|
512 |
-
)
|
513 |
-
|
514 |
freelaw_examples = DV2("data/curated_samples/freelaw_raw.json", "data/curated_samples/freelaw_extract.json", 2)
|
515 |
-
|
516 |
-
def get_se_data(data_source: str = "StackExchange", doc_id: int = 3, target: str = "foo"):
|
517 |
-
doc_id = max(0, min(int(doc_id), 9))
|
518 |
-
|
519 |
-
if data_source == "StackExchange":
|
520 |
-
raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
|
521 |
-
extracted_sample_doc = json.load(
|
522 |
-
open("data/curated_samples/stackexchange_extract.json")
|
523 |
-
)
|
524 |
-
else:
|
525 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
526 |
-
|
527 |
-
raw_json = raw_sample_doc[doc_id]
|
528 |
-
extracted_json = extracted_sample_doc[doc_id]
|
529 |
-
return view_data(
|
530 |
-
raw_json,
|
531 |
-
extracted_json,
|
532 |
-
doc_id=doc_id,
|
533 |
-
data_source="StackExchange",
|
534 |
-
data_sources="StackExchange",
|
535 |
-
target=target,
|
536 |
-
)
|
537 |
-
|
538 |
se_examples = DV2("data/curated_samples/stackexchange_raw.json", "data/curated_samples/stackexchange_extract.json", 3)
|
539 |
-
|
540 |
-
def get_phil_data(data_source: str = "PhilPapers", doc_id: int = 3, target: str = "foo"):
|
541 |
-
doc_id = max(0, min(int(doc_id), 9))
|
542 |
-
|
543 |
-
if data_source == "PhilPapers":
|
544 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
545 |
-
open("data/curated_samples/philpapers_raw.json")
|
546 |
-
)
|
547 |
-
else:
|
548 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
549 |
-
|
550 |
-
raw_json = raw_sample_doc[doc_id]
|
551 |
-
extracted_json = extracted_sample_doc[doc_id]
|
552 |
-
return view_data(
|
553 |
-
raw_json,
|
554 |
-
extracted_json,
|
555 |
-
doc_id=doc_id,
|
556 |
-
data_source="PhilPapers",
|
557 |
-
data_sources="PhilPapers",
|
558 |
-
target=target,
|
559 |
-
)
|
560 |
-
|
561 |
phil_examples = DV("data/curated_samples/philpapers_raw.json", 2, "PhilPapers")
|
562 |
-
|
563 |
-
def get_arx_data(data_source: str = "Arxiv", doc_id: int = 3, target: str = "foo"):
|
564 |
-
doc_id = max(0, min(int(doc_id), 9))
|
565 |
-
|
566 |
-
if data_source == "Arxiv":
|
567 |
-
raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
|
568 |
-
extracted_sample_doc = json.load(
|
569 |
-
open("data/curated_samples/arxiv_extract.json")
|
570 |
-
)
|
571 |
-
else:
|
572 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
573 |
-
|
574 |
-
raw_json = raw_sample_doc[doc_id]
|
575 |
-
extracted_json = extracted_sample_doc[doc_id]
|
576 |
-
return view_data(
|
577 |
-
raw_json,
|
578 |
-
extracted_json,
|
579 |
-
doc_id=doc_id,
|
580 |
-
data_source="Arxiv",
|
581 |
-
data_sources="Arxiv",
|
582 |
-
target=target,
|
583 |
-
)
|
584 |
-
|
585 |
arx_examples = DV2("data/curated_samples/arxiv_raw.json", "data/curated_samples/arxiv_extract.json", 3)
|
586 |
-
|
587 |
-
def get_S2ORC_data(data_source: str = "S2ORC", doc_id: int = 3, target: str = "foo"):
|
588 |
-
doc_id = max(0, min(int(doc_id), 9))
|
589 |
-
|
590 |
-
if data_source == "S2ORC":
|
591 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
592 |
-
open("data/curated_samples/s2orc_raw.json")
|
593 |
-
)
|
594 |
-
else:
|
595 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
596 |
-
|
597 |
-
raw_json = raw_sample_doc[doc_id]
|
598 |
-
extracted_json = extracted_sample_doc[doc_id]
|
599 |
-
return view_data(
|
600 |
-
raw_json,
|
601 |
-
extracted_json,
|
602 |
-
doc_id=doc_id,
|
603 |
-
data_source="S2ORC",
|
604 |
-
data_sources="S2ORC",
|
605 |
-
target=target,
|
606 |
-
)
|
607 |
-
|
608 |
s2o_examples = DV("data/curated_samples/s2orc_raw.json", 0, "S2ORC")
|
609 |
-
|
610 |
-
def get_S2ORCA_data(data_source: str = "S2ORC Abstract", doc_id: int = 3, target: str = "foo"):
|
611 |
-
doc_id = max(0, min(int(doc_id), 9))
|
612 |
-
|
613 |
-
if data_source == "S2ORC":
|
614 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
615 |
-
open("data/curated_samples/s2orc_abstract_raw.json")
|
616 |
-
)
|
617 |
-
else:
|
618 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
619 |
-
|
620 |
-
raw_json = raw_sample_doc[doc_id]
|
621 |
-
extracted_json = extracted_sample_doc[doc_id]
|
622 |
-
return view_data(
|
623 |
-
raw_json,
|
624 |
-
extracted_json,
|
625 |
-
doc_id=doc_id,
|
626 |
-
data_source="S2ORC Abstract",
|
627 |
-
data_sources="S2ORC Abstract",
|
628 |
-
target=target,
|
629 |
-
)
|
630 |
-
|
631 |
s2oa_examples = DV("data/curated_samples/s2orc_abstract_raw.json", 0, "S2ORC Abstract")
|
632 |
-
|
633 |
-
def get_pubmed_data(data_source: str = "Pubmed", doc_id: int = 3, target: str = "foo"):
|
634 |
-
doc_id = max(0, min(int(doc_id), 9))
|
635 |
-
|
636 |
-
if data_source == "Pubmed":
|
637 |
-
raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
|
638 |
-
extracted_sample_doc = json.load(
|
639 |
-
open("data/curated_samples/pubmed_extract.json")
|
640 |
-
)
|
641 |
-
else:
|
642 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
643 |
-
|
644 |
-
raw_json = raw_sample_doc[doc_id]
|
645 |
-
extracted_json = extracted_sample_doc[doc_id]
|
646 |
-
return view_data(
|
647 |
-
raw_json,
|
648 |
-
extracted_json,
|
649 |
-
doc_id=doc_id,
|
650 |
-
data_source="Pubmed",
|
651 |
-
data_sources="Pubmed",
|
652 |
-
target=target,
|
653 |
-
)
|
654 |
-
|
655 |
pubmed_examples = DV2("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json", 3)
|
656 |
-
|
657 |
-
def get_dmm_data(data_source: str = "DM Math", doc_id: int = 3, target: str = "foo"):
|
658 |
-
doc_id = max(0, min(int(doc_id), 9))
|
659 |
-
|
660 |
-
if data_source == "DM Math":
|
661 |
-
raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
|
662 |
-
extracted_sample_doc = json.load(
|
663 |
-
open("data/curated_samples/dm_maths_extract.json")
|
664 |
-
)
|
665 |
-
else:
|
666 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
667 |
-
|
668 |
-
raw_json = raw_sample_doc[doc_id]
|
669 |
-
extracted_json = extracted_sample_doc[doc_id]
|
670 |
-
return view_data(
|
671 |
-
raw_json,
|
672 |
-
extracted_json,
|
673 |
-
doc_id=doc_id,
|
674 |
-
data_source="DM Math",
|
675 |
-
data_sources="DM Math",
|
676 |
-
target=target,
|
677 |
-
)
|
678 |
-
|
679 |
dmm_examples = DV2("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json", 3)
|
680 |
-
|
681 |
-
def get_pg19_data(data_source: str = "PG19", doc_id: int = 3, target: str = "foo"):
|
682 |
-
doc_id = max(0, min(int(doc_id), 9))
|
683 |
-
|
684 |
-
if data_source == "PG19":
|
685 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
686 |
-
open("data/curated_samples/pg19_raw.json")
|
687 |
-
)
|
688 |
-
else:
|
689 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
690 |
-
|
691 |
-
raw_json = raw_sample_doc[doc_id]
|
692 |
-
extracted_json = extracted_sample_doc[doc_id]
|
693 |
-
return view_data(
|
694 |
-
raw_json,
|
695 |
-
extracted_json,
|
696 |
-
doc_id=doc_id,
|
697 |
-
data_source="PG19",
|
698 |
-
data_sources="PG19",
|
699 |
-
target=target,
|
700 |
-
)
|
701 |
-
|
702 |
pg19_examples = DV("data/curated_samples/pg19_raw.json", 0, "PG19")
|
703 |
-
|
704 |
-
def get_eu_data(data_source: str = "Europarl", doc_id: int = 3, target: str = "foo"):
|
705 |
-
doc_id = max(0, min(int(doc_id), 9))
|
706 |
-
|
707 |
-
if data_source == "Europarl":
|
708 |
-
raw_sample_doc = extracted_sample_doc = json.load(
|
709 |
-
open("data/curated_samples/europarl_raw.json")
|
710 |
-
)
|
711 |
-
else:
|
712 |
-
raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]
|
713 |
-
|
714 |
-
raw_json = raw_sample_doc[doc_id]
|
715 |
-
extracted_json = extracted_sample_doc[doc_id]
|
716 |
-
return view_data(
|
717 |
-
raw_json,
|
718 |
-
extracted_json,
|
719 |
-
doc_id=doc_id,
|
720 |
-
data_source="Europarl",
|
721 |
-
data_sources="Europarl",
|
722 |
-
target=target,
|
723 |
-
)
|
724 |
-
|
725 |
eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
|
|
|
|
|
726 |
|
727 |
filtering_process = Div(
|
728 |
Section(
|
|
|
46 |
'Deep Mind Maths dataset with generated questions.'
|
47 |
]
|
48 |
}
|
|
|
49 |
total_count = sum(treemap_data['Count'])
|
50 |
treemap_data['Percentage'] = [count / total_count * 100 for count in treemap_data['Count']]
|
|
|
|
|
51 |
fig = px.treemap(treemap_data, path=['Category', 'Source'], values='Count', hover_data=['Details', 'Percentage'], hover_name='Source')
|
|
|
|
|
|
|
|
|
|
|
52 |
treemap_chart = fig
|
53 |
|
54 |
+
# start individual tables showing filtering
|
55 |
wikipedia_filter = pd.DataFrame(
|
56 |
{
|
57 |
"Dataset": [
|
|
|
431 |
|
432 |
table_html_phil = phil_filter.to_html(index=False, border=0)
|
433 |
table_div_phil = Div(NotStr(table_html_phil), style="margin: 40px;")
|
434 |
+
## end individual tables showing filtering
|
435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
436 |
|
437 |
+
## start filtered examples
|
438 |
wiki_examples = DV("data/curated_samples/wiki.json", 0, "Wikipedia")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
freelaw_examples = DV2("data/curated_samples/freelaw_raw.json", "data/curated_samples/freelaw_extract.json", 2)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
se_examples = DV2("data/curated_samples/stackexchange_raw.json", "data/curated_samples/stackexchange_extract.json", 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
441 |
phil_examples = DV("data/curated_samples/philpapers_raw.json", 2, "PhilPapers")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
arx_examples = DV2("data/curated_samples/arxiv_raw.json", "data/curated_samples/arxiv_extract.json", 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
s2o_examples = DV("data/curated_samples/s2orc_raw.json", 0, "S2ORC")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
444 |
s2oa_examples = DV("data/curated_samples/s2orc_abstract_raw.json", 0, "S2ORC Abstract")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
445 |
pubmed_examples = DV2("data/curated_samples/pubmed_raw.json", "data/curated_samples/pubmed_extract.json", 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
dmm_examples = DV2("data/curated_samples/dm_maths_raw.json", "data/curated_samples/dm_maths_extract.json", 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
pg19_examples = DV("data/curated_samples/pg19_raw.json", 0, "PG19")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
448 |
eu_examples = DV("data/curated_samples/europarl_raw.json", 0, "Europarl")
|
449 |
+
## end filtered examples
|
450 |
+
|
451 |
|
452 |
filtering_process = Div(
|
453 |
Section(
|