marinone94 committed on
Commit
06d47ea
1 Parent(s): 4abd0b0

add nst sv to hf dataset

Browse files
Files changed (3) hide show
  1. .gitignore +2 -0
  2. eda.ipynb +26 -1005
  3. upload_nst_sv_to_hf_dataset.py +30 -17
.gitignore CHANGED
@@ -2,3 +2,5 @@
2
  .ipynb_checkpoints/
3
  checkpoint*/
4
  wandb/
 
 
 
2
  .ipynb_checkpoints/
3
  checkpoint*/
4
  wandb/
5
+ *parquet
6
+ launch.json
eda.ipynb CHANGED
@@ -250,1023 +250,44 @@
250
  },
251
  {
252
  "cell_type": "code",
253
- "execution_count": 14,
254
- "id": "900052ec",
255
  "metadata": {},
256
  "outputs": [
257
  {
258
  "data": {
259
  "text/plain": [
260
- "['',\n",
261
- " '',\n",
262
- " '',\n",
263
- " '',\n",
264
- " '',\n",
265
- " '',\n",
266
- " '',\n",
267
- " '',\n",
268
- " '',\n",
269
- " '',\n",
270
- " '',\n",
271
- " '',\n",
272
- " '',\n",
273
- " '',\n",
274
- " '',\n",
275
- " '',\n",
276
- " '',\n",
277
- " '',\n",
278
- " '',\n",
279
- " '',\n",
280
- " '',\n",
281
- " '',\n",
282
- " '',\n",
283
- " '',\n",
284
- " '',\n",
285
- " '',\n",
286
- " '',\n",
287
- " '',\n",
288
- " '',\n",
289
- " '',\n",
290
- " '',\n",
291
- " '',\n",
292
- " '',\n",
293
- " '',\n",
294
- " '',\n",
295
- " '',\n",
296
- " '',\n",
297
- " '',\n",
298
- " '',\n",
299
- " '',\n",
300
- " '',\n",
301
- " '',\n",
302
- " '',\n",
303
- " '',\n",
304
- " '',\n",
305
- " '',\n",
306
- " '',\n",
307
- " '',\n",
308
- " '',\n",
309
- " '',\n",
310
- " '',\n",
311
- " '',\n",
312
- " '',\n",
313
- " '',\n",
314
- " '',\n",
315
- " '',\n",
316
- " '',\n",
317
- " '',\n",
318
- " '',\n",
319
- " '',\n",
320
- " '',\n",
321
- " '',\n",
322
- " '',\n",
323
- " '',\n",
324
- " '',\n",
325
- " '',\n",
326
- " '',\n",
327
- " '',\n",
328
- " '',\n",
329
- " '',\n",
330
- " '',\n",
331
- " '',\n",
332
- " '',\n",
333
- " '',\n",
334
- " '',\n",
335
- " '',\n",
336
- " '',\n",
337
- " '',\n",
338
- " '',\n",
339
- " '',\n",
340
- " '',\n",
341
- " '',\n",
342
- " '',\n",
343
- " '',\n",
344
- " '',\n",
345
- " '',\n",
346
- " '',\n",
347
- " '',\n",
348
- " '',\n",
349
- " '',\n",
350
- " '',\n",
351
- " '',\n",
352
- " '',\n",
353
- " '',\n",
354
- " '',\n",
355
- " '',\n",
356
- " '',\n",
357
- " '',\n",
358
- " '',\n",
359
- " '',\n",
360
- " '',\n",
361
- " '',\n",
362
- " '',\n",
363
- " '',\n",
364
- " '',\n",
365
- " '',\n",
366
- " '',\n",
367
- " '',\n",
368
- " '',\n",
369
- " '',\n",
370
- " '',\n",
371
- " '',\n",
372
- " '',\n",
373
- " '',\n",
374
- " '',\n",
375
- " '',\n",
376
- " '',\n",
377
- " '',\n",
378
- " '',\n",
379
- " '',\n",
380
- " '',\n",
381
- " '',\n",
382
- " '',\n",
383
- " '',\n",
384
- " '',\n",
385
- " '',\n",
386
- " '',\n",
387
- " '',\n",
388
- " '',\n",
389
- " '',\n",
390
- " '',\n",
391
- " '',\n",
392
- " '',\n",
393
- " '',\n",
394
- " '',\n",
395
- " '',\n",
396
- " '',\n",
397
- " '',\n",
398
- " '',\n",
399
- " '',\n",
400
- " '',\n",
401
- " '',\n",
402
- " '',\n",
403
- " '',\n",
404
- " '',\n",
405
- " '',\n",
406
- " '',\n",
407
- " '',\n",
408
- " '',\n",
409
- " '',\n",
410
- " '',\n",
411
- " '',\n",
412
- " '',\n",
413
- " '',\n",
414
- " '',\n",
415
- " '',\n",
416
- " '',\n",
417
- " '',\n",
418
- " '',\n",
419
- " '',\n",
420
- " '',\n",
421
- " '',\n",
422
- " '',\n",
423
- " '',\n",
424
- " '',\n",
425
- " '',\n",
426
- " '',\n",
427
- " '',\n",
428
- " '',\n",
429
- " '',\n",
430
- " '',\n",
431
- " '',\n",
432
- " '',\n",
433
- " '',\n",
434
- " '',\n",
435
- " '',\n",
436
- " '',\n",
437
- " '',\n",
438
- " '',\n",
439
- " '',\n",
440
- " '',\n",
441
- " '',\n",
442
- " '',\n",
443
- " '',\n",
444
- " '',\n",
445
- " '',\n",
446
- " '',\n",
447
- " '',\n",
448
- " '',\n",
449
- " '',\n",
450
- " '',\n",
451
- " '',\n",
452
- " '',\n",
453
- " '',\n",
454
- " '',\n",
455
- " '',\n",
456
- " '',\n",
457
- " '',\n",
458
- " '',\n",
459
- " '',\n",
460
- " '',\n",
461
- " '',\n",
462
- " '',\n",
463
- " '',\n",
464
- " '',\n",
465
- " '',\n",
466
- " '',\n",
467
- " '',\n",
468
- " '',\n",
469
- " '',\n",
470
- " '',\n",
471
- " '',\n",
472
- " '',\n",
473
- " '',\n",
474
- " '',\n",
475
- " '',\n",
476
- " '',\n",
477
- " '',\n",
478
- " '',\n",
479
- " '',\n",
480
- " '',\n",
481
- " '',\n",
482
- " '',\n",
483
- " '',\n",
484
- " '',\n",
485
- " '',\n",
486
- " '',\n",
487
- " '',\n",
488
- " '',\n",
489
- " '',\n",
490
- " '',\n",
491
- " '',\n",
492
- " '',\n",
493
- " '',\n",
494
- " '',\n",
495
- " '',\n",
496
- " '',\n",
497
- " '',\n",
498
- " '',\n",
499
- " '',\n",
500
- " '',\n",
501
- " '',\n",
502
- " '',\n",
503
- " '',\n",
504
- " '',\n",
505
- " '',\n",
506
- " '',\n",
507
- " '',\n",
508
- " '',\n",
509
- " '',\n",
510
- " '',\n",
511
- " '',\n",
512
- " '',\n",
513
- " '',\n",
514
- " '',\n",
515
- " '',\n",
516
- " '',\n",
517
- " '',\n",
518
- " '',\n",
519
- " '',\n",
520
- " '',\n",
521
- " '',\n",
522
- " '',\n",
523
- " '',\n",
524
- " '',\n",
525
- " '',\n",
526
- " '',\n",
527
- " '',\n",
528
- " '',\n",
529
- " '',\n",
530
- " '',\n",
531
- " '',\n",
532
- " '',\n",
533
- " '',\n",
534
- " '',\n",
535
- " '',\n",
536
- " '',\n",
537
- " '',\n",
538
- " '',\n",
539
- " '',\n",
540
- " '',\n",
541
- " '',\n",
542
- " '',\n",
543
- " '',\n",
544
- " '',\n",
545
- " '',\n",
546
- " '',\n",
547
- " '',\n",
548
- " '',\n",
549
- " '',\n",
550
- " '',\n",
551
- " '',\n",
552
- " '',\n",
553
- " '',\n",
554
- " '',\n",
555
- " '',\n",
556
- " '',\n",
557
- " '',\n",
558
- " '',\n",
559
- " '',\n",
560
- " '',\n",
561
- " '',\n",
562
- " '',\n",
563
- " '',\n",
564
- " '',\n",
565
- " '',\n",
566
- " '',\n",
567
- " '',\n",
568
- " '',\n",
569
- " '',\n",
570
- " '',\n",
571
- " '',\n",
572
- " '',\n",
573
- " '',\n",
574
- " '',\n",
575
- " '',\n",
576
- " '',\n",
577
- " '',\n",
578
- " '',\n",
579
- " '',\n",
580
- " '',\n",
581
- " '',\n",
582
- " '',\n",
583
- " '',\n",
584
- " '',\n",
585
- " '',\n",
586
- " '',\n",
587
- " '',\n",
588
- " '',\n",
589
- " '',\n",
590
- " '',\n",
591
- " '',\n",
592
- " '',\n",
593
- " '',\n",
594
- " '',\n",
595
- " '',\n",
596
- " '',\n",
597
- " '',\n",
598
- " '',\n",
599
- " '',\n",
600
- " '',\n",
601
- " '',\n",
602
- " '',\n",
603
- " '',\n",
604
- " '',\n",
605
- " '',\n",
606
- " '',\n",
607
- " '',\n",
608
- " '',\n",
609
- " '',\n",
610
- " '',\n",
611
- " '',\n",
612
- " '',\n",
613
- " '',\n",
614
- " '',\n",
615
- " '',\n",
616
- " '',\n",
617
- " '',\n",
618
- " '',\n",
619
- " '',\n",
620
- " '',\n",
621
- " '',\n",
622
- " '',\n",
623
- " '',\n",
624
- " '',\n",
625
- " '',\n",
626
- " '',\n",
627
- " '',\n",
628
- " '',\n",
629
- " '',\n",
630
- " '',\n",
631
- " '',\n",
632
- " '',\n",
633
- " '',\n",
634
- " '',\n",
635
- " '',\n",
636
- " '',\n",
637
- " '',\n",
638
- " '',\n",
639
- " '',\n",
640
- " '',\n",
641
- " '',\n",
642
- " '',\n",
643
- " '',\n",
644
- " '',\n",
645
- " '',\n",
646
- " '',\n",
647
- " '',\n",
648
- " '',\n",
649
- " '',\n",
650
- " '',\n",
651
- " '',\n",
652
- " '',\n",
653
- " '',\n",
654
- " '',\n",
655
- " '',\n",
656
- " '',\n",
657
- " '',\n",
658
- " '',\n",
659
- " '',\n",
660
- " '',\n",
661
- " '',\n",
662
- " '',\n",
663
- " '',\n",
664
- " '',\n",
665
- " '',\n",
666
- " '',\n",
667
- " '',\n",
668
- " '',\n",
669
- " '',\n",
670
- " '',\n",
671
- " '',\n",
672
- " '',\n",
673
- " '',\n",
674
- " '',\n",
675
- " '',\n",
676
- " '',\n",
677
- " '',\n",
678
- " '',\n",
679
- " '',\n",
680
- " '',\n",
681
- " '',\n",
682
- " '',\n",
683
- " '',\n",
684
- " '',\n",
685
- " '',\n",
686
- " '',\n",
687
- " '',\n",
688
- " '',\n",
689
- " '',\n",
690
- " '',\n",
691
- " '',\n",
692
- " '',\n",
693
- " '',\n",
694
- " '',\n",
695
- " '',\n",
696
- " '',\n",
697
- " '',\n",
698
- " '',\n",
699
- " '',\n",
700
- " '',\n",
701
- " '',\n",
702
- " '',\n",
703
- " '',\n",
704
- " '',\n",
705
- " '',\n",
706
- " '',\n",
707
- " '',\n",
708
- " '',\n",
709
- " '',\n",
710
- " '',\n",
711
- " '',\n",
712
- " '',\n",
713
- " '',\n",
714
- " '',\n",
715
- " '',\n",
716
- " '',\n",
717
- " '',\n",
718
- " '',\n",
719
- " '',\n",
720
- " '',\n",
721
- " '',\n",
722
- " '',\n",
723
- " '',\n",
724
- " '',\n",
725
- " '',\n",
726
- " '',\n",
727
- " '',\n",
728
- " '',\n",
729
- " '',\n",
730
- " '',\n",
731
- " '',\n",
732
- " '',\n",
733
- " '',\n",
734
- " '',\n",
735
- " '',\n",
736
- " '',\n",
737
- " '',\n",
738
- " '',\n",
739
- " '',\n",
740
- " '',\n",
741
- " '',\n",
742
- " '',\n",
743
- " '',\n",
744
- " '',\n",
745
- " '',\n",
746
- " '',\n",
747
- " '',\n",
748
- " '',\n",
749
- " '',\n",
750
- " '',\n",
751
- " '',\n",
752
- " '',\n",
753
- " '',\n",
754
- " '',\n",
755
- " '',\n",
756
- " '',\n",
757
- " '',\n",
758
- " '',\n",
759
- " '',\n",
760
- " '',\n",
761
- " '',\n",
762
- " '',\n",
763
- " '',\n",
764
- " '',\n",
765
- " '',\n",
766
- " '',\n",
767
- " '',\n",
768
- " '',\n",
769
- " '',\n",
770
- " '',\n",
771
- " '',\n",
772
- " '',\n",
773
- " '',\n",
774
- " '',\n",
775
- " '',\n",
776
- " '',\n",
777
- " '',\n",
778
- " '',\n",
779
- " '',\n",
780
- " '',\n",
781
- " '',\n",
782
- " '',\n",
783
- " '',\n",
784
- " '',\n",
785
- " '',\n",
786
- " '',\n",
787
- " '',\n",
788
- " '',\n",
789
- " '',\n",
790
- " '',\n",
791
- " '',\n",
792
- " '',\n",
793
- " '',\n",
794
- " '',\n",
795
- " '',\n",
796
- " '',\n",
797
- " '',\n",
798
- " '',\n",
799
- " '',\n",
800
- " '',\n",
801
- " '',\n",
802
- " '',\n",
803
- " '',\n",
804
- " '',\n",
805
- " '',\n",
806
- " '',\n",
807
- " '',\n",
808
- " '',\n",
809
- " '',\n",
810
- " '',\n",
811
- " '',\n",
812
- " '',\n",
813
- " '',\n",
814
- " '',\n",
815
- " '',\n",
816
- " '',\n",
817
- " '',\n",
818
- " '',\n",
819
- " '',\n",
820
- " '',\n",
821
- " '',\n",
822
- " '',\n",
823
- " '',\n",
824
- " '',\n",
825
- " '',\n",
826
- " '',\n",
827
- " '',\n",
828
- " '',\n",
829
- " '',\n",
830
- " '',\n",
831
- " '',\n",
832
- " '',\n",
833
- " '',\n",
834
- " '',\n",
835
- " '',\n",
836
- " '',\n",
837
- " '',\n",
838
- " '',\n",
839
- " '',\n",
840
- " '',\n",
841
- " '',\n",
842
- " '',\n",
843
- " '',\n",
844
- " '',\n",
845
- " '',\n",
846
- " '',\n",
847
- " '',\n",
848
- " '',\n",
849
- " '',\n",
850
- " '',\n",
851
- " '',\n",
852
- " '',\n",
853
- " '',\n",
854
- " '',\n",
855
- " '',\n",
856
- " '',\n",
857
- " '',\n",
858
- " '',\n",
859
- " '',\n",
860
- " '',\n",
861
- " '',\n",
862
- " '',\n",
863
- " '',\n",
864
- " '',\n",
865
- " '',\n",
866
- " '',\n",
867
- " '',\n",
868
- " '',\n",
869
- " '',\n",
870
- " '',\n",
871
- " '',\n",
872
- " '',\n",
873
- " '',\n",
874
- " '',\n",
875
- " '',\n",
876
- " '',\n",
877
- " '',\n",
878
- " '',\n",
879
- " '',\n",
880
- " '',\n",
881
- " '',\n",
882
- " '',\n",
883
- " '',\n",
884
- " '',\n",
885
- " '',\n",
886
- " '',\n",
887
- " '',\n",
888
- " '',\n",
889
- " '',\n",
890
- " '',\n",
891
- " '',\n",
892
- " '',\n",
893
- " '',\n",
894
- " '',\n",
895
- " '',\n",
896
- " '',\n",
897
- " '',\n",
898
- " '',\n",
899
- " '',\n",
900
- " '',\n",
901
- " '',\n",
902
- " '',\n",
903
- " '',\n",
904
- " '',\n",
905
- " '',\n",
906
- " '',\n",
907
- " '',\n",
908
- " '',\n",
909
- " '',\n",
910
- " '',\n",
911
- " '',\n",
912
- " '',\n",
913
- " '',\n",
914
- " '',\n",
915
- " '',\n",
916
- " '',\n",
917
- " '',\n",
918
- " '',\n",
919
- " '',\n",
920
- " '',\n",
921
- " '',\n",
922
- " '',\n",
923
- " '',\n",
924
- " '',\n",
925
- " '',\n",
926
- " '',\n",
927
- " '',\n",
928
- " '',\n",
929
- " '',\n",
930
- " '',\n",
931
- " '',\n",
932
- " '',\n",
933
- " '',\n",
934
- " '',\n",
935
- " '',\n",
936
- " '',\n",
937
- " '',\n",
938
- " '',\n",
939
- " '',\n",
940
- " '',\n",
941
- " '',\n",
942
- " '',\n",
943
- " '',\n",
944
- " '',\n",
945
- " '',\n",
946
- " '',\n",
947
- " '',\n",
948
- " '',\n",
949
- " '',\n",
950
- " '',\n",
951
- " '',\n",
952
- " '',\n",
953
- " '',\n",
954
- " '',\n",
955
- " '',\n",
956
- " '',\n",
957
- " '',\n",
958
- " '',\n",
959
- " '',\n",
960
- " '',\n",
961
- " '',\n",
962
- " '',\n",
963
- " '',\n",
964
- " '',\n",
965
- " '',\n",
966
- " '',\n",
967
- " '',\n",
968
- " '',\n",
969
- " '',\n",
970
- " '',\n",
971
- " '',\n",
972
- " '',\n",
973
- " '',\n",
974
- " '',\n",
975
- " '',\n",
976
- " '',\n",
977
- " '',\n",
978
- " '',\n",
979
- " '',\n",
980
- " '',\n",
981
- " '',\n",
982
- " '',\n",
983
- " '',\n",
984
- " '',\n",
985
- " '',\n",
986
- " '',\n",
987
- " '',\n",
988
- " '',\n",
989
- " '',\n",
990
- " '',\n",
991
- " '',\n",
992
- " '',\n",
993
- " '',\n",
994
- " '',\n",
995
- " '',\n",
996
- " '',\n",
997
- " '',\n",
998
- " '',\n",
999
- " '',\n",
1000
- " '',\n",
1001
- " '',\n",
1002
- " '',\n",
1003
- " '',\n",
1004
- " '',\n",
1005
- " '',\n",
1006
- " '',\n",
1007
- " '',\n",
1008
- " '',\n",
1009
- " '',\n",
1010
- " '',\n",
1011
- " '',\n",
1012
- " '',\n",
1013
- " '',\n",
1014
- " '',\n",
1015
- " '',\n",
1016
- " '',\n",
1017
- " '',\n",
1018
- " '',\n",
1019
- " '',\n",
1020
- " '',\n",
1021
- " '',\n",
1022
- " '',\n",
1023
- " '',\n",
1024
- " '',\n",
1025
- " '',\n",
1026
- " '',\n",
1027
- " '',\n",
1028
- " '',\n",
1029
- " '',\n",
1030
- " '',\n",
1031
- " '',\n",
1032
- " '',\n",
1033
- " '',\n",
1034
- " '',\n",
1035
- " '',\n",
1036
- " '',\n",
1037
- " '',\n",
1038
- " '',\n",
1039
- " '',\n",
1040
- " '',\n",
1041
- " '',\n",
1042
- " '',\n",
1043
- " '',\n",
1044
- " '',\n",
1045
- " '',\n",
1046
- " '',\n",
1047
- " '',\n",
1048
- " '',\n",
1049
- " '',\n",
1050
- " '',\n",
1051
- " '',\n",
1052
- " '',\n",
1053
- " '',\n",
1054
- " '',\n",
1055
- " '',\n",
1056
- " '',\n",
1057
- " '',\n",
1058
- " '',\n",
1059
- " '',\n",
1060
- " '',\n",
1061
- " '',\n",
1062
- " '',\n",
1063
- " '',\n",
1064
- " '',\n",
1065
- " '',\n",
1066
- " '',\n",
1067
- " '',\n",
1068
- " '',\n",
1069
- " '',\n",
1070
- " '',\n",
1071
- " '',\n",
1072
- " '',\n",
1073
- " '',\n",
1074
- " '',\n",
1075
- " '',\n",
1076
- " '',\n",
1077
- " '',\n",
1078
- " '',\n",
1079
- " '',\n",
1080
- " '',\n",
1081
- " '',\n",
1082
- " '',\n",
1083
- " '',\n",
1084
- " '',\n",
1085
- " '',\n",
1086
- " '',\n",
1087
- " '',\n",
1088
- " '',\n",
1089
- " '',\n",
1090
- " '',\n",
1091
- " '',\n",
1092
- " '',\n",
1093
- " '',\n",
1094
- " '',\n",
1095
- " '',\n",
1096
- " '',\n",
1097
- " '',\n",
1098
- " '',\n",
1099
- " '',\n",
1100
- " '',\n",
1101
- " '',\n",
1102
- " '',\n",
1103
- " '',\n",
1104
- " '',\n",
1105
- " '',\n",
1106
- " '',\n",
1107
- " '',\n",
1108
- " '',\n",
1109
- " '',\n",
1110
- " '',\n",
1111
- " '',\n",
1112
- " '',\n",
1113
- " '',\n",
1114
- " '',\n",
1115
- " '',\n",
1116
- " '',\n",
1117
- " '',\n",
1118
- " '',\n",
1119
- " '',\n",
1120
- " '',\n",
1121
- " '',\n",
1122
- " '',\n",
1123
- " '',\n",
1124
- " '',\n",
1125
- " '',\n",
1126
- " '',\n",
1127
- " '',\n",
1128
- " '',\n",
1129
- " '',\n",
1130
- " '',\n",
1131
- " '',\n",
1132
- " '',\n",
1133
- " '',\n",
1134
- " '',\n",
1135
- " '',\n",
1136
- " '',\n",
1137
- " '',\n",
1138
- " '',\n",
1139
- " '',\n",
1140
- " '',\n",
1141
- " '',\n",
1142
- " '',\n",
1143
- " '',\n",
1144
- " '',\n",
1145
- " '',\n",
1146
- " '',\n",
1147
- " '',\n",
1148
- " '',\n",
1149
- " '',\n",
1150
- " '',\n",
1151
- " '',\n",
1152
- " '',\n",
1153
- " '',\n",
1154
- " '',\n",
1155
- " '',\n",
1156
- " '',\n",
1157
- " '',\n",
1158
- " '',\n",
1159
- " '',\n",
1160
- " '',\n",
1161
- " '',\n",
1162
- " '',\n",
1163
- " '',\n",
1164
- " '',\n",
1165
- " '',\n",
1166
- " '',\n",
1167
- " '',\n",
1168
- " '',\n",
1169
- " '',\n",
1170
- " '',\n",
1171
- " '',\n",
1172
- " '',\n",
1173
- " '',\n",
1174
- " '',\n",
1175
- " '',\n",
1176
- " '',\n",
1177
- " '',\n",
1178
- " '',\n",
1179
- " '',\n",
1180
- " '',\n",
1181
- " '',\n",
1182
- " '',\n",
1183
- " '',\n",
1184
- " '',\n",
1185
- " '',\n",
1186
- " '',\n",
1187
- " '',\n",
1188
- " '',\n",
1189
- " '',\n",
1190
- " '',\n",
1191
- " '',\n",
1192
- " '',\n",
1193
- " '',\n",
1194
- " '',\n",
1195
- " '',\n",
1196
- " '',\n",
1197
- " '',\n",
1198
- " '',\n",
1199
- " '',\n",
1200
- " '',\n",
1201
- " '',\n",
1202
- " '',\n",
1203
- " '',\n",
1204
- " '',\n",
1205
- " '',\n",
1206
- " '',\n",
1207
- " '',\n",
1208
- " '',\n",
1209
- " '',\n",
1210
- " '',\n",
1211
- " '',\n",
1212
- " '',\n",
1213
- " '',\n",
1214
- " '',\n",
1215
- " '',\n",
1216
- " '',\n",
1217
- " '',\n",
1218
- " '',\n",
1219
- " '',\n",
1220
- " '',\n",
1221
- " '',\n",
1222
- " '',\n",
1223
- " '',\n",
1224
- " '',\n",
1225
- " '',\n",
1226
- " '',\n",
1227
- " '',\n",
1228
- " '',\n",
1229
- " '',\n",
1230
- " '',\n",
1231
- " '',\n",
1232
- " '',\n",
1233
- " '',\n",
1234
- " '',\n",
1235
- " '',\n",
1236
- " '',\n",
1237
- " '',\n",
1238
- " '',\n",
1239
- " '',\n",
1240
- " '',\n",
1241
- " '',\n",
1242
- " '',\n",
1243
- " '',\n",
1244
- " '',\n",
1245
- " '',\n",
1246
- " '',\n",
1247
- " '',\n",
1248
- " '',\n",
1249
- " '',\n",
1250
- " '',\n",
1251
- " '',\n",
1252
- " '',\n",
1253
- " '',\n",
1254
- " '',\n",
1255
- " '',\n",
1256
- " '',\n",
1257
- " '',\n",
1258
- " '',\n",
1259
- " '',\n",
1260
- " ...]"
1261
  ]
1262
  },
1263
- "execution_count": 14,
1264
  "metadata": {},
1265
  "output_type": "execute_result"
1266
  }
1267
  ],
1268
  "source": [
1269
- "test_data[\"segment\"]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1270
  ]
1271
  },
1272
  {
 
250
  },
251
  {
252
  "cell_type": "code",
253
+ "execution_count": 23,
254
+ "id": "406af02e",
255
  "metadata": {},
256
  "outputs": [
257
  {
258
  "data": {
259
  "text/plain": [
260
+ "datasets.arrow_dataset.Dataset"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  ]
262
  },
263
+ "execution_count": 23,
264
  "metadata": {},
265
  "output_type": "execute_result"
266
  }
267
  ],
268
  "source": [
269
+ "type(test_data)"
270
+ ]
271
+ },
272
+ {
273
+ "cell_type": "code",
274
+ "execution_count": 21,
275
+ "id": "8cc0fa51",
276
+ "metadata": {},
277
+ "outputs": [
278
+ {
279
+ "data": {
280
+ "text/plain": [
281
+ "'cv-corpus-7.0-2021-07-21/sv-SE/clips/common_voice_sv-SE_18711293.mp3'"
282
+ ]
283
+ },
284
+ "execution_count": 21,
285
+ "metadata": {},
286
+ "output_type": "execute_result"
287
+ }
288
+ ],
289
+ "source": [
290
+ "test_data[0][\"path\"]"
291
  ]
292
  },
293
  {
upload_nst_sv_to_hf_dataset.py CHANGED
@@ -25,16 +25,20 @@ Procedure:
25
  5. Upload to hub
26
 
27
  Filter out:
28
- - single words
29
- - single characters
30
- - words split into single characters
 
31
 
32
  """
33
 
34
  import json
35
  import os
36
 
37
- from datasets import DatasetDict
 
 
 
38
 
39
 
40
  hf_dataset_repo = "marinone94/nst_sv"
@@ -42,8 +46,15 @@ audio_files_path = "/Users/emiliomarinone/datasets/nst_sv/audio_files"
42
  annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
43
 
44
 
45
- def load_audio_file(filepath):
46
- return None
 
 
 
 
 
 
 
47
 
48
  def is_record_valid(text):
49
  text_split = text.split()
@@ -62,6 +73,10 @@ def is_record_valid(text):
62
  return True
63
 
64
 
 
 
 
 
65
  def create_dataset_row(annotation_filename):
66
  annotations_filepath = os.path.join(annotations_path, annotation_filename)
67
  with open(annotations_filepath, "r") as f:
@@ -70,12 +85,12 @@ def create_dataset_row(annotation_filename):
70
  dataset_rows = []
71
  for recording in annotation["val_recordings"]:
72
  if is_record_valid(recording["text"]):
73
- audio_filepath = f'{audio_files_path}/{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'
74
  dataset_row = {
75
  "client_id": annotation["info"]["Speaker_ID"],
76
- 'path': recording["file"],
77
- 'audio': load_audio_file(audio_filepath),
78
- 'sentence': recording["text"],
79
  'up_votes': 0,
80
  'down_votes': 0,
81
  'age': annotation["info"]["Age"],
@@ -92,12 +107,10 @@ def create_dataset_row(annotation_filename):
92
  dataset_rows = []
93
  for i, filename in enumerate(os.listdir(annotations_path)):
94
  dataset_rows.extend(create_dataset_row(filename))
95
- if i == 5:
96
  break
97
 
98
- from pprint import pformat
99
- pformat(dataset_rows)
100
-
101
- # dataset = DatasetDict(dataset_rows)
102
- # with open("temp.json", "w") as f:
103
- # json.dump(f, dataset_rows)
 
25
  5. Upload to hub
26
 
27
  Filter out:
28
+ - examples with single words
29
+ - examples with single characters
30
+ - examples with words split into single characters
31
+ - remove "\\Punkt", "\\Komma" from sentences
32
 
33
  """
34
 
35
  import json
36
  import os
37
 
38
+ import pandas as pd
39
+ import torchaudio
40
+
41
+ from datasets import load_dataset
42
 
43
 
44
  hf_dataset_repo = "marinone94/nst_sv"
 
46
  annotations_path = "/Users/emiliomarinone/datasets/nst_sv/annotations"
47
 
48
 
49
+ def load_audio_file(rel_filepath):
50
+ audio_filepath = f'{audio_files_path}/{rel_filepath}'
51
+ data_waveform, sampling_rate = torchaudio.load(audio_filepath)
52
+ return {
53
+ "path": rel_filepath,
54
+ "array": data_waveform[0].t().numpy(),
55
+ "sampling_rate": sampling_rate
56
+ }
57
+
58
 
59
  def is_record_valid(text):
60
  text_split = text.split()
 
73
  return True
74
 
75
 
76
+ def clean_text(text):
77
+ return text.replace("\\\\Komma", "").replace("\\\\Punkt", "")
78
+
79
+
80
  def create_dataset_row(annotation_filename):
81
  annotations_filepath = os.path.join(annotations_path, annotation_filename)
82
  with open(annotations_filepath, "r") as f:
 
85
  dataset_rows = []
86
  for recording in annotation["val_recordings"]:
87
  if is_record_valid(recording["text"]):
88
+ rel_filepath = f'{annotation["pid"]}/{annotation["pid"]}_{recording["file"]}'.replace(".wav", "-2.wav")
89
  dataset_row = {
90
  "client_id": annotation["info"]["Speaker_ID"],
91
+ 'path': rel_filepath,
92
+ 'audio': load_audio_file(rel_filepath),
93
+ 'sentence': clean_text(recording["text"]),
94
  'up_votes': 0,
95
  'down_votes': 0,
96
  'age': annotation["info"]["Age"],
 
107
  dataset_rows = []
108
  for i, filename in enumerate(os.listdir(annotations_path)):
109
  dataset_rows.extend(create_dataset_row(filename))
110
+ if i == 1:
111
  break
112
 
113
+ df = pd.DataFrame(dataset_rows)
114
+ df.to_parquet("dataset.parquet")
115
+ dataset = load_dataset("parquet", data_files="dataset.parquet")
116
+ dataset.push_to_hub("marinone94/nst_sv")