nbroad HF staff commited on
Commit
f9e6f44
1 Parent(s): d3bfd30

Upload remove-donut-tokens.ipynb

Browse files
Files changed (1) hide show
  1. remove-donut-tokens.ipynb +213 -1118
remove-donut-tokens.ipynb CHANGED
@@ -106,26 +106,26 @@
106
  {
107
  "data": {
108
  "text/plain": [
109
- "['',\n",
110
- " '引起',\n",
111
- " 'ärer',\n",
112
- " '▁vaziyat',\n",
113
- " '',\n",
114
- " '▁līderi',\n",
115
- " '▁సంతోష',\n",
116
- " '▁פלילי',\n",
117
- " '▁obvi',\n",
118
- " '▁автомобиля',\n",
119
- " '',\n",
120
- " '400',\n",
121
- " '',\n",
122
- " '摩擦',\n",
123
- " '▁Rund',\n",
124
- " 'ضخم',\n",
125
- " '▁ruin',\n",
126
- " '▁Center',\n",
127
- " 'DOWNLOAD',\n",
128
- " '😇']"
129
  ]
130
  },
131
  "execution_count": 3,
@@ -185,7 +185,7 @@
185
  },
186
  {
187
  "cell_type": "code",
188
- "execution_count": 9,
189
  "metadata": {},
190
  "outputs": [],
191
  "source": [
@@ -203,7 +203,7 @@
203
  },
204
  {
205
  "cell_type": "code",
206
- "execution_count": 10,
207
  "metadata": {},
208
  "outputs": [
209
  {
@@ -212,7 +212,7 @@
212
  "27510"
213
  ]
214
  },
215
- "execution_count": 10,
216
  "metadata": {},
217
  "output_type": "execute_result"
218
  }
@@ -234,7 +234,7 @@
234
  },
235
  {
236
  "cell_type": "code",
237
- "execution_count": 11,
238
  "metadata": {},
239
  "outputs": [
240
  {
@@ -250,7 +250,7 @@
250
  "[0, 56881, 3, 2]"
251
  ]
252
  },
253
- "execution_count": 11,
254
  "metadata": {},
255
  "output_type": "execute_result"
256
  }
@@ -264,7 +264,7 @@
264
  },
265
  {
266
  "cell_type": "code",
267
- "execution_count": 12,
268
  "metadata": {},
269
  "outputs": [
270
  {
@@ -283,7 +283,7 @@
283
  " (2, '</s>')]"
284
  ]
285
  },
286
- "execution_count": 12,
287
  "metadata": {},
288
  "output_type": "execute_result"
289
  }
@@ -298,7 +298,7 @@
298
  },
299
  {
300
  "cell_type": "code",
301
- "execution_count": 13,
302
  "metadata": {},
303
  "outputs": [
304
  {
@@ -309,7 +309,7 @@
309
  " type: NORMAL]"
310
  ]
311
  },
312
- "execution_count": 13,
313
  "metadata": {},
314
  "output_type": "execute_result"
315
  }
@@ -320,7 +320,7 @@
320
  },
321
  {
322
  "cell_type": "code",
323
- "execution_count": 14,
324
  "metadata": {},
325
  "outputs": [],
326
  "source": [
@@ -339,7 +339,7 @@
339
  },
340
  {
341
  "cell_type": "code",
342
- "execution_count": 19,
343
  "metadata": {},
344
  "outputs": [
345
  {
@@ -348,7 +348,7 @@
348
  "27511"
349
  ]
350
  },
351
- "execution_count": 19,
352
  "metadata": {},
353
  "output_type": "execute_result"
354
  }
@@ -366,7 +366,7 @@
366
  },
367
  {
368
  "cell_type": "code",
369
- "execution_count": 16,
370
  "metadata": {},
371
  "outputs": [],
372
  "source": [
@@ -383,7 +383,7 @@
383
  },
384
  {
385
  "cell_type": "code",
386
- "execution_count": 17,
387
  "metadata": {},
388
  "outputs": [],
389
  "source": [
@@ -394,7 +394,7 @@
394
  },
395
  {
396
  "cell_type": "code",
397
- "execution_count": 18,
398
  "metadata": {},
399
  "outputs": [
400
  {
@@ -403,7 +403,7 @@
403
  "(27513, 57525)"
404
  ]
405
  },
406
- "execution_count": 18,
407
  "metadata": {},
408
  "output_type": "execute_result"
409
  }
@@ -421,7 +421,7 @@
421
  },
422
  {
423
  "cell_type": "code",
424
- "execution_count": 20,
425
  "metadata": {},
426
  "outputs": [
427
  {
@@ -430,7 +430,7 @@
430
  "['<s_iitcdip>', '<s_synthdog>']"
431
  ]
432
  },
433
- "execution_count": 20,
434
  "metadata": {},
435
  "output_type": "execute_result"
436
  }
@@ -441,7 +441,7 @@
441
  },
442
  {
443
  "cell_type": "code",
444
- "execution_count": 21,
445
  "metadata": {},
446
  "outputs": [
447
  {
@@ -450,7 +450,7 @@
450
  "2"
451
  ]
452
  },
453
- "execution_count": 21,
454
  "metadata": {},
455
  "output_type": "execute_result"
456
  }
@@ -461,7 +461,7 @@
461
  },
462
  {
463
  "cell_type": "code",
464
- "execution_count": 22,
465
  "metadata": {},
466
  "outputs": [
467
  {
@@ -470,7 +470,7 @@
470
  "['▁<', 's', '>']"
471
  ]
472
  },
473
- "execution_count": 22,
474
  "metadata": {},
475
  "output_type": "execute_result"
476
  }
@@ -481,7 +481,7 @@
481
  },
482
  {
483
  "cell_type": "code",
484
- "execution_count": 23,
485
  "metadata": {},
486
  "outputs": [
487
  {
@@ -497,7 +497,7 @@
497
  " 'additional_special_tokens': ['<s_iitcdip>', '<s_synthdog>']}"
498
  ]
499
  },
500
- "execution_count": 23,
501
  "metadata": {},
502
  "output_type": "execute_result"
503
  }
@@ -508,7 +508,7 @@
508
  },
509
  {
510
  "cell_type": "code",
511
- "execution_count": 24,
512
  "metadata": {},
513
  "outputs": [
514
  {
@@ -520,7 +520,7 @@
520
  " 'donut-base-ascii/added_tokens.json')"
521
  ]
522
  },
523
- "execution_count": 24,
524
  "metadata": {},
525
  "output_type": "execute_result"
526
  }
@@ -535,7 +535,7 @@
535
  },
536
  {
537
  "cell_type": "code",
538
- "execution_count": 25,
539
  "metadata": {},
540
  "outputs": [
541
  {
@@ -544,7 +544,7 @@
544
  "['<s>']"
545
  ]
546
  },
547
- "execution_count": 25,
548
  "metadata": {},
549
  "output_type": "execute_result"
550
  }
@@ -555,7 +555,7 @@
555
  },
556
  {
557
  "cell_type": "code",
558
- "execution_count": 26,
559
  "metadata": {},
560
  "outputs": [
561
  {
@@ -564,7 +564,7 @@
564
  "(27515, 57525)"
565
  ]
566
  },
567
- "execution_count": 26,
568
  "metadata": {},
569
  "output_type": "execute_result"
570
  }
@@ -575,7 +575,7 @@
575
  },
576
  {
577
  "cell_type": "code",
578
- "execution_count": 28,
579
  "metadata": {},
580
  "outputs": [
581
  {
@@ -584,7 +584,7 @@
584
  "['<s>']"
585
  ]
586
  },
587
- "execution_count": 28,
588
  "metadata": {},
589
  "output_type": "execute_result"
590
  }
@@ -614,7 +614,7 @@
614
  },
615
  {
616
  "cell_type": "code",
617
- "execution_count": 29,
618
  "metadata": {},
619
  "outputs": [],
620
  "source": [
@@ -632,7 +632,7 @@
632
  },
633
  {
634
  "cell_type": "code",
635
- "execution_count": 30,
636
  "metadata": {},
637
  "outputs": [
638
  {
@@ -644,7 +644,7 @@
644
  " ('<s_synthdog>', 27514)]"
645
  ]
646
  },
647
- "execution_count": 30,
648
  "metadata": {},
649
  "output_type": "execute_result"
650
  }
@@ -666,9 +666,59 @@
666
  },
667
  {
668
  "cell_type": "code",
669
- "execution_count": 32,
670
  "metadata": {},
671
  "outputs": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
  {
673
  "name": "stdout",
674
  "output_type": "stream",
@@ -693,1022 +743,22 @@
693
  },
694
  {
695
  "cell_type": "code",
696
- "execution_count": 33,
697
  "metadata": {},
698
  "outputs": [
699
  {
700
  "data": {
701
  "text/plain": [
702
- "[0,\n",
703
- " 1,\n",
704
- " 2,\n",
705
- " 3,\n",
706
- " 4,\n",
707
- " 5,\n",
708
- " 6,\n",
709
- " 7,\n",
710
- " 8,\n",
711
- " 10,\n",
712
- " 12,\n",
713
- " 13,\n",
714
- " 17,\n",
715
- " 18,\n",
716
- " 26,\n",
717
- " 28,\n",
718
- " 30,\n",
719
- " 32,\n",
720
- " 33,\n",
721
- " 35,\n",
722
- " 36,\n",
723
- " 38,\n",
724
- " 40,\n",
725
- " 43,\n",
726
- " 44,\n",
727
- " 45,\n",
728
- " 47,\n",
729
- " 48,\n",
730
- " 51,\n",
731
- " 52,\n",
732
- " 54,\n",
733
- " 56,\n",
734
- " 57,\n",
735
- " 58,\n",
736
- " 60,\n",
737
- " 61,\n",
738
- " 63,\n",
739
- " 64,\n",
740
- " 69,\n",
741
- " 70,\n",
742
- " 72,\n",
743
- " 74,\n",
744
- " 75,\n",
745
- " 76,\n",
746
- " 77,\n",
747
- " 79,\n",
748
- " 82,\n",
749
- " 84,\n",
750
- " 85,\n",
751
- " 87,\n",
752
- " 88,\n",
753
- " 90,\n",
754
- " 93,\n",
755
- " 94,\n",
756
- " 95,\n",
757
- " 103,\n",
758
- " 104,\n",
759
- " 105,\n",
760
- " 107,\n",
761
- " 110,\n",
762
- " 111,\n",
763
- " 112,\n",
764
- " 113,\n",
765
- " 118,\n",
766
- " 119,\n",
767
- " 120,\n",
768
- " 123,\n",
769
- " 124,\n",
770
- " 130,\n",
771
- " 133,\n",
772
- " 136,\n",
773
- " 139,\n",
774
- " 146,\n",
775
- " 148,\n",
776
- " 149,\n",
777
- " 150,\n",
778
- " 152,\n",
779
- " 153,\n",
780
- " 155,\n",
781
- " 157,\n",
782
- " 160,\n",
783
- " 162,\n",
784
- " 168,\n",
785
- " 172,\n",
786
- " 179,\n",
787
- " 182,\n",
788
- " 187,\n",
789
- " 191,\n",
790
- " 195,\n",
791
- " 197,\n",
792
- " 199,\n",
793
- " 201,\n",
794
- " 202,\n",
795
- " 205,\n",
796
- " 207,\n",
797
- " 208,\n",
798
- " 209,\n",
799
- " 210,\n",
800
- " 211,\n",
801
- " 212,\n",
802
- " 214,\n",
803
- " 215,\n",
804
- " 216,\n",
805
- " 220,\n",
806
- " 226,\n",
807
- " 228,\n",
808
- " 230,\n",
809
- " 233,\n",
810
- " 234,\n",
811
- " 238,\n",
812
- " 240,\n",
813
- " 241,\n",
814
- " 243,\n",
815
- " 244,\n",
816
- " 245,\n",
817
- " 248,\n",
818
- " 249,\n",
819
- " 250,\n",
820
- " 251,\n",
821
- " 252,\n",
822
- " 256,\n",
823
- " 258,\n",
824
- " 261,\n",
825
- " 262,\n",
826
- " 264,\n",
827
- " 265,\n",
828
- " 266,\n",
829
- " 267,\n",
830
- " 271,\n",
831
- " 272,\n",
832
- " 273,\n",
833
- " 274,\n",
834
- " 275,\n",
835
- " 281,\n",
836
- " 285,\n",
837
- " 288,\n",
838
- " 290,\n",
839
- " 294,\n",
840
- " 301,\n",
841
- " 302,\n",
842
- " 303,\n",
843
- " 304,\n",
844
- " 306,\n",
845
- " 308,\n",
846
- " 311,\n",
847
- " 313,\n",
848
- " 315,\n",
849
- " 317,\n",
850
- " 322,\n",
851
- " 333,\n",
852
- " 336,\n",
853
- " 337,\n",
854
- " 341,\n",
855
- " 342,\n",
856
- " 348,\n",
857
- " 353,\n",
858
- " 354,\n",
859
- " 355,\n",
860
- " 356,\n",
861
- " 359,\n",
862
- " 360,\n",
863
- " 361,\n",
864
- " 363,\n",
865
- " 373,\n",
866
- " 375,\n",
867
- " 378,\n",
868
- " 379,\n",
869
- " 380,\n",
870
- " 381,\n",
871
- " 383,\n",
872
- " 385,\n",
873
- " 389,\n",
874
- " 390,\n",
875
- " 391,\n",
876
- " 394,\n",
877
- " 398,\n",
878
- " 400,\n",
879
- " 401,\n",
880
- " 404,\n",
881
- " 409,\n",
882
- " 410,\n",
883
- " 416,\n",
884
- " 417,\n",
885
- " 419,\n",
886
- " 422,\n",
887
- " 426,\n",
888
- " 428,\n",
889
- " 430,\n",
890
- " 436,\n",
891
- " 437,\n",
892
- " 438,\n",
893
- " 443,\n",
894
- " 446,\n",
895
- " 448,\n",
896
- " 452,\n",
897
- " 454,\n",
898
- " 456,\n",
899
- " 458,\n",
900
- " 460,\n",
901
- " 463,\n",
902
- " 465,\n",
903
- " 468,\n",
904
- " 469,\n",
905
- " 470,\n",
906
- " 472,\n",
907
- " 477,\n",
908
- " 478,\n",
909
- " 479,\n",
910
- " 480,\n",
911
- " 482,\n",
912
- " 483,\n",
913
- " 485,\n",
914
- " 486,\n",
915
- " 487,\n",
916
- " 489,\n",
917
- " 491,\n",
918
- " 497,\n",
919
- " 498,\n",
920
- " 500,\n",
921
- " 502,\n",
922
- " 505,\n",
923
- " 506,\n",
924
- " 507,\n",
925
- " 510,\n",
926
- " 511,\n",
927
- " 513,\n",
928
- " 516,\n",
929
- " 517,\n",
930
- " 520,\n",
931
- " 526,\n",
932
- " 527,\n",
933
- " 530,\n",
934
- " 531,\n",
935
- " 534,\n",
936
- " 535,\n",
937
- " 543,\n",
938
- " 546,\n",
939
- " 550,\n",
940
- " 552,\n",
941
- " 553,\n",
942
- " 554,\n",
943
- " 556,\n",
944
- " 559,\n",
945
- " 560,\n",
946
- " 566,\n",
947
- " 568,\n",
948
- " 571,\n",
949
- " 575,\n",
950
- " 580,\n",
951
- " 583,\n",
952
- " 584,\n",
953
- " 586,\n",
954
- " 589,\n",
955
- " 593,\n",
956
- " 595,\n",
957
- " 596,\n",
958
- " 597,\n",
959
- " 600,\n",
960
- " 602,\n",
961
- " 605,\n",
962
- " 609,\n",
963
- " 610,\n",
964
- " 612,\n",
965
- " 613,\n",
966
- " 614,\n",
967
- " 617,\n",
968
- " 620,\n",
969
- " 621,\n",
970
- " 622,\n",
971
- " 626,\n",
972
- " 628,\n",
973
- " 631,\n",
974
- " 632,\n",
975
- " 633,\n",
976
- " 641,\n",
977
- " 644,\n",
978
- " 645,\n",
979
- " 646,\n",
980
- " 647,\n",
981
- " 648,\n",
982
- " 651,\n",
983
- " 655,\n",
984
- " 656,\n",
985
- " 657,\n",
986
- " 658,\n",
987
- " 659,\n",
988
- " 660,\n",
989
- " 661,\n",
990
- " 662,\n",
991
- " 663,\n",
992
- " 666,\n",
993
- " 669,\n",
994
- " 670,\n",
995
- " 673,\n",
996
- " 674,\n",
997
- " 675,\n",
998
- " 676,\n",
999
- " 678,\n",
1000
- " 679,\n",
1001
- " 680,\n",
1002
- " 683,\n",
1003
- " 687,\n",
1004
- " 696,\n",
1005
- " 699,\n",
1006
- " 700,\n",
1007
- " 701,\n",
1008
- " 702,\n",
1009
- " 705,\n",
1010
- " 707,\n",
1011
- " 710,\n",
1012
- " 711,\n",
1013
- " 715,\n",
1014
- " 716,\n",
1015
- " 718,\n",
1016
- " 720,\n",
1017
- " 721,\n",
1018
- " 724,\n",
1019
- " 735,\n",
1020
- " 736,\n",
1021
- " 737,\n",
1022
- " 739,\n",
1023
- " 741,\n",
1024
- " 742,\n",
1025
- " 744,\n",
1026
- " 746,\n",
1027
- " 747,\n",
1028
- " 750,\n",
1029
- " 751,\n",
1030
- " 753,\n",
1031
- " 755,\n",
1032
- " 758,\n",
1033
- " 761,\n",
1034
- " 762,\n",
1035
- " 765,\n",
1036
- " 766,\n",
1037
- " 776,\n",
1038
- " 778,\n",
1039
- " 782,\n",
1040
- " 783,\n",
1041
- " 785,\n",
1042
- " 786,\n",
1043
- " 787,\n",
1044
- " 788,\n",
1045
- " 789,\n",
1046
- " 790,\n",
1047
- " 791,\n",
1048
- " 794,\n",
1049
- " 796,\n",
1050
- " 797,\n",
1051
- " 800,\n",
1052
- " 802,\n",
1053
- " 803,\n",
1054
- " 805,\n",
1055
- " 809,\n",
1056
- " 812,\n",
1057
- " 815,\n",
1058
- " 816,\n",
1059
- " 827,\n",
1060
- " 829,\n",
1061
- " 830,\n",
1062
- " 832,\n",
1063
- " 833,\n",
1064
- " 844,\n",
1065
- " 846,\n",
1066
- " 847,\n",
1067
- " 851,\n",
1068
- " 853,\n",
1069
- " 854,\n",
1070
- " 857,\n",
1071
- " 861,\n",
1072
- " 864,\n",
1073
- " 865,\n",
1074
- " 867,\n",
1075
- " 868,\n",
1076
- " 870,\n",
1077
- " 871,\n",
1078
- " 878,\n",
1079
- " 879,\n",
1080
- " 883,\n",
1081
- " 885,\n",
1082
- " 888,\n",
1083
- " 889,\n",
1084
- " 892,\n",
1085
- " 897,\n",
1086
- " 898,\n",
1087
- " 899,\n",
1088
- " 901,\n",
1089
- " 903,\n",
1090
- " 906,\n",
1091
- " 908,\n",
1092
- " 911,\n",
1093
- " 917,\n",
1094
- " 918,\n",
1095
- " 921,\n",
1096
- " 924,\n",
1097
- " 929,\n",
1098
- " 937,\n",
1099
- " 944,\n",
1100
- " 948,\n",
1101
- " 949,\n",
1102
- " 950,\n",
1103
- " 954,\n",
1104
- " 955,\n",
1105
- " 957,\n",
1106
- " 958,\n",
1107
- " 959,\n",
1108
- " 960,\n",
1109
- " 962,\n",
1110
- " 965,\n",
1111
- " 969,\n",
1112
- " 970,\n",
1113
- " 972,\n",
1114
- " 973,\n",
1115
- " 975,\n",
1116
- " 977,\n",
1117
- " 979,\n",
1118
- " 982,\n",
1119
- " 983,\n",
1120
- " 984,\n",
1121
- " 990,\n",
1122
- " 992,\n",
1123
- " 993,\n",
1124
- " 998,\n",
1125
- " 999,\n",
1126
- " 1001,\n",
1127
- " 1002,\n",
1128
- " 1003,\n",
1129
- " 1004,\n",
1130
- " 1007,\n",
1131
- " 1009,\n",
1132
- " 1010,\n",
1133
- " 1012,\n",
1134
- " 1016,\n",
1135
- " 1017,\n",
1136
- " 1018,\n",
1137
- " 1023,\n",
1138
- " 1024,\n",
1139
- " 1027,\n",
1140
- " 1028,\n",
1141
- " 1030,\n",
1142
- " 1031,\n",
1143
- " 1034,\n",
1144
- " 1036,\n",
1145
- " 1039,\n",
1146
- " 1051,\n",
1147
- " 1054,\n",
1148
- " 1056,\n",
1149
- " 1059,\n",
1150
- " 1062,\n",
1151
- " 1069,\n",
1152
- " 1074,\n",
1153
- " 1076,\n",
1154
- " 1077,\n",
1155
- " 1079,\n",
1156
- " 1080,\n",
1157
- " 1082,\n",
1158
- " 1085,\n",
1159
- " 1088,\n",
1160
- " 1089,\n",
1161
- " 1095,\n",
1162
- " 1096,\n",
1163
- " 1097,\n",
1164
- " 1098,\n",
1165
- " 1103,\n",
1166
- " 1105,\n",
1167
- " 1107,\n",
1168
- " 1108,\n",
1169
- " 1110,\n",
1170
- " 1112,\n",
1171
- " 1114,\n",
1172
- " 1116,\n",
1173
- " 1119,\n",
1174
- " 1121,\n",
1175
- " 1122,\n",
1176
- " 1123,\n",
1177
- " 1124,\n",
1178
- " 1126,\n",
1179
- " 1128,\n",
1180
- " 1129,\n",
1181
- " 1131,\n",
1182
- " 1132,\n",
1183
- " 1133,\n",
1184
- " 1138,\n",
1185
- " 1139,\n",
1186
- " 1140,\n",
1187
- " 1144,\n",
1188
- " 1145,\n",
1189
- " 1148,\n",
1190
- " 1152,\n",
1191
- " 1153,\n",
1192
- " 1156,\n",
1193
- " 1157,\n",
1194
- " 1160,\n",
1195
- " 1161,\n",
1196
- " 1162,\n",
1197
- " 1164,\n",
1198
- " 1165,\n",
1199
- " 1166,\n",
1200
- " 1169,\n",
1201
- " 1172,\n",
1202
- " 1178,\n",
1203
- " 1181,\n",
1204
- " 1182,\n",
1205
- " 1185,\n",
1206
- " 1186,\n",
1207
- " 1190,\n",
1208
- " 1191,\n",
1209
- " 1194,\n",
1210
- " 1197,\n",
1211
- " 1198,\n",
1212
- " 1201,\n",
1213
- " 1202,\n",
1214
- " 1205,\n",
1215
- " 1207,\n",
1216
- " 1208,\n",
1217
- " 1210,\n",
1218
- " 1211,\n",
1219
- " 1214,\n",
1220
- " 1215,\n",
1221
- " 1217,\n",
1222
- " 1218,\n",
1223
- " 1220,\n",
1224
- " 1221,\n",
1225
- " 1224,\n",
1226
- " 1227,\n",
1227
- " 1228,\n",
1228
- " 1229,\n",
1229
- " 1230,\n",
1230
- " 1235,\n",
1231
- " 1239,\n",
1232
- " 1241,\n",
1233
- " 1244,\n",
1234
- " 1246,\n",
1235
- " 1247,\n",
1236
- " 1248,\n",
1237
- " 1250,\n",
1238
- " 1253,\n",
1239
- " 1257,\n",
1240
- " 1260,\n",
1241
- " 1261,\n",
1242
- " 1263,\n",
1243
- " 1264,\n",
1244
- " 1265,\n",
1245
- " 1267,\n",
1246
- " 1271,\n",
1247
- " 1277,\n",
1248
- " 1279,\n",
1249
- " 1280,\n",
1250
- " 1290,\n",
1251
- " 1293,\n",
1252
- " 1296,\n",
1253
- " 1297,\n",
1254
- " 1302,\n",
1255
- " 1303,\n",
1256
- " 1304,\n",
1257
- " 1310,\n",
1258
- " 1313,\n",
1259
- " 1314,\n",
1260
- " 1321,\n",
1261
- " 1322,\n",
1262
- " 1323,\n",
1263
- " 1324,\n",
1264
- " 1325,\n",
1265
- " 1326,\n",
1266
- " 1330,\n",
1267
- " 1333,\n",
1268
- " 1334,\n",
1269
- " 1338,\n",
1270
- " 1340,\n",
1271
- " 1342,\n",
1272
- " 1347,\n",
1273
- " 1348,\n",
1274
- " 1350,\n",
1275
- " 1353,\n",
1276
- " 1354,\n",
1277
- " 1356,\n",
1278
- " 1358,\n",
1279
- " 1359,\n",
1280
- " 1360,\n",
1281
- " 1363,\n",
1282
- " 1364,\n",
1283
- " 1365,\n",
1284
- " 1366,\n",
1285
- " 1367,\n",
1286
- " 1370,\n",
1287
- " 1371,\n",
1288
- " 1376,\n",
1289
- " 1378,\n",
1290
- " 1379,\n",
1291
- " 1380,\n",
1292
- " 1382,\n",
1293
- " 1383,\n",
1294
- " 1385,\n",
1295
- " 1389,\n",
1296
- " 1390,\n",
1297
- " 1392,\n",
1298
- " 1393,\n",
1299
- " 1399,\n",
1300
- " 1403,\n",
1301
- " 1405,\n",
1302
- " 1410,\n",
1303
- " 1411,\n",
1304
- " 1416,\n",
1305
- " 1418,\n",
1306
- " 1419,\n",
1307
- " 1421,\n",
1308
- " 1424,\n",
1309
- " 1426,\n",
1310
- " 1427,\n",
1311
- " 1428,\n",
1312
- " 1430,\n",
1313
- " 1432,\n",
1314
- " 1433,\n",
1315
- " 1434,\n",
1316
- " 1435,\n",
1317
- " 1440,\n",
1318
- " 1448,\n",
1319
- " 1451,\n",
1320
- " 1452,\n",
1321
- " 1454,\n",
1322
- " 1456,\n",
1323
- " 1457,\n",
1324
- " 1464,\n",
1325
- " 1465,\n",
1326
- " 1466,\n",
1327
- " 1468,\n",
1328
- " 1470,\n",
1329
- " 1475,\n",
1330
- " 1480,\n",
1331
- " 1481,\n",
1332
- " 1482,\n",
1333
- " 1483,\n",
1334
- " 1489,\n",
1335
- " 1490,\n",
1336
- " 1493,\n",
1337
- " 1501,\n",
1338
- " 1504,\n",
1339
- " 1506,\n",
1340
- " 1507,\n",
1341
- " 1509,\n",
1342
- " 1510,\n",
1343
- " 1512,\n",
1344
- " 1514,\n",
1345
- " 1515,\n",
1346
- " 1516,\n",
1347
- " 1518,\n",
1348
- " 1521,\n",
1349
- " 1522,\n",
1350
- " 1524,\n",
1351
- " 1525,\n",
1352
- " 1531,\n",
1353
- " 1537,\n",
1354
- " 1538,\n",
1355
- " 1539,\n",
1356
- " 1540,\n",
1357
- " 1541,\n",
1358
- " 1542,\n",
1359
- " 1544,\n",
1360
- " 1547,\n",
1361
- " 1550,\n",
1362
- " 1551,\n",
1363
- " 1552,\n",
1364
- " 1554,\n",
1365
- " 1555,\n",
1366
- " 1556,\n",
1367
- " 1557,\n",
1368
- " 1558,\n",
1369
- " 1560,\n",
1370
- " 1561,\n",
1371
- " 1565,\n",
1372
- " 1566,\n",
1373
- " 1568,\n",
1374
- " 1570,\n",
1375
- " 1572,\n",
1376
- " 1573,\n",
1377
- " 1575,\n",
1378
- " 1576,\n",
1379
- " 1577,\n",
1380
- " 1579,\n",
1381
- " 1581,\n",
1382
- " 1582,\n",
1383
- " 1587,\n",
1384
- " 1592,\n",
1385
- " 1595,\n",
1386
- " 1598,\n",
1387
- " 1601,\n",
1388
- " 1602,\n",
1389
- " 1603,\n",
1390
- " 1604,\n",
1391
- " 1606,\n",
1392
- " 1609,\n",
1393
- " 1612,\n",
1394
- " 1615,\n",
1395
- " 1618,\n",
1396
- " 1619,\n",
1397
- " 1623,\n",
1398
- " 1628,\n",
1399
- " 1629,\n",
1400
- " 1634,\n",
1401
- " 1635,\n",
1402
- " 1638,\n",
1403
- " 1640,\n",
1404
- " 1641,\n",
1405
- " 1642,\n",
1406
- " 1643,\n",
1407
- " 1646,\n",
1408
- " 1649,\n",
1409
- " 1652,\n",
1410
- " 1653,\n",
1411
- " 1656,\n",
1412
- " 1663,\n",
1413
- " 1665,\n",
1414
- " 1667,\n",
1415
- " 1672,\n",
1416
- " 1673,\n",
1417
- " 1674,\n",
1418
- " 1675,\n",
1419
- " 1676,\n",
1420
- " 1677,\n",
1421
- " 1681,\n",
1422
- " 1682,\n",
1423
- " 1683,\n",
1424
- " 1686,\n",
1425
- " 1689,\n",
1426
- " 1690,\n",
1427
- " 1694,\n",
1428
- " 1695,\n",
1429
- " 1699,\n",
1430
- " 1707,\n",
1431
- " 1709,\n",
1432
- " 1713,\n",
1433
- " 1716,\n",
1434
- " 1722,\n",
1435
- " 1723,\n",
1436
- " 1725,\n",
1437
- " 1727,\n",
1438
- " 1729,\n",
1439
- " 1730,\n",
1440
- " 1732,\n",
1441
- " 1734,\n",
1442
- " 1737,\n",
1443
- " 1741,\n",
1444
- " 1745,\n",
1445
- " 1747,\n",
1446
- " 1748,\n",
1447
- " 1749,\n",
1448
- " 1750,\n",
1449
- " 1752,\n",
1450
- " 1755,\n",
1451
- " 1757,\n",
1452
- " 1758,\n",
1453
- " 1761,\n",
1454
- " 1765,\n",
1455
- " 1766,\n",
1456
- " 1771,\n",
1457
- " 1772,\n",
1458
- " 1774,\n",
1459
- " 1777,\n",
1460
- " 1780,\n",
1461
- " 1782,\n",
1462
- " 1784,\n",
1463
- " 1785,\n",
1464
- " 1786,\n",
1465
- " 1788,\n",
1466
- " 1789,\n",
1467
- " 1792,\n",
1468
- " 1794,\n",
1469
- " 1798,\n",
1470
- " 1799,\n",
1471
- " 1802,\n",
1472
- " 1803,\n",
1473
- " 1809,\n",
1474
- " 1810,\n",
1475
- " 1817,\n",
1476
- " 1818,\n",
1477
- " 1821,\n",
1478
- " 1825,\n",
1479
- " 1826,\n",
1480
- " 1827,\n",
1481
- " 1829,\n",
1482
- " 1831,\n",
1483
- " 1832,\n",
1484
- " 1834,\n",
1485
- " 1836,\n",
1486
- " 1837,\n",
1487
- " 1838,\n",
1488
- " 1839,\n",
1489
- " 1840,\n",
1490
- " 1841,\n",
1491
- " 1843,\n",
1492
- " 1845,\n",
1493
- " 1846,\n",
1494
- " 1847,\n",
1495
- " 1853,\n",
1496
- " 1855,\n",
1497
- " 1858,\n",
1498
- " 1861,\n",
1499
- " 1863,\n",
1500
- " 1866,\n",
1501
- " 1868,\n",
1502
- " 1871,\n",
1503
- " 1872,\n",
1504
- " 1873,\n",
1505
- " 1874,\n",
1506
- " 1878,\n",
1507
- " 1881,\n",
1508
- " 1882,\n",
1509
- " 1884,\n",
1510
- " 1886,\n",
1511
- " 1887,\n",
1512
- " 1888,\n",
1513
- " 1889,\n",
1514
- " 1890,\n",
1515
- " 1893,\n",
1516
- " 1895,\n",
1517
- " 1898,\n",
1518
- " 1900,\n",
1519
- " 1901,\n",
1520
- " 1902,\n",
1521
- " 1908,\n",
1522
- " 1913,\n",
1523
- " 1916,\n",
1524
- " 1917,\n",
1525
- " 1918,\n",
1526
- " 1919,\n",
1527
- " 1921,\n",
1528
- " 1922,\n",
1529
- " 1924,\n",
1530
- " 1927,\n",
1531
- " 1928,\n",
1532
- " 1932,\n",
1533
- " 1934,\n",
1534
- " 1935,\n",
1535
- " 1938,\n",
1536
- " 1939,\n",
1537
- " 1941,\n",
1538
- " 1944,\n",
1539
- " 1947,\n",
1540
- " 1949,\n",
1541
- " 1951,\n",
1542
- " 1953,\n",
1543
- " 1957,\n",
1544
- " 1958,\n",
1545
- " 1960,\n",
1546
- " 1961,\n",
1547
- " 1963,\n",
1548
- " 1967,\n",
1549
- " 1968,\n",
1550
- " 1969,\n",
1551
- " 1970,\n",
1552
- " 1971,\n",
1553
- " 1973,\n",
1554
- " 1974,\n",
1555
- " 1978,\n",
1556
- " 1979,\n",
1557
- " 1980,\n",
1558
- " 1986,\n",
1559
- " 1987,\n",
1560
- " 1988,\n",
1561
- " 1989,\n",
1562
- " 1991,\n",
1563
- " 1992,\n",
1564
- " 1995,\n",
1565
- " 1996,\n",
1566
- " 1997,\n",
1567
- " 1999,\n",
1568
- " 2001,\n",
1569
- " 2004,\n",
1570
- " 2005,\n",
1571
- " 2006,\n",
1572
- " 2007,\n",
1573
- " 2008,\n",
1574
- " 2011,\n",
1575
- " 2012,\n",
1576
- " 2015,\n",
1577
- " 2016,\n",
1578
- " 2017,\n",
1579
- " 2019,\n",
1580
- " 2021,\n",
1581
- " 2023,\n",
1582
- " 2025,\n",
1583
- " 2028,\n",
1584
- " 2033,\n",
1585
- " 2036,\n",
1586
- " 2037,\n",
1587
- " 2039,\n",
1588
- " 2045,\n",
1589
- " 2051,\n",
1590
- " 2053,\n",
1591
- " 2054,\n",
1592
- " 2055,\n",
1593
- " 2057,\n",
1594
- " 2059,\n",
1595
- " 2061,\n",
1596
- " 2064,\n",
1597
- " 2065,\n",
1598
- " 2067,\n",
1599
- " 2068,\n",
1600
- " 2069,\n",
1601
- " 2070,\n",
1602
- " 2074,\n",
1603
- " 2076,\n",
1604
- " 2080,\n",
1605
- " 2081,\n",
1606
- " 2082,\n",
1607
- " 2084,\n",
1608
- " 2085,\n",
1609
- " 2087,\n",
1610
- " 2088,\n",
1611
- " 2090,\n",
1612
- " 2092,\n",
1613
- " 2096,\n",
1614
- " 2106,\n",
1615
- " 2109,\n",
1616
- " 2110,\n",
1617
- " 2111,\n",
1618
- " 2115,\n",
1619
- " 2116,\n",
1620
- " 2118,\n",
1621
- " 2122,\n",
1622
- " 2123,\n",
1623
- " 2126,\n",
1624
- " 2128,\n",
1625
- " 2135,\n",
1626
- " 2137,\n",
1627
- " 2139,\n",
1628
- " 2142,\n",
1629
- " 2144,\n",
1630
- " 2145,\n",
1631
- " 2146,\n",
1632
- " 2150,\n",
1633
- " 2153,\n",
1634
- " 2157,\n",
1635
- " 2159,\n",
1636
- " 2161,\n",
1637
- " 2162,\n",
1638
- " 2163,\n",
1639
- " 2165,\n",
1640
- " 2166,\n",
1641
- " 2167,\n",
1642
- " 2169,\n",
1643
- " 2170,\n",
1644
- " 2171,\n",
1645
- " 2173,\n",
1646
- " 2174,\n",
1647
- " 2177,\n",
1648
- " 2182,\n",
1649
- " 2183,\n",
1650
- " 2187,\n",
1651
- " 2189,\n",
1652
- " 2192,\n",
1653
- " 2194,\n",
1654
- " 2198,\n",
1655
- " 2199,\n",
1656
- " 2200,\n",
1657
- " 2205,\n",
1658
- " 2206,\n",
1659
- " 2209,\n",
1660
- " 2210,\n",
1661
- " 2211,\n",
1662
- " 2213,\n",
1663
- " 2216,\n",
1664
- " 2218,\n",
1665
- " 2222,\n",
1666
- " 2223,\n",
1667
- " 2225,\n",
1668
- " 2228,\n",
1669
- " 2230,\n",
1670
- " 2231,\n",
1671
- " 2235,\n",
1672
- " 2236,\n",
1673
- " 2238,\n",
1674
- " 2239,\n",
1675
- " 2242,\n",
1676
- " 2243,\n",
1677
- " 2250,\n",
1678
- " 2251,\n",
1679
- " 2252,\n",
1680
- " 2253,\n",
1681
- " 2255,\n",
1682
- " 2256,\n",
1683
- " 2257,\n",
1684
- " 2258,\n",
1685
- " 2263,\n",
1686
- " 2264,\n",
1687
- " 2265,\n",
1688
- " 2267,\n",
1689
- " 2273,\n",
1690
- " 2276,\n",
1691
- " 2277,\n",
1692
- " 2278,\n",
1693
- " 2279,\n",
1694
- " 2280,\n",
1695
- " 2284,\n",
1696
- " 2286,\n",
1697
- " 2288,\n",
1698
- " 2289,\n",
1699
- " 2290,\n",
1700
- " 2291,\n",
1701
- " 2293,\n",
1702
- " ...]"
1703
  ]
1704
  },
1705
- "execution_count": 33,
1706
  "metadata": {},
1707
  "output_type": "execute_result"
1708
  }
1709
  ],
1710
  "source": [
1711
- "embed_indexes"
1712
  ]
1713
  },
1714
  {
@@ -1723,7 +773,7 @@
1723
  },
1724
  {
1725
  "cell_type": "code",
1726
- "execution_count": 34,
1727
  "metadata": {},
1728
  "outputs": [
1729
  {
@@ -1739,7 +789,7 @@
1739
  "torch.Size([27515, 1024])"
1740
  ]
1741
  },
1742
- "execution_count": 34,
1743
  "metadata": {},
1744
  "output_type": "execute_result"
1745
  }
@@ -1773,7 +823,7 @@
1773
  },
1774
  {
1775
  "cell_type": "code",
1776
- "execution_count": 35,
1777
  "metadata": {},
1778
  "outputs": [
1779
  {
@@ -1782,7 +832,7 @@
1782
  "torch.Size([27520, 1024])"
1783
  ]
1784
  },
1785
- "execution_count": 35,
1786
  "metadata": {},
1787
  "output_type": "execute_result"
1788
  }
@@ -1797,7 +847,7 @@
1797
  },
1798
  {
1799
  "cell_type": "code",
1800
- "execution_count": 36,
1801
  "metadata": {},
1802
  "outputs": [
1803
  {
@@ -1806,7 +856,7 @@
1806
  "430.0"
1807
  ]
1808
  },
1809
- "execution_count": 36,
1810
  "metadata": {},
1811
  "output_type": "execute_result"
1812
  }
@@ -1824,7 +874,7 @@
1824
  },
1825
  {
1826
  "cell_type": "code",
1827
- "execution_count": 37,
1828
  "metadata": {},
1829
  "outputs": [],
1830
  "source": [
@@ -1846,7 +896,7 @@
1846
  },
1847
  {
1848
  "cell_type": "code",
1849
- "execution_count": 38,
1850
  "metadata": {},
1851
  "outputs": [
1852
  {
@@ -1868,7 +918,7 @@
1868
  },
1869
  {
1870
  "cell_type": "code",
1871
- "execution_count": 39,
1872
  "metadata": {},
1873
  "outputs": [
1874
  {
@@ -1877,7 +927,7 @@
1877
  "tensor(True)"
1878
  ]
1879
  },
1880
- "execution_count": 39,
1881
  "metadata": {},
1882
  "output_type": "execute_result"
1883
  }
@@ -1900,7 +950,7 @@
1900
  },
1901
  {
1902
  "cell_type": "code",
1903
- "execution_count": 40,
1904
  "metadata": {},
1905
  "outputs": [
1906
  {
@@ -1916,7 +966,7 @@
1916
  "['donut-base-ascii/preprocessor_config.json']"
1917
  ]
1918
  },
1919
- "execution_count": 40,
1920
  "metadata": {},
1921
  "output_type": "execute_result"
1922
  }
@@ -1940,7 +990,7 @@
1940
  },
1941
  {
1942
  "cell_type": "code",
1943
- "execution_count": 41,
1944
  "metadata": {},
1945
  "outputs": [
1946
  {
@@ -1959,7 +1009,7 @@
1959
  " (2, '</s>')]"
1960
  ]
1961
  },
1962
- "execution_count": 41,
1963
  "metadata": {},
1964
  "output_type": "execute_result"
1965
  }
@@ -1980,7 +1030,7 @@
1980
  },
1981
  {
1982
  "cell_type": "code",
1983
- "execution_count": 42,
1984
  "metadata": {},
1985
  "outputs": [
1986
  {
@@ -2006,7 +1056,7 @@
2006
  },
2007
  {
2008
  "cell_type": "code",
2009
- "execution_count": 43,
2010
  "metadata": {},
2011
  "outputs": [],
2012
  "source": [
@@ -2022,7 +1072,7 @@
2022
  },
2023
  {
2024
  "cell_type": "code",
2025
- "execution_count": 44,
2026
  "metadata": {},
2027
  "outputs": [
2028
  {
@@ -2032,7 +1082,7 @@
2032
  "<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1701x2386>"
2033
  ]
2034
  },
2035
- "execution_count": 44,
2036
  "metadata": {},
2037
  "output_type": "execute_result"
2038
  }
@@ -2043,44 +1093,74 @@
2043
  },
2044
  {
2045
  "cell_type": "code",
2046
- "execution_count": 48,
2047
  "metadata": {},
2048
  "outputs": [
2049
  {
2050
  "name": "stdout",
2051
  "output_type": "stream",
2052
  "text": [
2053
- "CPU times: user 2.66 s, sys: 208 ms, total: 2.87 s\n",
2054
- "Wall time: 579 ms\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2055
  ]
2056
  },
2057
  {
2058
  "data": {
2059
  "text/plain": [
2060
- "\"<s_synthdog> LOVE DELIGHTFULLY SOFT SKIN? GET INDIAS FIRST GET. BAR! WITH SKIN CONDITIONERS The Modern's take by traditions and the idea.</s>\""
2061
  ]
2062
  },
2063
- "execution_count": 48,
2064
  "metadata": {},
2065
  "output_type": "execute_result"
2066
  }
2067
  ],
2068
  "source": [
2069
- "%%time\n",
2070
- "\n",
2071
  "pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
2072
  "\n",
2073
  "outputs = model.generate(\n",
2074
  " pixel_values.to(device),\n",
2075
  " decoder_input_ids=decoder_input_ids.to(device),\n",
2076
- " max_length=model.decoder.config.max_position_embeddings,\n",
2077
  " early_stopping=True,\n",
2078
  " pad_token_id=processor.tokenizer.pad_token_id,\n",
2079
  " eos_token_id=processor.tokenizer.eos_token_id,\n",
2080
  " use_cache=True,\n",
2081
  " num_beams=1,\n",
2082
  " bad_words_ids=[[processor.tokenizer.unk_token_id]],\n",
2083
- " return_dict_in_generate=True,\n",
2084
  ")\n",
2085
  "\n",
2086
  "sequence = processor.batch_decode(outputs.sequences)[0]\n",
@@ -2089,7 +1169,7 @@
2089
  },
2090
  {
2091
  "cell_type": "code",
2092
- "execution_count": 49,
2093
  "metadata": {},
2094
  "outputs": [
2095
  {
@@ -2098,7 +1178,7 @@
2098
  "tensor([[27514]])"
2099
  ]
2100
  },
2101
- "execution_count": 49,
2102
  "metadata": {},
2103
  "output_type": "execute_result"
2104
  }
@@ -2106,69 +1186,84 @@
2106
  "source": [
2107
  "model_name = \"./donut-base-ascii\"\n",
2108
  "\n",
2109
- "processor = AutoProcessor.from_pretrained(model_name)\n",
2110
- "model = VisionEncoderDecoderModel.from_pretrained(model_name)\n",
2111
  "\n",
2112
  "device = 0\n",
2113
  "\n",
2114
- "model.to(device);\n",
2115
  "\n",
2116
- "decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors=\"pt\").input_ids\n",
2117
- "decoder_input_ids"
2118
  ]
2119
  },
2120
  {
2121
  "cell_type": "code",
2122
- "execution_count": 56,
2123
  "metadata": {},
2124
  "outputs": [
2125
  {
2126
  "name": "stdout",
2127
  "output_type": "stream",
2128
  "text": [
2129
- "CPU times: user 2.53 s, sys: 39.2 ms, total: 2.57 s\n",
2130
- "Wall time: 530 ms\n"
2131
  ]
2132
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2133
  {
2134
  "data": {
2135
  "text/plain": [
2136
- "\"<s_synthdog> LOVE DELIGHTFULLY SOFT SKIN? GET INDIAS FIRST GET. BAR! WITH SKIN CONDITIONERS The Modern's take by traditions and the idea.</s>\""
2137
  ]
2138
  },
2139
- "execution_count": 56,
2140
  "metadata": {},
2141
  "output_type": "execute_result"
2142
  }
2143
  ],
2144
  "source": [
2145
- "%%time\n",
2146
- "\n",
2147
- "pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
2148
  "\n",
2149
- "outputs = model.generate(\n",
2150
  " pixel_values.to(device),\n",
2151
- " decoder_input_ids=decoder_input_ids.to(device),\n",
2152
- " max_length=model.decoder.config.max_position_embeddings,\n",
2153
  " early_stopping=True,\n",
2154
- " pad_token_id=processor.tokenizer.pad_token_id,\n",
2155
- " eos_token_id=processor.tokenizer.eos_token_id,\n",
2156
  " use_cache=True,\n",
2157
  " num_beams=1,\n",
2158
- " bad_words_ids=[[processor.tokenizer.unk_token_id]],\n",
2159
- " return_dict_in_generate=True,\n",
2160
  ")\n",
2161
  "\n",
2162
- "sequence = processor.batch_decode(outputs.sequences)[0]\n",
2163
  "sequence"
2164
  ]
2165
- },
2166
- {
2167
- "cell_type": "markdown",
2168
- "metadata": {},
2169
- "source": [
2170
- "\"<s_synthdog> LOVE DELIGHTFULLY SOFT SKIN? GET INDIAS FIRST GET. BAR! WITH SKIN CONDITIONERS The Modern's take by traditions and the idea.</s>\""
2171
- ]
2172
  }
2173
  ],
2174
  "metadata": {
 
106
  {
107
  "data": {
108
  "text/plain": [
109
+ "['▁1',\n",
110
+ " '▁kilala',\n",
111
+ " 'ସୀ',\n",
112
+ " '错误',\n",
113
+ " '▁https',\n",
114
+ " '▁жасаған',\n",
115
+ " '病院',\n",
116
+ " '▁effort',\n",
117
+ " '▁기사',\n",
118
+ " 'ptu',\n",
119
+ " '',\n",
120
+ " 'jit',\n",
121
+ " '▁college',\n",
122
+ " '383',\n",
123
+ " '▁dveře',\n",
124
+ " 'masse',\n",
125
+ " '\\x9c',\n",
126
+ " 'rait',\n",
127
+ " '',\n",
128
+ " '▁gewoonlik']"
129
  ]
130
  },
131
  "execution_count": 3,
 
185
  },
186
  {
187
  "cell_type": "code",
188
+ "execution_count": 6,
189
  "metadata": {},
190
  "outputs": [],
191
  "source": [
 
203
  },
204
  {
205
  "cell_type": "code",
206
+ "execution_count": 7,
207
  "metadata": {},
208
  "outputs": [
209
  {
 
212
  "27510"
213
  ]
214
  },
215
+ "execution_count": 7,
216
  "metadata": {},
217
  "output_type": "execute_result"
218
  }
 
234
  },
235
  {
236
  "cell_type": "code",
237
+ "execution_count": 8,
238
  "metadata": {},
239
  "outputs": [
240
  {
 
250
  "[0, 56881, 3, 2]"
251
  ]
252
  },
253
+ "execution_count": 8,
254
  "metadata": {},
255
  "output_type": "execute_result"
256
  }
 
264
  },
265
  {
266
  "cell_type": "code",
267
+ "execution_count": 9,
268
  "metadata": {},
269
  "outputs": [
270
  {
 
283
  " (2, '</s>')]"
284
  ]
285
  },
286
+ "execution_count": 9,
287
  "metadata": {},
288
  "output_type": "execute_result"
289
  }
 
298
  },
299
  {
300
  "cell_type": "code",
301
+ "execution_count": 10,
302
  "metadata": {},
303
  "outputs": [
304
  {
 
309
  " type: NORMAL]"
310
  ]
311
  },
312
+ "execution_count": 10,
313
  "metadata": {},
314
  "output_type": "execute_result"
315
  }
 
320
  },
321
  {
322
  "cell_type": "code",
323
+ "execution_count": 11,
324
  "metadata": {},
325
  "outputs": [],
326
  "source": [
 
339
  },
340
  {
341
  "cell_type": "code",
342
+ "execution_count": 12,
343
  "metadata": {},
344
  "outputs": [
345
  {
 
348
  "27511"
349
  ]
350
  },
351
+ "execution_count": 12,
352
  "metadata": {},
353
  "output_type": "execute_result"
354
  }
 
366
  },
367
  {
368
  "cell_type": "code",
369
+ "execution_count": 13,
370
  "metadata": {},
371
  "outputs": [],
372
  "source": [
 
383
  },
384
  {
385
  "cell_type": "code",
386
+ "execution_count": 14,
387
  "metadata": {},
388
  "outputs": [],
389
  "source": [
 
394
  },
395
  {
396
  "cell_type": "code",
397
+ "execution_count": 15,
398
  "metadata": {},
399
  "outputs": [
400
  {
 
403
  "(27513, 57525)"
404
  ]
405
  },
406
+ "execution_count": 15,
407
  "metadata": {},
408
  "output_type": "execute_result"
409
  }
 
421
  },
422
  {
423
  "cell_type": "code",
424
+ "execution_count": 16,
425
  "metadata": {},
426
  "outputs": [
427
  {
 
430
  "['<s_iitcdip>', '<s_synthdog>']"
431
  ]
432
  },
433
+ "execution_count": 16,
434
  "metadata": {},
435
  "output_type": "execute_result"
436
  }
 
441
  },
442
  {
443
  "cell_type": "code",
444
+ "execution_count": 17,
445
  "metadata": {},
446
  "outputs": [
447
  {
 
450
  "2"
451
  ]
452
  },
453
+ "execution_count": 17,
454
  "metadata": {},
455
  "output_type": "execute_result"
456
  }
 
461
  },
462
  {
463
  "cell_type": "code",
464
+ "execution_count": 18,
465
  "metadata": {},
466
  "outputs": [
467
  {
 
470
  "['▁<', 's', '>']"
471
  ]
472
  },
473
+ "execution_count": 18,
474
  "metadata": {},
475
  "output_type": "execute_result"
476
  }
 
481
  },
482
  {
483
  "cell_type": "code",
484
+ "execution_count": 19,
485
  "metadata": {},
486
  "outputs": [
487
  {
 
497
  " 'additional_special_tokens': ['<s_iitcdip>', '<s_synthdog>']}"
498
  ]
499
  },
500
+ "execution_count": 19,
501
  "metadata": {},
502
  "output_type": "execute_result"
503
  }
 
508
  },
509
  {
510
  "cell_type": "code",
511
+ "execution_count": 20,
512
  "metadata": {},
513
  "outputs": [
514
  {
 
520
  " 'donut-base-ascii/added_tokens.json')"
521
  ]
522
  },
523
+ "execution_count": 20,
524
  "metadata": {},
525
  "output_type": "execute_result"
526
  }
 
535
  },
536
  {
537
  "cell_type": "code",
538
+ "execution_count": 21,
539
  "metadata": {},
540
  "outputs": [
541
  {
 
544
  "['<s>']"
545
  ]
546
  },
547
+ "execution_count": 21,
548
  "metadata": {},
549
  "output_type": "execute_result"
550
  }
 
555
  },
556
  {
557
  "cell_type": "code",
558
+ "execution_count": 22,
559
  "metadata": {},
560
  "outputs": [
561
  {
 
564
  "(27515, 57525)"
565
  ]
566
  },
567
+ "execution_count": 22,
568
  "metadata": {},
569
  "output_type": "execute_result"
570
  }
 
575
  },
576
  {
577
  "cell_type": "code",
578
+ "execution_count": 23,
579
  "metadata": {},
580
  "outputs": [
581
  {
 
584
  "['<s>']"
585
  ]
586
  },
587
+ "execution_count": 23,
588
  "metadata": {},
589
  "output_type": "execute_result"
590
  }
 
614
  },
615
  {
616
  "cell_type": "code",
617
+ "execution_count": 24,
618
  "metadata": {},
619
  "outputs": [],
620
  "source": [
 
632
  },
633
  {
634
  "cell_type": "code",
635
+ "execution_count": 25,
636
  "metadata": {},
637
  "outputs": [
638
  {
 
644
  " ('<s_synthdog>', 27514)]"
645
  ]
646
  },
647
+ "execution_count": 25,
648
  "metadata": {},
649
  "output_type": "execute_result"
650
  }
 
666
  },
667
  {
668
  "cell_type": "code",
669
+ "execution_count": 26,
670
  "metadata": {},
671
  "outputs": [
672
+ {
673
+ "name": "stdout",
674
+ "output_type": "stream",
675
+ "text": [
676
+ "\n",
677
+ "===================================BUG REPORT===================================\n",
678
+ "Welcome to bitsandbytes. For bug reports, please run\n",
679
+ "\n",
680
+ "python -m bitsandbytes\n",
681
+ "\n",
682
+ " and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
683
+ "================================================================================\n",
684
+ "bin /home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so\n",
685
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32\n",
686
+ "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
687
+ "CUDA SETUP: Highest compute capability among GPUs detected: 8.9\n",
688
+ "CUDA SETUP: Detected CUDA version 117\n",
689
+ "CUDA SETUP: Loading binary /home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
690
+ ]
691
+ },
692
+ {
693
+ "name": "stderr",
694
+ "output_type": "stream",
695
+ "text": [
696
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
697
+ " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n",
698
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: /home/nicholas/miniconda3 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
699
+ " warn(msg)\n",
700
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/share/gconf/ubuntu.default.path')}\n",
701
+ " warn(msg)\n",
702
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('0'), PosixPath('1')}\n",
703
+ " warn(msg)\n",
704
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/share/gconf/ubuntu.mandatory.path')}\n",
705
+ " warn(msg)\n",
706
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('@/tmp/.ICE-unix/2404,unix/nicholas-MS-7D25'), PosixPath('local/nicholas-MS-7D25')}\n",
707
+ " warn(msg)\n",
708
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('vs/workbench/api/node/extensionHostProcess')}\n",
709
+ " warn(msg)\n",
710
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/etc/xdg/xdg-ubuntu')}\n",
711
+ " warn(msg)\n",
712
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/home/nicholas/.local/share/flatpak/exports/share')}\n",
713
+ " warn(msg)\n",
714
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
715
+ " warn(msg)\n",
716
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/cuda/lib64')}\n",
717
+ " warn(msg)\n",
718
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!\n",
719
+ " warn(msg)\n"
720
+ ]
721
+ },
722
  {
723
  "name": "stdout",
724
  "output_type": "stream",
 
743
  },
744
  {
745
  "cell_type": "code",
746
+ "execution_count": 27,
747
  "metadata": {},
748
  "outputs": [
749
  {
750
  "data": {
751
  "text/plain": [
752
+ "[0, 1, 2, 3, 4, 5, 6, 7, 8, 10]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753
  ]
754
  },
755
+ "execution_count": 27,
756
  "metadata": {},
757
  "output_type": "execute_result"
758
  }
759
  ],
760
  "source": [
761
+ "embed_indexes[:10]"
762
  ]
763
  },
764
  {
 
773
  },
774
  {
775
  "cell_type": "code",
776
+ "execution_count": 28,
777
  "metadata": {},
778
  "outputs": [
779
  {
 
789
  "torch.Size([27515, 1024])"
790
  ]
791
  },
792
+ "execution_count": 28,
793
  "metadata": {},
794
  "output_type": "execute_result"
795
  }
 
823
  },
824
  {
825
  "cell_type": "code",
826
+ "execution_count": 29,
827
  "metadata": {},
828
  "outputs": [
829
  {
 
832
  "torch.Size([27520, 1024])"
833
  ]
834
  },
835
+ "execution_count": 29,
836
  "metadata": {},
837
  "output_type": "execute_result"
838
  }
 
847
  },
848
  {
849
  "cell_type": "code",
850
+ "execution_count": 30,
851
  "metadata": {},
852
  "outputs": [
853
  {
 
856
  "430.0"
857
  ]
858
  },
859
+ "execution_count": 30,
860
  "metadata": {},
861
  "output_type": "execute_result"
862
  }
 
874
  },
875
  {
876
  "cell_type": "code",
877
+ "execution_count": 31,
878
  "metadata": {},
879
  "outputs": [],
880
  "source": [
 
896
  },
897
  {
898
  "cell_type": "code",
899
+ "execution_count": 32,
900
  "metadata": {},
901
  "outputs": [
902
  {
 
918
  },
919
  {
920
  "cell_type": "code",
921
+ "execution_count": 33,
922
  "metadata": {},
923
  "outputs": [
924
  {
 
927
  "tensor(True)"
928
  ]
929
  },
930
+ "execution_count": 33,
931
  "metadata": {},
932
  "output_type": "execute_result"
933
  }
 
950
  },
951
  {
952
  "cell_type": "code",
953
+ "execution_count": 34,
954
  "metadata": {},
955
  "outputs": [
956
  {
 
966
  "['donut-base-ascii/preprocessor_config.json']"
967
  ]
968
  },
969
+ "execution_count": 34,
970
  "metadata": {},
971
  "output_type": "execute_result"
972
  }
 
990
  },
991
  {
992
  "cell_type": "code",
993
+ "execution_count": 35,
994
  "metadata": {},
995
  "outputs": [
996
  {
 
1009
  " (2, '</s>')]"
1010
  ]
1011
  },
1012
+ "execution_count": 35,
1013
  "metadata": {},
1014
  "output_type": "execute_result"
1015
  }
 
1030
  },
1031
  {
1032
  "cell_type": "code",
1033
+ "execution_count": 36,
1034
  "metadata": {},
1035
  "outputs": [
1036
  {
 
1056
  },
1057
  {
1058
  "cell_type": "code",
1059
+ "execution_count": 37,
1060
  "metadata": {},
1061
  "outputs": [],
1062
  "source": [
 
1072
  },
1073
  {
1074
  "cell_type": "code",
1075
+ "execution_count": 38,
1076
  "metadata": {},
1077
  "outputs": [
1078
  {
 
1082
  "<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1701x2386>"
1083
  ]
1084
  },
1085
+ "execution_count": 38,
1086
  "metadata": {},
1087
  "output_type": "execute_result"
1088
  }
 
1093
  },
1094
  {
1095
  "cell_type": "code",
1096
+ "execution_count": 39,
1097
  "metadata": {},
1098
  "outputs": [
1099
  {
1100
  "name": "stdout",
1101
  "output_type": "stream",
1102
  "text": [
1103
+ "467 ms ± 42.1 ms per loop (mean ± std. dev. of 2 runs, 5 loops each)\n"
1104
+ ]
1105
+ }
1106
+ ],
1107
+ "source": [
1108
+ "%%timeit -n 5 -r 2\n",
1109
+ "\n",
1110
+ "pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
1111
+ "\n",
1112
+ "outputs = model.generate(\n",
1113
+ " pixel_values.to(device),\n",
1114
+ " decoder_input_ids=decoder_input_ids.to(device),\n",
1115
+ " early_stopping=True,\n",
1116
+ " pad_token_id=processor.tokenizer.pad_token_id,\n",
1117
+ " eos_token_id=processor.tokenizer.eos_token_id,\n",
1118
+ " use_cache=True,\n",
1119
+ " num_beams=1,\n",
1120
+ " bad_words_ids=[[processor.tokenizer.unk_token_id]],\n",
1121
+ " return_dict_in_generate=True, \n",
1122
+ " min_length=10,\n",
1123
+ " max_length=10\n",
1124
+ ")"
1125
+ ]
1126
+ },
1127
+ {
1128
+ "cell_type": "code",
1129
+ "execution_count": 40,
1130
+ "metadata": {},
1131
+ "outputs": [
1132
+ {
1133
+ "name": "stderr",
1134
+ "output_type": "stream",
1135
+ "text": [
1136
+ "/home/nicholas/miniconda3/lib/python3.10/site-packages/transformers/generation/utils.py:1369: UserWarning: Using `max_length`'s default (20) to control the generation length. This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
1137
+ " warnings.warn(\n"
1138
  ]
1139
  },
1140
  {
1141
  "data": {
1142
  "text/plain": [
1143
+ "'<s_synthdog> LOVE DELIGHTFULLY SOFT SKIN? GET INDIAS FIR</s>'"
1144
  ]
1145
  },
1146
+ "execution_count": 40,
1147
  "metadata": {},
1148
  "output_type": "execute_result"
1149
  }
1150
  ],
1151
  "source": [
 
 
1152
  "pixel_values = processor(image, return_tensors=\"pt\").pixel_values\n",
1153
  "\n",
1154
  "outputs = model.generate(\n",
1155
  " pixel_values.to(device),\n",
1156
  " decoder_input_ids=decoder_input_ids.to(device),\n",
 
1157
  " early_stopping=True,\n",
1158
  " pad_token_id=processor.tokenizer.pad_token_id,\n",
1159
  " eos_token_id=processor.tokenizer.eos_token_id,\n",
1160
  " use_cache=True,\n",
1161
  " num_beams=1,\n",
1162
  " bad_words_ids=[[processor.tokenizer.unk_token_id]],\n",
1163
+ " return_dict_in_generate=True, \n",
1164
  ")\n",
1165
  "\n",
1166
  "sequence = processor.batch_decode(outputs.sequences)[0]\n",
 
1169
  },
1170
  {
1171
  "cell_type": "code",
1172
+ "execution_count": 41,
1173
  "metadata": {},
1174
  "outputs": [
1175
  {
 
1178
  "tensor([[27514]])"
1179
  ]
1180
  },
1181
+ "execution_count": 41,
1182
  "metadata": {},
1183
  "output_type": "execute_result"
1184
  }
 
1186
  "source": [
1187
  "model_name = \"./donut-base-ascii\"\n",
1188
  "\n",
1189
+ "new_processor = AutoProcessor.from_pretrained(model_name)\n",
1190
+ "new_model = VisionEncoderDecoderModel.from_pretrained(model_name)\n",
1191
  "\n",
1192
  "device = 0\n",
1193
  "\n",
1194
+ "new_model.to(device);\n",
1195
  "\n",
1196
+ "new_decoder_input_ids = new_processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors=\"pt\").input_ids\n",
1197
+ "new_decoder_input_ids"
1198
  ]
1199
  },
1200
  {
1201
  "cell_type": "code",
1202
+ "execution_count": 42,
1203
  "metadata": {},
1204
  "outputs": [
1205
  {
1206
  "name": "stdout",
1207
  "output_type": "stream",
1208
  "text": [
1209
+ "382 ms ± 2.19 ms per loop (mean ± std. dev. of 2 runs, 5 loops each)\n"
 
1210
  ]
1211
+ }
1212
+ ],
1213
+ "source": [
1214
+ "%%timeit -n 5 -r 2\n",
1215
+ "\n",
1216
+ "pixel_values = new_processor(image, return_tensors=\"pt\").pixel_values\n",
1217
+ "\n",
1218
+ "outputs = new_model.generate(\n",
1219
+ " pixel_values.to(device),\n",
1220
+ " decoder_input_ids=new_decoder_input_ids.to(device),\n",
1221
+ " early_stopping=True,\n",
1222
+ " pad_token_id=new_processor.tokenizer.pad_token_id,\n",
1223
+ " eos_token_id=new_processor.tokenizer.eos_token_id,\n",
1224
+ " use_cache=True,\n",
1225
+ " num_beams=1,\n",
1226
+ " bad_words_ids=[[new_processor.tokenizer.unk_token_id]],\n",
1227
+ " return_dict_in_generate=True, \n",
1228
+ " min_length=10,\n",
1229
+ " max_length=10\n",
1230
+ ")"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "cell_type": "code",
1235
+ "execution_count": 43,
1236
+ "metadata": {},
1237
+ "outputs": [
1238
  {
1239
  "data": {
1240
  "text/plain": [
1241
+ "'<s_synthdog> LOVE DELIGHTFULLY SOFT SKIN? GET INDIAS FIR</s>'"
1242
  ]
1243
  },
1244
+ "execution_count": 43,
1245
  "metadata": {},
1246
  "output_type": "execute_result"
1247
  }
1248
  ],
1249
  "source": [
1250
+ "pixel_values = new_processor(image, return_tensors=\"pt\").pixel_values\n",
 
 
1251
  "\n",
1252
+ "outputs = new_model.generate(\n",
1253
  " pixel_values.to(device),\n",
1254
+ " decoder_input_ids=new_decoder_input_ids.to(device),\n",
 
1255
  " early_stopping=True,\n",
1256
+ " pad_token_id=new_processor.tokenizer.pad_token_id,\n",
1257
+ " eos_token_id=new_processor.tokenizer.eos_token_id,\n",
1258
  " use_cache=True,\n",
1259
  " num_beams=1,\n",
1260
+ " bad_words_ids=[[new_processor.tokenizer.unk_token_id]],\n",
1261
+ " return_dict_in_generate=True, \n",
1262
  ")\n",
1263
  "\n",
1264
+ "sequence = new_processor.batch_decode(outputs.sequences)[0]\n",
1265
  "sequence"
1266
  ]
 
 
 
 
 
 
 
1267
  }
1268
  ],
1269
  "metadata": {